Note: the diffs of some files below were truncated because they were too big.
x265: Changes of Revision 20
x265.changes
Changed
@@ -1,4 +1,38 @@
 -------------------------------------------------------------------
+Mon Sep 30 12:34:56 UTC 2024 - olaf@aepfle.de
+
+- Update to version 4.0
+  New features:
+  * Alpha Channel feature.
+  * Screen Content Coding (SCC).
+  * MV-HEVC feature.
+  Enhancements to existing features:
+  * Added support for the VMAF v3.x.
+  API changes
+  * Add command line parameter for Alpha Channel feature :option:`--alpha`.
+  * Add command line parameter for SCC feature :option:`--scc 1`.
+  * Add command line parameters for the MV-HEVC feature
+    :option:`--multiview-config "multiview_config.txt"`.
+  Optimizations
+  * Arm SIMD optimizations: Several time-consuming scalar C
+    functions now have SIMD implementations on Arm platforms.
+    Existing Arm SIMD implementations have also been optimized.
+    These optimizations result in up to 57% faster encoding
+    compared to release 3.6.
+  * Arm SIMD optimizations include use of Armv8.4 DotProd, Armv8.6
+    I8MM, and Armv9 SVE2 instruction set extensions. The following
+    algorithms now have optimized SIMD implementations: SAD, SSE,
+    DCT, SAO, convolution, quantization, intra_planar,
+    intraFilter, intrapred DC and IDCT16x16.
+  Bug fixes
+  * Fix for y4m pipe input broken.
+  * Fix SCC crash on multipass encode.
+  * Fix mcstf when :option:`--bframes` value was less than 5.
+  * Fix lowpass DCT for high bit depth.
+  * Fix issue in default code flow and memory leak.
+  * Fix scc crash on multipass encode.
+
+-------------------------------------------------------------------
 Thu Jun 13 05:58:19 UTC 2024 - Luigi Baldoni <aloisio@gmx.com>
 
 - Update to version 3.6
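For reference, the 4.0 options named in this changelog entry are ordinary x265 command-line switches. A minimal usage sketch, assuming a build with these features enabled (input and output file names are hypothetical, and the MV-HEVC view inputs come from the multiview config file):

    x265 --input input.y4m --alpha -o out_alpha.hevc
    x265 --input screen.y4m --scc 1 -o out_scc.hevc
    x265 --multiview-config "multiview_config.txt" -o out_mvhevc.hevc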
x265.spec
Changed
@@ -17,12 +17,12 @@
 #
 
-%define sover 209
+%define sover 212
 %define libname lib%{name}
 %define libsoname %{libname}-%{sover}
-%define uver 3_6
+%define uver 4_0
 
 Name:           x265
-Version:        3.6
+Version:        4.0
 Release:        0
 Summary:        A free h265/HEVC encoder - encoder binary
 License:        GPL-2.0-or-later
@@ -30,11 +30,20 @@
 URL:            https://bitbucket.org/multicoreware/x265_git
 Source0:        https://bitbucket.org/multicoreware/x265_git/downloads/%{name}_%{version}.tar.gz
 Patch1:         x265.pkgconfig.patch
-Patch2:         x265-fix_enable512.patch
 Patch3:         0001-Fix-arm-flags.patch
 Patch4:         0004-Do-not-build-with-assembly-support-on-arm.patch
-BuildRequires:  cmake >= 2.8.8
+BuildRequires:  cmake
+%if 0%{?suse_version} > 1500
 BuildRequires:  gcc-c++
+%else
+%if 0%{?sle_version} > 150500
+BuildRequires:  gcc13
+BuildRequires:  gcc13-c++
+%else
+BuildRequires:  gcc10
+BuildRequires:  gcc10-c++
+%endif
+%endif
 BuildRequires:  nasm >= 2.13
 BuildRequires:  pkgconfig
 %ifarch x86_64
@@ -73,16 +82,27 @@
 streams.
 
 %prep
-%setup -q -n %{name}_%{version}
-%autopatch -p1
+%autosetup -p1 -n %{name}_%{version}
 
+%build
+test -x "$(type -p gcc)" && CC="$_"
+test -x "$(type -p g++)" && CXX="$_"
+test -x "$(type -p gcc-10)" && CC="$_"
+test -x "$(type -p g++-10)" && CXX="$_"
+test -x "$(type -p gcc-13)" && CC="$_"
+test -x "$(type -p g++-13)" && CXX="$_"
+export CC="$(readlink -f ${CC})"
+export CXX="$(readlink -f ${CXX})"
+CFLAGS='%optflags -Wno-misleading-indentation -Wno-unused-parameter -Wno-unused-variable'
+CXXFLAGS='%optflags -Wno-misleading-indentation -Wno-unused-parameter -Wno-unused-variable'
 # set the version by hand
-sed -i "/^include(Version)/d" source/CMakeLists.txt
+sed -i~ "/^include(Version)/d" source/CMakeLists.txt
+diff -u "$_"~ "$_" && exit 1
 # force version number in the soname
-sed -i 's/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus-%{version}/' \
+sed -i~ 's/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus/hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus-%{version}/' \
   source/CMakeLists.txt
+diff -u "$_"~ "$_" && exit 1
-%build
 SOURCE_DIR="$PWD"/source
 COMMON_FLAGS="-DENABLE_TESTS=OFF -DENABLE_PIC=ON -Wno-dev"
 HIGH_BIT_DEPTH_FLAGS="-DENABLE_CLI=OFF -DENABLE_SHARED=OFF -DEXPORT_C_API=OFF -DHIGH_BIT_DEPTH=ON"
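The %prep changes above rely on a small shell guard: sed -i~ keeps a backup of the edited file, and the diff -u "$_"~ "$_" && exit 1 that follows aborts the build only when the backup and the edited file are still identical, that is, when the sed expression no longer matched anything ("$_" expands to the last argument of the previous command). A minimal sketch of the same idiom, spelled out with explicit file names for illustration:

    # delete the line and keep a backup copy with a trailing ~
    sed -i~ '/^include(Version)/d' source/CMakeLists.txt
    # diff exits 0 when the files are identical, so the build stops if sed changed nothing
    diff -u source/CMakeLists.txt~ source/CMakeLists.txt && exit 1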
0001-Fix-arm-flags.patch
Changed
@@ -6,11 +6,9 @@ source/CMakeLists.txt | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) -diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt -index ab5ddfe..eb9b19b 100755 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt -@@ -253,10 +253,7 @@ if(GCC) +@@ -257,10 +257,7 @@ elseif(ARM) find_package(Neon) if(CPU_HAS_NEON) @@ -20,20 +18,42 @@ - set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm) endif() endif() - if(ARM64 OR CROSS_COMPILE_ARM64) -@@ -265,13 +262,13 @@ if(GCC) - find_package(SVE2) - if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2) - message(STATUS "Found SVE2") -- set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions) -+ set(ARM_ARGS -fPIC -flax-vector-conversions) - add_definitions(-DHAVE_SVE2) - add_definitions(-DHAVE_SVE) - add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2 - elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE) - message(STATUS "Found SVE") -- set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions) -+ set(ARM_ARGS -fPIC -flax-vector-conversions) - add_definitions(-DHAVE_SVE) - add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE - elseif(CPU_HAS_NEON) + if(ARM64) +--- a/source/cmake/FindNEON_DOTPROD.cmake ++++ b/source/cmake/FindNEON_DOTPROD.cmake +@@ -17,5 +17,5 @@ + endif() + + if(has_dot_product) +- set(CPU_HAS_NEON_DOTPROD 1) ++ set(CPU_HAS_NEON_DOTPROD 0) + endif() +--- a/source/cmake/FindNEON_I8MM.cmake ++++ b/source/cmake/FindNEON_I8MM.cmake +@@ -17,5 +17,5 @@ + endif() + + if(has_i8mm) +- set(CPU_HAS_NEON_I8MM 1) ++ set(CPU_HAS_NEON_I8MM 0) + endif() +--- a/source/cmake/FindSVE.cmake ++++ b/source/cmake/FindSVE.cmake +@@ -17,5 +17,5 @@ + endif() + + if(sve_version) +- set(CPU_HAS_SVE 1) ++ set(CPU_HAS_SVE 0) + endif() +--- a/source/cmake/FindSVE2.cmake ++++ b/source/cmake/FindSVE2.cmake +@@ -17,6 +17,6 @@ + endif() + + if(sve2_version) +- set(CPU_HAS_SVE 1) +- set(CPU_HAS_SVE2 1) ++ set(CPU_HAS_SVE 0) ++ set(CPU_HAS_SVE2 0) + endif()
0004-Do-not-build-with-assembly-support-on-arm.patch
Changed
@@ -6,11 +6,9 @@ source/CMakeLists.txt | 9 --------- 1 file changed, 9 deletions(-) -diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt -index 672cc2d..f112330 100755 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt -@@ -73,15 +73,6 @@ elseif(POWERMATCH GREATER "-1") +@@ -72,15 +72,6 @@ add_definitions(-DPPC64=1) message(STATUS "Detected POWER PPC64 target processor") endif() @@ -24,5 +22,5 @@ - set(ARM 1) - add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1) elseif(ARM64MATCH GREATER "-1") - #if(CROSS_COMPILE_ARM64) - #message(STATUS "Cross compiling for ARM64 arch") + message(STATUS "Detected ARM64 target processor") + set(ARM64 1)
x265-fix_enable512.patch
Deleted
@@ -1,26 +0,0 @@ ---- a/source/common/cpu.cpp -+++ b/source/common/cpu.cpp -@@ -110,6 +110,11 @@ const cpu_name_t cpu_names = - { "", 0 }, - }; - -+bool detect512() -+{ -+ return(enable512); -+} -+ - #if X265_ARCH_X86 - - extern "C" { -@@ -123,11 +128,6 @@ uint64_t PFX(cpu_xgetbv)(int xcr); - #pragma warning(disable: 4309) // truncation of constant value - #endif - --bool detect512() --{ -- return(enable512); --} -- - uint32_t cpu_detect(bool benableavx512 ) - { -
baselibs.conf
Changed
@@ -1,1 +1,1 @@
-libx265-209
+libx265-212
x265_3.6.tar.gz/source/common/aarch64/ipfilter-common.S
Deleted
@@ -1,1436 +0,0 @@ -/***************************************************************************** - * Copyright (C) 2022-2023 MulticoreWare, Inc - * - * Authors: David Chen <david.chen@myais.com.cn> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. - * - * This program is also available under a commercial proprietary license. - * For more information, contact us at license @ x265.com. - *****************************************************************************/ - -// This file contains the macros written using NEON instruction set -// that are also used by the SVE2 functions - -// Macros below follow these conventions: -// - input data in registers: v0, v1, v2, v3, v4, v5, v6, v7 -// - constants in registers: v24, v25, v26, v27, v31 -// - temporary registers: v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30. -// - _32b macros output a result in v17.4s -// - _64b and _32b_1 macros output results in v17.4s, v18.4s - -#include "asm.S" - -.arch armv8-a - -#ifdef __APPLE__ -.section __RODATA,__rodata -#else -.section .rodata -#endif - -.align 4 - -.macro vextin8 v - ldp d6, d7, x11, #16 -.if \v == 0 - // qpel_filter_0 only uses values in v3 - ext v3.8b, v6.8b, v7.8b, #4 -.else -.if \v != 3 - ext v0.8b, v6.8b, v7.8b, #1 -.endif - ext v1.8b, v6.8b, v7.8b, #2 - ext v2.8b, v6.8b, v7.8b, #3 - ext v3.8b, v6.8b, v7.8b, #4 - ext v4.8b, v6.8b, v7.8b, #5 - ext v5.8b, v6.8b, v7.8b, #6 - ext v6.8b, v6.8b, v7.8b, #7 -.endif -.endm - -.macro vextin8_64 v - ldp q6, q7, x11, #32 -.if \v == 0 - // qpel_filter_0 only uses values in v3 - ext v3.16b, v6.16b, v7.16b, #4 -.else -.if \v != 3 - // qpel_filter_3 does not use values in v0 - ext v0.16b, v6.16b, v7.16b, #1 -.endif - ext v1.16b, v6.16b, v7.16b, #2 - ext v2.16b, v6.16b, v7.16b, #3 - ext v3.16b, v6.16b, v7.16b, #4 - ext v4.16b, v6.16b, v7.16b, #5 - ext v5.16b, v6.16b, v7.16b, #6 -.if \v == 1 - ext v6.16b, v6.16b, v7.16b, #7 - // qpel_filter_1 does not use v7 -.else - ext v16.16b, v6.16b, v7.16b, #7 - ext v7.16b, v6.16b, v7.16b, #8 - mov v6.16b, v16.16b -.endif -.endif -.endm - -.macro vextin8_chroma v - ldp d6, d7, x11, #16 -.if \v == 0 - // qpel_filter_chroma_0 only uses values in v1 - ext v1.8b, v6.8b, v7.8b, #2 -.else - ext v0.8b, v6.8b, v7.8b, #1 - ext v1.8b, v6.8b, v7.8b, #2 - ext v2.8b, v6.8b, v7.8b, #3 - ext v3.8b, v6.8b, v7.8b, #4 -.endif -.endm - -.macro vextin8_chroma_64 v - ldp q16, q17, x11, #32 -.if \v == 0 - // qpel_filter_chroma_0 only uses values in v1 - ext v1.16b, v16.16b, v17.16b, #2 -.else - ext v0.16b, v16.16b, v17.16b, #1 - ext v1.16b, v16.16b, v17.16b, #2 - ext v2.16b, v16.16b, v17.16b, #3 - ext v3.16b, v16.16b, v17.16b, #4 -.endif -.endm - -.macro qpel_load_32b v -.if \v == 0 - add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0 - ld1 {v3.8b}, x6, x1 -.elseif \v == 1 || \v == 2 || \v == 3 -.if \v != 3 // not used in qpel_filter_3 - ld1 
{v0.8b}, x6, x1 -.else - add x6, x6, x1 -.endif - ld1 {v1.8b}, x6, x1 - ld1 {v2.8b}, x6, x1 - ld1 {v3.8b}, x6, x1 - ld1 {v4.8b}, x6, x1 - ld1 {v5.8b}, x6, x1 -.if \v != 1 // not used in qpel_filter_1 - ld1 {v6.8b}, x6, x1 - ld1 {v7.8b}, x6 -.else - ld1 {v6.8b}, x6 -.endif -.endif -.endm - -.macro qpel_load_64b v -.if \v == 0 - add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0 - ld1 {v3.16b}, x6, x1 -.elseif \v == 1 || \v == 2 || \v == 3 -.if \v != 3 // not used in qpel_filter_3 - ld1 {v0.16b}, x6, x1 -.else - add x6, x6, x1 -.endif - ld1 {v1.16b}, x6, x1 - ld1 {v2.16b}, x6, x1 - ld1 {v3.16b}, x6, x1 - ld1 {v4.16b}, x6, x1 - ld1 {v5.16b}, x6, x1 -.if \v != 1 // not used in qpel_filter_1 - ld1 {v6.16b}, x6, x1 - ld1 {v7.16b}, x6 -.else - ld1 {v6.16b}, x6 -.endif -.endif -.endm - -.macro qpel_chroma_load_32b v -.if \v == 0 - // qpel_filter_chroma_0 only uses values in v1 - add x6, x6, x1 - ldr d1, x6 -.else - ld1 {v0.8b}, x6, x1 - ld1 {v1.8b}, x6, x1 - ld1 {v2.8b}, x6, x1 - ld1 {v3.8b}, x6 -.endif -.endm - -.macro qpel_chroma_load_64b v -.if \v == 0 - // qpel_filter_chroma_0 only uses values in v1 - add x6, x6, x1 - ldr q1, x6 -.else - ld1 {v0.16b}, x6, x1 - ld1 {v1.16b}, x6, x1 - ld1 {v2.16b}, x6, x1 - ld1 {v3.16b}, x6 -.endif -.endm - -// a, b, c, d, e, f, g, h -// .hword 0, 0, 0, 64, 0, 0, 0, 0 -.macro qpel_start_0 - movi v24.16b, #64 -.endm - -.macro qpel_filter_0_32b - umull v17.8h, v3.8b, v24.8b // 64*d -.endm -
x265_3.6.tar.gz/source/common/aarch64/ipfilter-sve2.S
Deleted
@@ -1,1282 +0,0 @@ -/***************************************************************************** - * Copyright (C) 2022-2023 MulticoreWare, Inc - * - * Authors: David Chen <david.chen@myais.com.cn> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. - * - * This program is also available under a commercial proprietary license. - * For more information, contact us at license @ x265.com. - *****************************************************************************/ - -// Functions in this file: -// ***** luma_vpp ***** -// ***** luma_vps ***** -// ***** luma_vsp ***** -// ***** luma_vss ***** -// ***** luma_hpp ***** -// ***** luma_hps ***** -// ***** chroma_vpp ***** -// ***** chroma_vps ***** -// ***** chroma_vsp ***** -// ***** chroma_vss ***** -// ***** chroma_hpp ***** -// ***** chroma_hps ***** - -#include "asm-sve.S" -#include "ipfilter-common.S" - -.arch armv8-a+sve2 - -#ifdef __APPLE__ -.section __RODATA,__rodata -#else -.section .rodata -#endif - -.align 4 - -.text - -.macro qpel_load_32b_sve2 v -.if \v == 0 - add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0 - ld1b {z3.h}, p0/z, x6 - add x6, x6, x1 -.elseif \v == 1 || \v == 2 || \v == 3 -.if \v != 3 // not used in qpel_filter_3 - ld1b {z0.h}, p0/z, x6 - add x6, x6, x1 -.else - add x6, x6, x1 -.endif - ld1b {z1.h}, p0/z, x6 - add x6, x6, x1 - ld1b {z2.h}, p0/z, x6 - add x6, x6, x1 - ld1b {z3.h}, p0/z, x6 - add x6, x6, x1 - ld1b {z4.h}, p0/z, x6 - add x6, x6, x1 - ld1b {z5.h}, p0/z, x6 - add x6, x6, x1 -.if \v != 1 // not used in qpel_filter_1 - ld1b {z6.h}, p0/z, x6 - add x6, x6, x1 - ld1b {z7.h}, p0/z, x6 -.else - ld1b {z6.h}, p0/z, x6 -.endif -.endif -.endm - -.macro qpel_load_64b_sve2_gt_16 v -.if \v == 0 - add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0 - ld1b {z3.h}, p2/z, x6 - add x6, x6, x1 -.elseif \v == 1 || \v == 2 || \v == 3 -.if \v != 3 // not used in qpel_filter_3 - ld1b {z0.h}, p2/z, x6 - add x6, x6, x1 -.else - add x6, x6, x1 -.endif - ld1b {z1.h}, p2/z, x6 - add x6, x6, x1 - ld1b {z2.h}, p2/z, x6 - add x6, x6, x1 - ld1b {z3.h}, p2/z, x6 - add x6, x6, x1 - ld1b {z4.h}, p2/z, x6 - add x6, x6, x1 - ld1b {z5.h}, p2/z, x6 - add x6, x6, x1 -.if \v != 1 // not used in qpel_filter_1 - ld1b {z6.h}, p2/z, x6 - add x6, x6, x1 - ld1b {z7.h}, p2/z, x6 -.else - ld1b {z6.h}, p2/z, x6 -.endif -.endif -.endm - -.macro qpel_chroma_load_32b_sve2 v -.if \v == 0 - // qpel_filter_chroma_0 only uses values in v1 - add x6, x6, x1 - ld1b {z1.h}, p0/z, x6 -.else - ld1b {z0.h}, p0/z, x6 - add x6, x6, x1 - ld1b {z1.h}, p0/z, x6 - add x6, x6, x1 - ld1b {z2.h}, p0/z, x6 - add x6, x6, x1 - ld1b {z3.h}, p0/z, x6 -.endif -.endm - -.macro qpel_start_sve2_0 - mov z24.h, #64 -.endm - -.macro qpel_filter_sve2_0_32b - mul z17.h, z3.h, z24.h // 64*d -.endm - -.macro qpel_filter_sve2_0_64b - qpel_filter_sve2_0_32b - mul z18.h, z11.h, 
z24.h -.endm - -.macro qpel_start_sve2_1 - mov z24.h, #58 - mov z25.h, #10 - mov z26.h, #17 - mov z27.h, #5 -.endm - -.macro qpel_filter_sve2_1_32b - mul z19.h, z2.h, z25.h // c*10 - mul z17.h, z3.h, z24.h // d*58 - mul z21.h, z4.h, z26.h // e*17 - mul z23.h, z5.h, z27.h // f*5 - sub z17.h, z17.h, z19.h // d*58 - c*10 - lsl z18.h, z1.h, #2 // b*4 - add z17.h, z17.h, z21.h // d*58 - c*10 + e*17 - sub z21.h, z6.h, z0.h // g - a - add z17.h, z17.h, z18.h // d*58 - c*10 + e*17 + b*4 - sub z21.h, z21.h, z23.h // g - a - f*5 - add z17.h, z17.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5 -.endm - -.macro qpel_filter_sve2_1_64b - qpel_filter_sve2_1_32b - mul z20.h, z10.h, z25.h // c*10 - mul z18.h, z11.h, z24.h // d*58 - mul z21.h, z12.h, z26.h // e*17 - mul z23.h, z13.h, z27.h // f*5 - sub z18.h, z18.h, z20.h // d*58 - c*10 - lsl z28.h, z30.h, #2 // b*4 - add z18.h, z18.h, z21.h // d*58 - c*10 + e*17 - sub z21.h, z14.h, z29.h // g - a - add z18.h, z18.h, z28.h // d*58 - c*10 + e*17 + b*4 - sub z21.h, z21.h, z23.h // g - a - f*5 - add z18.h, z18.h, z21.h // d*58 - c*10 + e*17 + b*4 + g - a - f*5 -.endm - -.macro qpel_start_sve2_2 - mov z24.h, #11 - mov z25.h, #40 -.endm - -.macro qpel_filter_sve2_2_32b - add z17.h, z3.h, z4.h // d + e - add z19.h, z2.h, z5.h // c + f - add z23.h, z1.h, z6.h // b + g - add z21.h, z0.h, z7.h // a + h - mul z17.h, z17.h, z25.h // 40 * (d + e) - mul z19.h, z19.h, z24.h // 11 * (c + f) - lsl z23.h, z23.h, #2 // (b + g) * 4 - add z19.h, z19.h, z21.h // 11 * (c + f) + a + h - add z17.h, z17.h, z23.h // 40 * (d + e) + (b + g) * 4 - sub z17.h, z17.h, z19.h // 40 * (d + e) + (b + g) * 4 - 11 * (c + f) - a - h -.endm -
x265_3.6.tar.gz/source/common/aarch64/ipfilter.S
Deleted
@@ -1,1054 +0,0 @@ -/***************************************************************************** - * Copyright (C) 2021 MulticoreWare, Inc - * - * Authors: Sebastian Pop <spop@amazon.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. - * - * This program is also available under a commercial proprietary license. - * For more information, contact us at license @ x265.com. - *****************************************************************************/ - -// Functions in this file: -// ***** luma_vpp ***** -// ***** luma_vps ***** -// ***** luma_vsp ***** -// ***** luma_vss ***** -// ***** luma_hpp ***** -// ***** luma_hps ***** -// ***** chroma_vpp ***** -// ***** chroma_vps ***** -// ***** chroma_vsp ***** -// ***** chroma_vss ***** -// ***** chroma_hpp ***** -// ***** chroma_hps ***** - -#include "asm.S" -#include "ipfilter-common.S" - -#ifdef __APPLE__ -.section __RODATA,__rodata -#else -.section .rodata -#endif - -.align 4 - -.text - -// ***** luma_vpp ***** -// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx) -.macro LUMA_VPP_4xN h -function x265_interp_8tap_vert_pp_4x\h\()_neon - movrel x10, g_luma_s16 - sub x0, x0, x1 - sub x0, x0, x1, lsl #1 // src -= 3 * srcStride - lsl x4, x4, #4 - ldr q0, x10, x4 // q0 = luma interpolate coeff - dup v24.8h, v0.h0 - dup v25.8h, v0.h1 - trn1 v24.2d, v24.2d, v25.2d - dup v26.8h, v0.h2 - dup v27.8h, v0.h3 - trn1 v26.2d, v26.2d, v27.2d - dup v28.8h, v0.h4 - dup v29.8h, v0.h5 - trn1 v28.2d, v28.2d, v29.2d - dup v30.8h, v0.h6 - dup v31.8h, v0.h7 - trn1 v30.2d, v30.2d, v31.2d - - // prepare to load 8 lines - ld1 {v0.s}0, x0, x1 - ld1 {v0.s}1, x0, x1 - ushll v0.8h, v0.8b, #0 - ld1 {v1.s}0, x0, x1 - ld1 {v1.s}1, x0, x1 - ushll v1.8h, v1.8b, #0 - ld1 {v2.s}0, x0, x1 - ld1 {v2.s}1, x0, x1 - ushll v2.8h, v2.8b, #0 - ld1 {v3.s}0, x0, x1 - ld1 {v3.s}1, x0, x1 - ushll v3.8h, v3.8b, #0 - - mov x9, #\h -.loop_4x\h: - ld1 {v4.s}0, x0, x1 - ld1 {v4.s}1, x0, x1 - ushll v4.8h, v4.8b, #0 - - // row0-1 - mul v16.8h, v0.8h, v24.8h - ext v21.16b, v0.16b, v1.16b, #8 - mul v17.8h, v21.8h, v24.8h - mov v0.16b, v1.16b - - // row2-3 - mla v16.8h, v1.8h, v26.8h - ext v21.16b, v1.16b, v2.16b, #8 - mla v17.8h, v21.8h, v26.8h - mov v1.16b, v2.16b - - // row4-5 - mla v16.8h, v2.8h, v28.8h - ext v21.16b, v2.16b, v3.16b, #8 - mla v17.8h, v21.8h, v28.8h - mov v2.16b, v3.16b - - // row6-7 - mla v16.8h, v3.8h, v30.8h - ext v21.16b, v3.16b, v4.16b, #8 - mla v17.8h, v21.8h, v30.8h - mov v3.16b, v4.16b - - // sum row0-7 - trn1 v20.2d, v16.2d, v17.2d - trn2 v21.2d, v16.2d, v17.2d - add v16.8h, v20.8h, v21.8h - - sqrshrun v16.8b, v16.8h, #6 - st1 {v16.s}0, x2, x3 - st1 {v16.s}1, x2, x3 - - sub x9, x9, #2 - cbnz x9, .loop_4x\h - ret -endfunc -.endm - -LUMA_VPP_4xN 4 -LUMA_VPP_4xN 8 -LUMA_VPP_4xN 16 - -// void interp_vert_pp_c(const pixel* src, intptr_t srcStride, 
pixel* dst, intptr_t dstStride, int coeffIdx) -.macro LUMA_VPP w, h -function x265_interp_8tap_vert_pp_\w\()x\h\()_neon - cmp x4, #0 - b.eq 0f - cmp x4, #1 - b.eq 1f - cmp x4, #2 - b.eq 2f - cmp x4, #3 - b.eq 3f -0: - FILTER_LUMA_VPP \w, \h, 0 -1: - FILTER_LUMA_VPP \w, \h, 1 -2: - FILTER_LUMA_VPP \w, \h, 2 -3: - FILTER_LUMA_VPP \w, \h, 3 -endfunc -.endm - -LUMA_VPP 8, 4 -LUMA_VPP 8, 8 -LUMA_VPP 8, 16 -LUMA_VPP 8, 32 -LUMA_VPP 12, 16 -LUMA_VPP 16, 4 -LUMA_VPP 16, 8 -LUMA_VPP 16, 16 -LUMA_VPP 16, 32 -LUMA_VPP 16, 64 -LUMA_VPP 16, 12 -LUMA_VPP 24, 32 -LUMA_VPP 32, 8 -LUMA_VPP 32, 16 -LUMA_VPP 32, 32 -LUMA_VPP 32, 64 -LUMA_VPP 32, 24 -LUMA_VPP 48, 64 -LUMA_VPP 64, 16 -LUMA_VPP 64, 32 -LUMA_VPP 64, 64 -LUMA_VPP 64, 48 - -// ***** luma_vps ***** -// void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx) -.macro LUMA_VPS_4xN h -function x265_interp_8tap_vert_ps_4x\h\()_neon - lsl x3, x3, #1 - lsl x5, x4, #6 - lsl x4, x1, #2 - sub x4, x4, x1 - sub x0, x0, x4 - - mov w6, #8192 - dup v28.4s, w6 - mov x4, #\h - movrel x12, g_lumaFilter - add x12, x12, x5 - ld1r {v16.2d}, x12, #8 - ld1r {v17.2d}, x12, #8 - ld1r {v18.2d}, x12, #8 - ld1r {v19.2d}, x12, #8
x265_3.6.tar.gz/source/common/aarch64/sad-a-common.S
Deleted
@@ -1,514 +0,0 @@ -/***************************************************************************** - * Copyright (C) 2022-2023 MulticoreWare, Inc - * - * Authors: David Chen <david.chen@myais.com.cn> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. - * - * This program is also available under a commercial proprietary license. - * For more information, contact us at license @ x265.com. - *****************************************************************************/ - -// This file contains the macros written using NEON instruction set -// that are also used by the SVE2 functions - -#include "asm.S" - -.arch armv8-a - -#ifdef __APPLE__ -.section __RODATA,__rodata -#else -.section .rodata -#endif - -.align 4 - -.macro SAD_START_4 f - ld1 {v0.s}0, x0, x1 - ld1 {v0.s}1, x0, x1 - ld1 {v1.s}0, x2, x3 - ld1 {v1.s}1, x2, x3 - \f v16.8h, v0.8b, v1.8b -.endm - -.macro SAD_4 h -.rept \h / 2 - 1 - SAD_START_4 uabal -.endr -.endm - -.macro SAD_START_8 f - ld1 {v0.8b}, x0, x1 - ld1 {v1.8b}, x2, x3 - ld1 {v2.8b}, x0, x1 - ld1 {v3.8b}, x2, x3 - \f v16.8h, v0.8b, v1.8b - \f v17.8h, v2.8b, v3.8b -.endm - -.macro SAD_8 h -.rept \h / 2 - 1 - SAD_START_8 uabal -.endr -.endm - -.macro SAD_START_16 f - ld1 {v0.16b}, x0, x1 - ld1 {v1.16b}, x2, x3 - ld1 {v2.16b}, x0, x1 - ld1 {v3.16b}, x2, x3 - \f v16.8h, v0.8b, v1.8b - \f\()2 v17.8h, v0.16b, v1.16b - uabal v16.8h, v2.8b, v3.8b - uabal2 v17.8h, v2.16b, v3.16b -.endm - -.macro SAD_16 h -.rept \h / 2 - 1 - SAD_START_16 uabal -.endr -.endm - -.macro SAD_START_32 - movi v16.16b, #0 - movi v17.16b, #0 - movi v18.16b, #0 - movi v19.16b, #0 -.endm - -.macro SAD_32 - ld1 {v0.16b-v1.16b}, x0, x1 - ld1 {v2.16b-v3.16b}, x2, x3 - ld1 {v4.16b-v5.16b}, x0, x1 - ld1 {v6.16b-v7.16b}, x2, x3 - uabal v16.8h, v0.8b, v2.8b - uabal2 v17.8h, v0.16b, v2.16b - uabal v18.8h, v1.8b, v3.8b - uabal2 v19.8h, v1.16b, v3.16b - uabal v16.8h, v4.8b, v6.8b - uabal2 v17.8h, v4.16b, v6.16b - uabal v18.8h, v5.8b, v7.8b - uabal2 v19.8h, v5.16b, v7.16b -.endm - -.macro SAD_END_32 - add v16.8h, v16.8h, v17.8h - add v17.8h, v18.8h, v19.8h - add v16.8h, v16.8h, v17.8h - uaddlv s0, v16.8h - fmov w0, s0 - ret -.endm - -.macro SAD_START_64 - movi v16.16b, #0 - movi v17.16b, #0 - movi v18.16b, #0 - movi v19.16b, #0 - movi v20.16b, #0 - movi v21.16b, #0 - movi v22.16b, #0 - movi v23.16b, #0 -.endm - -.macro SAD_64 - ld1 {v0.16b-v3.16b}, x0, x1 - ld1 {v4.16b-v7.16b}, x2, x3 - ld1 {v24.16b-v27.16b}, x0, x1 - ld1 {v28.16b-v31.16b}, x2, x3 - uabal v16.8h, v0.8b, v4.8b - uabal2 v17.8h, v0.16b, v4.16b - uabal v18.8h, v1.8b, v5.8b - uabal2 v19.8h, v1.16b, v5.16b - uabal v20.8h, v2.8b, v6.8b - uabal2 v21.8h, v2.16b, v6.16b - uabal v22.8h, v3.8b, v7.8b - uabal2 v23.8h, v3.16b, v7.16b - - uabal v16.8h, v24.8b, v28.8b - uabal2 v17.8h, v24.16b, v28.16b - uabal v18.8h, v25.8b, v29.8b - uabal2 v19.8h, v25.16b, v29.16b - uabal v20.8h, v26.8b, v30.8b - uabal2 
v21.8h, v26.16b, v30.16b - uabal v22.8h, v27.8b, v31.8b - uabal2 v23.8h, v27.16b, v31.16b -.endm - -.macro SAD_END_64 - add v16.8h, v16.8h, v17.8h - add v17.8h, v18.8h, v19.8h - add v16.8h, v16.8h, v17.8h - uaddlp v16.4s, v16.8h - add v18.8h, v20.8h, v21.8h - add v19.8h, v22.8h, v23.8h - add v17.8h, v18.8h, v19.8h - uaddlp v17.4s, v17.8h - add v16.4s, v16.4s, v17.4s - uaddlv d0, v16.4s - fmov x0, d0 - ret -.endm - -.macro SAD_START_12 - movrel x12, sad12_mask - ld1 {v31.16b}, x12 - movi v16.16b, #0 - movi v17.16b, #0 -.endm - -.macro SAD_12 - ld1 {v0.16b}, x0, x1 - and v0.16b, v0.16b, v31.16b - ld1 {v1.16b}, x2, x3 - and v1.16b, v1.16b, v31.16b - ld1 {v2.16b}, x0, x1 - and v2.16b, v2.16b, v31.16b - ld1 {v3.16b}, x2, x3 - and v3.16b, v3.16b, v31.16b - uabal v16.8h, v0.8b, v1.8b - uabal2 v17.8h, v0.16b, v1.16b - uabal v16.8h, v2.8b, v3.8b - uabal2 v17.8h, v2.16b, v3.16b -.endm - -.macro SAD_END_12 - add v16.8h, v16.8h, v17.8h - uaddlv s0, v16.8h - fmov w0, s0 - ret -.endm - -.macro SAD_START_24 - movi v16.16b, #0 - movi v17.16b, #0 - movi v18.16b, #0 - sub x1, x1, #16
x265_3.6.tar.gz/source/common/aarch64/sad-a-sve2.S
Deleted
@@ -1,511 +0,0 @@ -/***************************************************************************** - * Copyright (C) 2022-2023 MulticoreWare, Inc - * - * Authors: David Chen <david.chen@myais.com.cn> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. - * - * This program is also available under a commercial proprietary license. - * For more information, contact us at license @ x265.com. - *****************************************************************************/ - -#include "asm-sve.S" -#include "sad-a-common.S" - -.arch armv8-a+sve2 - -#ifdef __APPLE__ -.section __RODATA,__rodata -#else -.section .rodata -#endif - -.align 4 - -.text - -.macro SAD_SVE2_16 h - mov z16.d, #0 - ptrue p0.h, vl16 -.rept \h - ld1b {z0.h}, p0/z, x0 - ld1b {z2.h}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - uaba z16.h, z0.h, z2.h -.endr - uaddv d0, p0, z16.h - fmov w0, s0 - ret -.endm - -.macro SAD_SVE2_32 h - ptrue p0.b, vl32 -.rept \h - ld1b {z0.b}, p0/z, x0 - ld1b {z4.b}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - uabalb z16.h, z0.b, z4.b - uabalt z16.h, z0.b, z4.b -.endr - uaddv d0, p0, z16.h - fmov w0, s0 - ret -.endm - -.macro SAD_SVE2_64 h - cmp x9, #48 - bgt .vl_gt_48_pixel_sad_64x\h - mov z16.d, #0 - mov z17.d, #0 - mov z18.d, #0 - mov z19.d, #0 - ptrue p0.b, vl32 -.rept \h - ld1b {z0.b}, p0/z, x0 - ld1b {z1.b}, p0/z, x0, #1, mul vl - ld1b {z4.b}, p0/z, x2 - ld1b {z5.b}, p0/z, x2, #1, mul vl - add x0, x0, x1 - add x2, x2, x3 - uabalb z16.h, z0.b, z4.b - uabalt z17.h, z0.b, z4.b - uabalb z18.h, z1.b, z5.b - uabalt z19.h, z1.b, z5.b -.endr - add z16.h, z16.h, z17.h - add z17.h, z18.h, z19.h - add z16.h, z16.h, z17.h - uadalp z24.s, p0/m, z16.h - uaddv d5, p0, z24.s - fmov x0, d5 - ret -.vl_gt_48_pixel_sad_64x\h\(): - mov z16.d, #0 - mov z17.d, #0 - mov z24.d, #0 - ptrue p0.b, vl64 -.rept \h - ld1b {z0.b}, p0/z, x0 - ld1b {z4.b}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - uabalb z16.h, z0.b, z4.b - uabalt z17.h, z0.b, z4.b -.endr - add z16.h, z16.h, z17.h - uadalp z24.s, p0/m, z16.h - uaddv d5, p0, z24.s - fmov x0, d5 - ret -.endm - -.macro SAD_SVE2_24 h - mov z16.d, #0 - mov x10, #24 - mov x11, #0 - whilelt p0.b, x11, x10 -.rept \h - ld1b {z0.b}, p0/z, x0 - ld1b {z8.b}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - uabalb z16.h, z0.b, z8.b - uabalt z16.h, z0.b, z8.b -.endr - uaddv d5, p0, z16.h - fmov w0, s5 - ret -.endm - -.macro SAD_SVE2_48 h - cmp x9, #48 - bgt .vl_gt_48_pixel_sad_48x\h - mov z16.d, #0 - mov z17.d, #0 - mov z18.d, #0 - mov z19.d, #0 - ptrue p0.b, vl32 - ptrue p1.b, vl16 -.rept \h - ld1b {z0.b}, p0/z, x0 - ld1b {z1.b}, p1/z, x0, #1, mul vl - ld1b {z8.b}, p0/z, x2 - ld1b {z9.b}, p1/z, x2, #1, mul vl - add x0, x0, x1 - add x2, x2, x3 - uabalb z16.h, z0.b, z8.b - uabalt z17.h, z0.b, z8.b - uabalb z18.h, z1.b, z9.b - uabalt z19.h, z1.b, z9.b -.endr - add z16.h, z16.h, z17.h - add z17.h, z18.h, z19.h - add z16.h, z16.h, 
z17.h - uaddv d5, p0, z16.h - fmov w0, s5 - ret -.vl_gt_48_pixel_sad_48x\h\(): - mov z16.d, #0 - mov z17.d, #0 - mov x10, #48 - mov x11, #0 - whilelt p0.b, x11, x10 -.rept \h - ld1b {z0.b}, p0/z, x0 - ld1b {z8.b}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - uabalb z16.h, z0.b, z8.b - uabalt z17.h, z0.b, z8.b -.endr - add z16.h, z16.h, z17.h - uaddv d5, p0, z16.h - fmov w0, s5 - ret -.endm - -// Fully unrolled. -.macro SAD_FUNC_SVE2 w, h -function PFX(pixel_sad_\w\()x\h\()_sve2) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_pixel_sad_\w\()x\h - SAD_START_\w uabdl - SAD_\w \h -.if \w > 4 - add v16.8h, v16.8h, v17.8h -.endif - uaddlv s0, v16.8h - fmov w0, s0 - ret -.vl_gt_16_pixel_sad_\w\()x\h\(): -.if \w == 4 || \w == 8 || \w == 12 - SAD_START_\w uabdl - SAD_\w \h -.if \w > 4
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve.S
Deleted
@@ -1,78 +0,0 @@ -/***************************************************************************** - * Copyright (C) 2022-2023 MulticoreWare, Inc - * - * Authors: David Chen <david.chen@myais.com.cn> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. - * - * This program is also available under a commercial proprietary license. - * For more information, contact us at license @ x265.com. - *****************************************************************************/ - -#include "asm-sve.S" - -.arch armv8-a+sve - -#ifdef __APPLE__ -.section __RODATA,__rodata -#else -.section .rodata -#endif - -.align 4 - -.text - -function PFX(pixel_sse_pp_4x4_sve) - ptrue p0.s, vl4 - ld1b {z0.s}, p0/z, x0 - ld1b {z17.s}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - sub z0.s, p0/m, z0.s, z17.s - mul z0.s, p0/m, z0.s, z0.s -.rept 3 - ld1b {z16.s}, p0/z, x0 - ld1b {z17.s}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - sub z16.s, p0/m, z16.s, z17.s - mla z0.s, p0/m, z16.s, z16.s -.endr - uaddv d0, p0, z0.s - fmov w0, s0 - ret -endfunc - -function PFX(pixel_sse_pp_4x8_sve) - ptrue p0.s, vl4 - ld1b {z0.s}, p0/z, x0 - ld1b {z17.s}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - sub z0.s, p0/m, z0.s, z17.s - mul z0.s, p0/m, z0.s, z0.s -.rept 7 - ld1b {z16.s}, p0/z, x0 - ld1b {z17.s}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - sub z16.s, p0/m, z16.s, z17.s - mla z0.s, p0/m, z16.s, z16.s -.endr - uaddv d0, p0, z0.s - fmov w0, s0 - ret -endfunc
x265_4.0.tar.gz/.readthedocs.yaml
Added
@@ -0,0 +1,27 @@
+# Read the Docs configuration file for Sphinx projects
+# .readthedocs.yaml
+
+# Project Information
+# Required
+version: 2
+
+build:
+  os: "ubuntu-20.04"
+  tools:
+    python: "3.10"
+
+# Use a requirements file for pip dependencies
+python:
+  install:
+    - requirements: doc/requirements.txt
+
+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+  builder: html
+  configuration: doc/reST/conf.py
+  fail_on_warning: false
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+#   - pdf
+#   - epub
x265_3.6.tar.gz/build/README.txt -> x265_4.0.tar.gz/build/README.txt
Changed
@@ -94,22 +94,42 @@
 = Build Instructions for cross-compilation for Arm AArch64 Targets=
 
-When the target platform is based on Arm AArch64 architecture, the x265 can be
-built in x86 platforms. However, the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER
-enviroment variables should be set to point to the cross compilers of the
-appropriate gcc. For example:
+Cross compilation of x265 for AArch64 targets is possible on x86 platforms by
+passing a toolchain file when running CMake to configure the project:
 
-1. export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
-2. export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
+* cmake -DCMAKE_TOOLCHAIN_FILE=<path-to-toolchain-file>
 
-The default ones are aarch64-linux-gnu-gcc and aarch64-linux-gnu-g++.
-Then, the normal building process can be followed.
+Toolchain files for AArch64 cross-compilation exist in the /build directory.
+These specify a default cross-compiler to use; however this can be overridden
+by setting the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER CMake variables when
+running CMake to configure the project. For example:
 
-Moreover, if the target platform supports SVE or SVE2 instruction set, the
-CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variables should be set
-to true, respectively. For example:
+* cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++
 
-1. export CROSS_COMPILE_SVE2=true
-2. export CROSS_COMPILE_SVE=true
+If target platform supports Armv8.4 Neon DotProd instructions, the
+CROSS_COMPILE_NEON_DOTPROD CMake option should be set to ON:
 
-Then, the normal building process can be followed.
+* cmake -DCROSS_COMPILE_NEON_DOTPROD=ON <other configuration options...>
+
+If target platform supports Armv8.6 Neon I8MM instructions, the
+CROSS_COMPILE_NEON_I8MM CMake option should be set to ON:
+
+* cmake -DCROSS_COMPILE_NEON_I8MM=ON <other configuration options...>
+
+If the target platform supports SVE or SVE2, CROSS_COMPILE_SVE or
+CROSS_COMPILE_SVE2 CMake options should be set to ON, respectively.
+For example, when running CMake to configure the project:
+
+1. cmake -DCROSS_COMPILE_SVE=ON <other configuration options...>
+2. cmake -DCROSS_COMPILE_SVE2=ON <other configuration options...>
+
+Note: when the CROSS_COMPILE_SVE option is set to ON the build configuration will
+also compile for Neon DotProd and I8MM, as we impose the constraint that SVE implies
+both Neon DotProd and I8MM.
+
+Similarly when the CROSS_COMPILE_SVE2 option is set to ON the build configuration
+will also compile for Neon I8MM, as we impose the constraint that SVE2 implies Neon
+I8MM. SVE2 already implies that Neon DotProd is implemented since SVE2 is an Armv9.0
+feature which implies Armv8.5, and Neon DotProd is mandatory from Armv8.4.
+
+Then, the normal build process can be followed.
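Putting the updated instructions together, one plausible out-of-tree cross-compile of an SVE2-capable build might look like the following; the build directory and job count are illustrative:

    cd build/aarch64-linux
    cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -DCROSS_COMPILE_SVE2=ON ../../source
    make -j"$(nproc)"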
x265_3.6.tar.gz/build/aarch64-darwin/crosscompile.cmake -> x265_4.0.tar.gz/build/aarch64-darwin/crosscompile.cmake
Changed
@@ -7,17 +7,14 @@
 set(CMAKE_SYSTEM_NAME Darwin)
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
 
-# specify the cross compiler
-set(CMAKE_C_COMPILER gcc-12)
-set(CMAKE_CXX_COMPILER g++-12)
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
+if(NOT DEFINED CMAKE_C_COMPILER)
+    set(CMAKE_C_COMPILER gcc)
+endif()
+if(NOT DEFINED CMAKE_CXX_COMPILER)
+    set(CMAKE_CXX_COMPILER g++)
+endif()
 
 # specify the target environment
 SET(CMAKE_FIND_ROOT_PATH /opt/homebrew/bin/)
 
-# specify whether SVE/SVE2 is supported by the target platform
-if(DEFINED ENV{CROSS_COMPILE_SVE2})
-    set(CROSS_COMPILE_SVE2 1)
-elseif(DEFINED ENV{CROSS_COMPILE_SVE})
-    set(CROSS_COMPILE_SVE 1)
-endif()
-
x265_4.0.tar.gz/build/aarch64-linux-clang
Added
+(directory)
x265_4.0.tar.gz/build/aarch64-linux-clang/crosscompile.cmake
Added
@@ -0,0 +1,25 @@
+# CMake toolchain file for cross compiling x265 for AArch64, using Clang.
+
+set(CROSS_COMPILE_ARM64 1)
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+set(TARGET_TRIPLE aarch64-linux-gnu)
+
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
+if(NOT DEFINED CMAKE_C_COMPILER)
+    set(CMAKE_C_COMPILER clang)
+endif()
+if(NOT DEFINED CMAKE_CXX_COMPILER)
+    set(CMAKE_CXX_COMPILER clang++)
+endif()
+
+# specify compiler target
+set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE})
+set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE})
+
+# specify assembler target
+list(APPEND ASM_FLAGS "--target=${TARGET_TRIPLE}")
+
+# specify the target environment
+SET(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu)
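A possible way to use this new Clang toolchain file when configuring from the top of the source tree; the clang-17/clang++-17 names are only an example of overriding the defaults picked by the toolchain file:

    cmake -DCMAKE_TOOLCHAIN_FILE=build/aarch64-linux-clang/crosscompile.cmake source
    # or point it at a specific Clang version instead of the default clang/clang++:
    cmake -DCMAKE_TOOLCHAIN_FILE=build/aarch64-linux-clang/crosscompile.cmake \
          -DCMAKE_C_COMPILER=clang-17 -DCMAKE_CXX_COMPILER=clang++-17 source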
x265_3.6.tar.gz/build/aarch64-linux/crosscompile.cmake -> x265_4.0.tar.gz/build/aarch64-linux/crosscompile.cmake
Changed
@@ -7,25 +7,14 @@
 set(CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
 
-# specify the cross compiler
-if(DEFINED ENV{CMAKE_C_COMPILER})
-    set(CMAKE_C_COMPILER $ENV{CMAKE_C_COMPILER})
-else()
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
+if(NOT DEFINED CMAKE_C_COMPILER)
     set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
 endif()
-if(DEFINED ENV{CMAKE_CXX_COMPILER})
-    set(CMAKE_CXX_COMPILER $ENV{CMAKE_CXX_COMPILER})
-else()
+if(NOT DEFINED CMAKE_CXX_COMPILER)
     set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
 endif()
 
 # specify the target environment
 SET(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu)
 
-# specify whether SVE/SVE2 is supported by the target platform
-if(DEFINED ENV{CROSS_COMPILE_SVE2})
-    set(CROSS_COMPILE_SVE2 1)
-elseif(DEFINED ENV{CROSS_COMPILE_SVE})
-    set(CROSS_COMPILE_SVE 1)
-endif()
-
x265_4.0.tar.gz/build/vc17-x86
Added
+(directory)
x265_4.0.tar.gz/build/vc17-x86/build-all.bat
Added
@@ -0,0 +1,23 @@ +@echo off +setlocal enabledelayedexpansion +if "%VS170COMNTOOLS%" == "" ( +for /f "usebackq tokens=1* delims=: " %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest `) do ( + if /i "%%i"=="productPath" ( + set VS170COMNTOOLS=%%j +) +) +) +setx VS170COMNTOOLS "!VS170COMNTOOLS!" +if "%VS170COMNTOOLS%" == "" ( + msg "%username%" "Visual Studio 17 not detected" + exit 1 +) +if not exist x265.sln ( + call make-solutions.bat +) +if exist x265.sln ( + call "%VS170COMNTOOLS%\..\..\tools\VsDevCmd.bat" + MSBuild /property:Configuration="Release" x265.sln + MSBuild /property:Configuration="Debug" x265.sln + MSBuild /property:Configuration="RelWithDebInfo" x265.sln +)
x265_4.0.tar.gz/build/vc17-x86/make-solutions.bat
Added
@@ -0,0 +1,6 @@
+@echo off
+::
+:: run this batch file to create a Visual Studio solution file for this project.
+:: See the cmake documentation for other generator targets
+::
+cmake -G "Visual Studio 17 2022" ..\..\source && cmake-gui ..\..\source
x265_4.0.tar.gz/build/vc17-x86_64
Added
+(directory)
x265_4.0.tar.gz/build/vc17-x86_64/build-all.bat
Added
@@ -0,0 +1,23 @@ +@echo off +setlocal enabledelayedexpansion +if "%VS170COMNTOOLS%" == "" ( +for /f "usebackq tokens=1* delims=: " %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest `) do ( + if /i "%%i"=="productPath" ( + set VS170COMNTOOLS=%%j +) +) +) +setx VS170COMNTOOLS "!VS170COMNTOOLS!" +if "%VS170COMNTOOLS%" == "" ( + msg "%username%" "Visual Studio 17 not detected" + exit 1 +) +if not exist x265.sln ( + call make-solutions.bat +) +if exist x265.sln ( + call "%VS170COMNTOOLS%\..\..\tools\VsDevCmd.bat" + MSBuild /property:Configuration="Release" x265.sln + MSBuild /property:Configuration="Debug" x265.sln + MSBuild /property:Configuration="RelWithDebInfo" x265.sln +)
x265_4.0.tar.gz/build/vc17-x86_64/make-solutions.bat
Added
@@ -0,0 +1,6 @@
+@echo off
+::
+:: run this batch file to create a Visual Studio solution file for this project.
+:: See the cmake documentation for other generator targets
+::
+cmake -G "Visual Studio 17 2022" ..\..\source && cmake-gui ..\..\source
x265_4.0.tar.gz/build/vc17-x86_64/multilib.bat
Added
@@ -0,0 +1,47 @@ +@echo off +setlocal enabledelayedexpansion +if "%VS170COMNTOOLS%" == "" ( +for /f "usebackq tokens=1* delims=: " %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest `) do ( + if /i "%%i"=="productPath" ( + set VS170COMNTOOLS=%%j +) +) +) +setx VS170COMNTOOLS "!VS170COMNTOOLS!" +call "%VS170COMNTOOLS%\..\..\tools\VsDevCmd.bat" +@mkdir 12bit +@mkdir 10bit +@mkdir 8bit + +@cd 12bit +cmake -G "Visual Studio 17 2022" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON +if exist x265.sln ( + MSBuild /property:Configuration="Release" x265.sln + copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib +) + +@cd ..\10bit +cmake -G "Visual Studio 17 2022" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF +if exist x265.sln ( + MSBuild /property:Configuration="Release" x265.sln + copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib +) + +@cd ..\8bit +if not exist x265-static-main10.lib ( + msg "%username%" "10bit build failed" + exit 1 +) +if not exist x265-static-main12.lib ( + msg "%username%" "12bit build failed" + exit 1 +) +cmake -G "Visual Studio 17 2022" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON +if exist x265.sln ( + MSBuild /property:Configuration="Release" x265.sln + :: combine static libraries (ignore warnings caused by winxp.cpp hacks) + move Release\x265-static.lib x265-static-main.lib + LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib +) + +pause \ No newline at end of file
x265_3.6.tar.gz/doc/reST/api.rst -> x265_4.0.tar.gz/doc/reST/api.rst
Changed
@@ -419,21 +419,21 @@ void x265_cleanup(void); VMAF (Video Multi-Method Assessment Fusion) -========================================== +=========================================== If you set the ENABLE_LIBVMAF cmake option to ON, then x265 will report per frame and aggregate VMAF score for the given input and dump the scores in csv file. -The user also need to specify the :option:`--recon` in command line to get the VMAF scores. +The user also need to specify the :option:`--recon` in command line to get the VMAF scores.:: /* x265_calculate_vmafScore: - * returns VMAF score for the input video. - * This api must be called only after encoding was done. */ - double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*); + * returns VMAF score for the input video. + * This API must be called only after encoding was done. */ + double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*); /* x265_calculate_vmaf_framelevelscore: - * returns VMAF score for each frame in a given input video. The frame level VMAF score does not include temporal scores. */ - double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*); - + * returns VMAF score for each frame in a given input video. The frame level VMAF score does not include temporal scores. */ + double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*); + .. Note:: When setting ENABLE_LIBVMAF cmake option to ON, it is recommended to
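As the api.rst section above notes, VMAF reporting requires both a libvmaf-enabled build (ENABLE_LIBVMAF=ON) and a reconstructed-output file passed on the command line. A rough sketch of such a run, with hypothetical file names:

    cmake -DENABLE_LIBVMAF=ON ../source && make
    ./x265 --input input.y4m --recon recon.y4m -o out.hevc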
x265_3.6.tar.gz/doc/reST/cli.rst -> x265_4.0.tar.gz/doc/reST/cli.rst
Changed
@@ -822,7 +822,7 @@ metrics from the 4 sub-CUs. When multiple inter modes like :option:`--rect` and/or :option:`--amp` are enabled, this feature will use motion cost heuristics from the 4 sub-CUs to bypass modes that are unlikely to be the - best choice. This can significantly improve performance when :option:`rect` + best choice. This can significantly improve performance when :option:`--rect` and/or :option:`--amp` are enabled at minimal compression efficiency loss. .. option:: --rect, --no-rect @@ -983,7 +983,7 @@ Store/normalize ctu distortion in analysis-save/load. 0 - Disabled. 1 - Save ctu distortion to the analysis file specified during :option:`--analysis-save`. - Load CTU distortion from the analysis file and normalize it across every frame during :option:`--analysis-load`. + - Load CTU distortion from the analysis file and normalize it across every frame during :option:`--analysis-load`. Default 0. .. option:: --scale-factor @@ -1056,27 +1056,13 @@ .. option:: --rdoq-level <0|1|2>, --no-rdoq-level - Specify the amount of rate-distortion analysis to use within - quantization:: + Specify the amount of rate-distortion analysis to use within quantization:: - At level 0 rate-distortion cost is not considered in quant - - At level 1 rate-distortion cost is used to find optimal rounding - values for each level (and allows psy-rdoq to be effective). It - trades-off the signaling cost of the coefficient vs its post-inverse - quant distortion from the pre-quant coefficient. When - :option:`--psy-rdoq` is enabled, this formula is biased in favor of - more energy in the residual (larger coefficient absolute levels) - - At level 2 rate-distortion cost is used to make decimate decisions - on each 4x4 coding group, including the cost of signaling the group - within the group bitmap. If the total distortion of not signaling - the entire coding group is less than the rate cost, the block is - decimated. Next, it applies rate-distortion cost analysis to the - last non-zero coefficient, which can result in many (or all) of the - coding groups being decimated. Psy-rdoq is less effective at - preserving energy when RDOQ is at level 2, since it only has - influence over the level distortion costs. + At level 0 rate-distortion cost is not considered in quant. + + At level 1 rate-distortion cost is used to find optimal rounding values for each level (and allows psy-rdoq to be effective). It trades-off the signaling cost of the coefficient vs its post-inverse quant distortion from the pre-quant coefficient. When :option:`--psy-rdoq` is enabled, this formula is biased in favor of more energy in the residual (larger coefficient absolute levels). + + At level 2 rate-distortion cost is used to make decimate decisions on each 4x4 coding group, including the cost of signaling the group within the group bitmap. If the total distortion of not signaling the entire coding group is less than the rate cost, the block is decimated. Next, it applies rate-distortion cost analysis to the last non-zero coefficient, which can result in many (or all) of the coding groups being decimated. Psy-rdoq is less effective at preserving energy when RDOQ is at level 2, since it only has influence over the level distortion costs. .. option:: --tu-intra-depth <1..4> @@ -1221,19 +1207,16 @@ .. option:: --me <integer|string> - Motion search method. Generally, the higher the number the harder - the ME method will try to find an optimal match. Diamond search is - the simplest. Hexagon search is a little better. 
Uneven - Multi-Hexagon is an adaption of the search method used by x264 for - slower presets. Star is a three-step search adapted from the HM - encoder: a star-pattern search followed by an optional radix scan - followed by an optional star-search refinement. Full is an - exhaustive search; an order of magnitude slower than all other - searches but not much better than umh or star. SEA is similar to - x264's ESA implementation and a speed optimization of full search. - It is a three-step motion search where the DC calculation is - followed by ADS calculation followed by SAD of the passed motion - vector candidates. + Motion search method. Generally, the higher the number the harder the ME method + will try to find an optimal match. Diamond search is the simplest. Hexagon search + is a little better. Uneven Multi-Hexagon is an adaption of the search method used + by x264 for slower presets. Star is a three-step search adapted from the HM encoder: a + star-pattern search followed by an optional radix scan followed by an optional + star-search refinement. Full is an exhaustive search; an order of magnitude slower + than all other searches but not much better than umh or star. SEA is similar to x264's + ESA implementation and a speed optimization of full search. It is a three-step motion + search where the DC calculation is followed by ADS calculation followed by SAD of the + passed motion vector candidates. 0. dia 1. hex **(default)** @@ -1331,7 +1314,14 @@ .. option:: --mcstf, --no-mcstf - Enable Motion Compensated Temporal filtering. + Motion-compensated spatio-temporal filtering (MCSTF) improves the compression + efficiency of videos that contain a high level of noise. It introduces a + temporal filter before encoding and this filter is applied only to the I- and P-frames. + It utilizes previously generated motion vectors across different video content + resolutions to find the best temporal correspondence for low-pass filtering. Here, + motion estimation is applied between the central picture and each future or past + picture, thereby generating multiple motion-compensated predictions, which are then + combined by using adaptive filtering to produce a final noise-reduced picture. Default: disabled Spatial/intra options @@ -1486,7 +1476,7 @@ whereas for the :option:`--scenecut`, inserts RADL at every scenecut. Recommended value is 2-3. Default 0 (disabled). - **Range of values: Between 0 and `--bframes` + **Range of values:** Between 0 and `--bframes` .. option:: --ctu-info <0, 1, 2, 4, 6> @@ -1550,9 +1540,7 @@ as *lslices* **Values:** 0 - disabled. 1 is the same as 0. Max 16. - Default: 8 for ultrafast, superfast, faster, fast, medium - 4 for slow, slower - disabled for veryslow, slower + Default: 8 for ultrafast, superfast, faster, fast, medium; 4 for slow, slower; disabled for veryslow, slower. .. option:: --lookahead-threads <integer> @@ -1602,14 +1590,17 @@ Values: 0 - flush the encoder only when all the input pictures are over. - 1 - flush all the frames even when the input is not over. - slicetype decision may change with this option. + 1 - flush all the frames even when the input is not over. Slicetype decision may change with this option. 2 - flush the slicetype decided frames only. .. option:: --fades, --no-fades Detect and handle fade-in regions. Default disabled. +.. option:: --cra-nal + + Force NAL type to CRA to all the frames expect for the first frame, works only with :option:`--keyint` is 1. 
+ Quality, rate control and rate distortion options ================================================= @@ -1744,9 +1735,7 @@ 0. disabled 1. AQ enabled 2. AQ enabled with auto-variance **(default)** - 3. AQ enabled with auto-variance and bias to dark scenes. This is - recommended for 8-bit encodes or low-bitrate 10-bit encodes, to - prevent color banding/blocking. + 3. AQ enabled with auto-variance and bias to dark scenes. This is recommended for 8-bit encodes or low-bitrate 10-bit encodes, to prevent color banding/blocking. 4. AQ enabled with auto-variance and edge information. .. option:: --aq-strength <float> @@ -1759,11 +1748,13 @@ Default 1.0. **Range of values:** 0.0 to 3.0 -.. option:: --sbrc --no-sbrc +.. option:: --sbrc, --no-sbrc + + To enable and disable segment-based rate control. SBRC controls the overflow with + segment sizes, and it is based on the Capped CRF mode. Segment duration depends on + the keyframe interval specified. If unspecified, the default keyframe interval will + be used. Default: disabled. **Experimental Feature** - To enable and disable segment based rate control.Segment duration depends on the - keyframe interval specified.If unspecified,default keyframe interval will be used. - Default: disabled. .. option:: --hevc-aq @@ -1849,7 +1840,7 @@ and also redundant steps are skipped. In pass 1 analysis information like motion vector, depth, reference and prediction modes of the final best CTU partition is stored for each CTU. - Multipass analysis refinement cannot be enabled when :option:`--analysis-save`/:option:`analysis-load` + Multipass analysis refinement cannot be enabled when :option:`--analysis-save`/:option:`--analysis-load` is enabled and both will be disabled when enabled together. This feature requires :option:`--pmode`/:option:`--pme` to be disabled and hence pmode/pme will be disabled when enabled at the same time. @@ -2014,26 +2005,29 @@ When :option:`--scenecut-aware-qp` is: * 1 (Forward masking): - --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta> - or - --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2, - fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4, - fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6> + + --masking-strength <fwdMaxWindow,fwdRefQPDelta,fwdNonRefQPDelta> + + or + + --masking-strength <fwdWindow1,fwdRefQPDelta1,fwdNonRefQPDelta1,fwdWindow2,fwdRefQPDelta2,fwdNonRefQPDelta2,fwdWindow3,fwdRefQPDelta3,fwdNonRefQPDelta3,fwdWindow4,fwdRefQPDelta4,fwdNonRefQPDelta4,fwdWindow5,fwdRefQPDelta5,fwdNonRefQPDelta5,fwdWindow6,fwdRefQPDelta6,fwdNonRefQPDelta6> + * 2 (Backward masking): - --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta> - or - --masking-strength <bwdWindow1,bwdRefQPDelta1,bwdNonRefQPDelta1,bwdWindow2,bwdRefQPDelta2,bwdNonRefQPDelta2, - bwdWindow3,bwdRefQPDelta3,bwdNonRefQPDelta3,bwdWindow4,bwdRefQPDelta4,bwdNonRefQPDelta4, - bwdWindow5,bwdRefQPDelta5,bwdNonRefQPDelta5,bwdWindow6,bwdRefQPDelta6,bwdNonRefQPDelta6> + + --masking-strength <bwdMaxWindow,bwdRefQPDelta,bwdNonRefQPDelta>
x265_3.6.tar.gz/doc/reST/conf.py -> x265_4.0.tar.gz/doc/reST/conf.py
Changed
@@ -14,7 +14,7 @@
 copyright = u'2014 MulticoreWare Inc'
 
 # -- Options for HTML output ---------------------------------------------------
-html_theme = "default"
+html_theme = "sphinx_rtd_theme"
 
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
x265_3.6.tar.gz/doc/reST/presets.rst -> x265_4.0.tar.gz/doc/reST/presets.rst
Changed
@@ -21,16 +21,17 @@ The presets adjust encoder parameters as shown in the following table. Any parameters below that are specified in your command-line will be changed from the value specified by the preset. - 0. ultrafast - 1. superfast - 2. veryfast - 3. faster - 4. fast - 5. medium **(default)** - 6. slow - 7. slower - 8. veryslow - 9. placebo + + 0. ultrafast + 1. superfast + 2. veryfast + 3. faster + 4. fast + 5. medium **(default)** + 6. slow + 7. slower + 8. veryslow + 9. placebo +-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+ | preset | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | @@ -152,7 +153,7 @@ * :option:`--sao` 0 * :option:`--psy-rd` 4.0 * :option:`--psy-rdoq` 10.0 - * :option:`--recursion-skip` 0 + * :option:`--rskip` 0 It also enables a specialised ratecontrol algorithm :option:`--rc-grain` that strictly minimises QP fluctuations across frames, while still allowing
View file
x265_3.6.tar.gz/doc/reST/releasenotes.rst -> x265_4.0.tar.gz/doc/reST/releasenotes.rst
Changed
@@ -2,6 +2,44 @@ Release Notes ************* +Version 4.0 +=========== + +Release date - 13th September, 2024. + +New feature +----------- +1. Alpha Channel feature. +2. Screen Content Coding (SCC). +3. MV-HEVC feature. + +Enhancements to existing features +--------------------------------- +1. Added support for the VMAF v3.x. + +API changes +----------- +1. Add command line parameter for Alpha Channel feature :option:`--alpha`. +2. Add command line parameter for SCC feature :option:`--scc 1`. +3. Add command line parameters for the MV-HEVC feature :option:`--multiview-config "multiview_config.txt"`. + +Optimizations +--------------------- +1. Arm SIMD optimizations: Several time-consuming scalar C functions now have SIMD implementations on Arm platforms. Existing Arm SIMD implementations have also been optimized. These optimizations result in up to 57% faster encoding compared to release 3.6. +2. Arm SIMD optimizations include use of Armv8.4 DotProd, Armv8.6 I8MM, and Armv9 SVE2 instruction set extensions. The following algorithms now have optimized SIMD implementations: SAD, SSE, DCT, SAO, convolution, quantization, intra_planar, intraFilter, intrapred DC and IDCT16x16. + +Bug fixes +--------- +1. Fix for y4m pipe input broken. +2. Fix SCC crash on multipass encode. +3. Fix mcstf when :option:`--bframes` value was less than 5. +4. Fix lowpass DCT for high bit depth. +5. Added build support for Visual Studio 17. +6. Fix issue in default code flow and memory leak. +7. Framethreads tuning for Windows ARM devices. +8. Fix scc crash on multipass encode. + + Version 3.6 =========== @@ -9,44 +47,44 @@ New feature ----------- -1. Segment based Ratecontrol (SBRC) feature -2. Motion-Compensated Spatio-Temporal Filtering -3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization) -4. Histogram-Based Scene Change Detection -5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis(FGS) -6. Add temporal layer implementation(Hierarchical B-frame implementation) - +1. Segment based Ratecontrol (SBRC) feature. +2. Motion-Compensated Spatio-Temporal Filtering. +3. Scene-cut aware qp - BBAQ (Bidirectional Boundary Aware Quantization). +4. Histogram-Based Scene Change Detection. +5. Film-Grain characteristics as a SEI message to support Film Grain Synthesis (FGS). +6. Add temporal layer implementation (Hierarchical B-frame implementation). + Enhancements to existing features --------------------------------- -1. Added Dolby Vision 8.4 Profile Support +1. Added Dolby Vision 8.4 Profile Support. API changes ----------- -1. Add Segment based Ratecontrol(SBRC) feature: "--no-sbrc". -2. Add command line parameter for mcstf feature: "--no-mctf". -3. Add command line parameters for the scene cut aware qp feature: "--scenecut-aware-qp" and "--masking-strength". -4. Add command line parameters for Histogram-Based Scene Change Detection: "--hist-scenecut". -5. Add film grain characteristics as a SEI message to the bitstream: "--film-grain <filename>" -6. cli: add new option --cra-nal (Force nal type to CRA to all frames expect for the first frame, works only with keyint 1) +1. Add command line parameter for SBRC feature :option:`--sbrc`. +2. Add command line parameter for mcstf feature :option:`--mcstf`. +3. Add command line parameters for the scene cut aware qp feature :option:`--scenecut-aware-qp` and :option:`--masking-strength`. +4. Add command line parameters for Histogram-Based Scene Change Detection :option:`--hist-scenecut`. +5. 
Add command line parameters for film grain characteristics as a SEI message to the bitstream :option:`--film-grain`. +6. cli: add new option :option:`--cra-nal` (Force NAL type to CRA to all the frames expect for the first frame, works only with :option:`--keyint` is 1). Optimizations --------------------- -ARM64 NEON optimizations:- Several time-consuming C functions have been optimized for the targeted platform - aarch64. The overall performance increased by around 20%. -SVE/SVE2 optimizations +1. ARM64 NEON optimizations:- Several time-consuming C functions have been optimized for the targeted platform - aarch64. The overall performance increased by around 20%. +2. SVE/SVE2 optimizations. Bug fixes --------- -1. Linux bug to utilize all the cores -2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize -3. 32bit and 64bit builds generation for ARM -4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc..) -5. Add x86 ASM implementation for subsampling luma -6. Fix for abrladder segfault with load reuse level 1 -7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frame -8. Add MacOS aarch64 build support -9. Fix boundary condition issue for Gaussian filter +1. Linux bug to utilize all the cores. +2. Crash with hist-scenecut build when source resolution is not multiple of minCuSize. +3. 32bit and 64bit builds generation for ARM. +4. bugs in zonefile feature (Reflect Zonefile Parameters inside Lookahead, extra IDR issue, Avg I Slice QP value issue etc.). +5. Add x86 ASM implementation for subsampling luma. +6. Fix for abrladder segfault with load reuse level 1. +7. Reorder miniGOP based on temporal layer hierarchy and add support for more B frames. +8. Add MacOS aarch64 build support. +9. Fix boundary condition issue for Gaussian filter. Version 3.5
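Note: several of the features listed in these release notes depend on the 4.0 API (the X265_BUILD bump to 212 appears later in this changeset), so it can be worth confirming which library an application is actually linked against before using the new options. A minimal sketch using only long-standing API symbols:

    // Sketch: report the compile-time API build number and the runtime
    // library version string.
    #include <x265.h>
    #include <cstdio>

    int main()
    {
        std::printf("compiled against X265_BUILD %d\n", X265_BUILD);
        std::printf("runtime library: %s\n", x265_version_str);
        return 0;
    }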
View file
x265_3.6.tar.gz/doc/reST/svthevc.rst -> x265_4.0.tar.gz/doc/reST/svthevc.rst
Changed
@@ -3,7 +3,7 @@ .. _SvtHevc: -x265 has support for open source HEVC encoder `SVT-HEVC <https://01.org/svt>`_ +x265 has support for open source HEVC encoder `SVT-HEVC <https://www.intel.com/content/www/us/en/developer/articles/technical/scalable-video-technology.html>`_ and can generate SVT-HEVC compliant bitstreams. SVT-HEVC encoder can be enabled at run time using :option:`--svt`. Since SVT-HEVC params/CLI are not exposed outside, it has to be configured only via x265 CLI options. The API's of SVT-HEVC are accessed through x265's API @@ -22,7 +22,7 @@ **SVT-HEVC** -1. Clone `SVT-HEVC <https://github.com/intel/SVT-HEVC>`_ (say at path "/home/app/") and build it (follow the build steps in its README file) +1. Clone `SVT-HEVC-repo <https://github.com/intel/SVT-HEVC>`_ (say at path "/home/app/") and build it (follow the build steps in its README file) 2. Once build is successful, binaries can be found inside the *Bin* folder at its root directory ("/home/app/SVT-HEVC/Bin/Release/") **x265**
View file
x265_3.6.tar.gz/doc/reST/x265.rst -> x265_4.0.tar.gz/doc/reST/x265.rst
Changed
@@ -1,3 +1,5 @@ +:orphan: + x265 CLI Documentation ######################
View file
x265_4.0.tar.gz/doc/requirements.txt
Added
@@ -0,0 +1,3 @@ +sphinx +sphinx-rtd-theme +# Add other dependencies here
View file
x265_3.6.tar.gz/source/CMakeLists.txt -> x265_4.0.tar.gz/source/CMakeLists.txt
Changed
@@ -22,6 +22,8 @@ include(CheckFunctionExists) include(CheckSymbolExists) include(CheckCXXCompilerFlag) +include(CheckCSourceCompiles) +include(CheckCXXSourceCompiles) option(FPROFILE_GENERATE "Compile executable to generate usage data" OFF) option(FPROFILE_USE "Compile executable using generated usage data" OFF) @@ -29,7 +31,7 @@ option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 209) +set(X265_BUILD 212) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" @@ -80,14 +82,16 @@ set(ARM 1) add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1) elseif(ARM64MATCH GREATER "-1") - #if(CROSS_COMPILE_ARM64) - #message(STATUS "Cross compiling for ARM64 arch") - #else() - #set(CROSS_COMPILE_ARM64 0) - #endif() message(STATUS "Detected ARM64 target processor") set(ARM64 1) - add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON) + + option(AARCH64_WARNINGS_AS_ERRORS "Build with -Werror for AArch64 Intrinsics files" OFF) + + # Options for cross compiling AArch64 optional extensions + option(CROSS_COMPILE_SVE "Cross Compile for SVE Target" OFF) + option(CROSS_COMPILE_SVE2 "Cross Compile for SVE2 Target" OFF) + option(CROSS_COMPILE_NEON_DOTPROD "Cross Compile for Neon DotProd Target" OFF) + option(CROSS_COMPILE_NEON_I8MM "Cross Compile for Neon I8MM Target" OFF) else() message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown") message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}") @@ -259,28 +263,106 @@ set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm) endif() endif() - if(ARM64 OR CROSS_COMPILE_ARM64) - find_package(Neon) - find_package(SVE) - find_package(SVE2) - if(CPU_HAS_SVE2 OR CROSS_COMPILE_SVE2) - message(STATUS "Found SVE2") - set(ARM_ARGS -O3 -march=armv8-a+sve2 -fPIC -flax-vector-conversions) - add_definitions(-DHAVE_SVE2) - add_definitions(-DHAVE_SVE) - add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE2 - elseif(CPU_HAS_SVE OR CROSS_COMPILE_SVE) - message(STATUS "Found SVE") - set(ARM_ARGS -O3 -march=armv8-a+sve -fPIC -flax-vector-conversions) - add_definitions(-DHAVE_SVE) - add_definitions(-DHAVE_NEON) # for NEON c/c++ primitives, as currently there is no implementation that use SVE - elseif(CPU_HAS_NEON) - message(STATUS "Found NEON") - set(ARM_ARGS -fPIC -flax-vector-conversions) - add_definitions(-DHAVE_NEON) + if(ARM64) + message(STATUS "Found Neon") + set(CPU_HAS_NEON 1) + add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON=1) + + if(CROSS_COMPILE_ARM64) + # Handle cross-compilation options. + if(CROSS_COMPILE_NEON_DOTPROD) + set(CPU_HAS_NEON_DOTPROD 1) + endif() + if(CROSS_COMPILE_NEON_I8MM) + set(CPU_HAS_NEON_I8MM 1) + # Impose the constraint that Neon I8MM implies Neon DotProd. + set(CPU_HAS_NEON_DOTPROD 1) + endif() + if(CROSS_COMPILE_SVE) + set(CPU_HAS_SVE 1) + # Impose the constraint that SVE implies Neon DotProd and I8MM. + set(CPU_HAS_NEON_DOTPROD 1) + set(CPU_HAS_NEON_I8MM 1) + endif() + if(CROSS_COMPILE_SVE2) + set(CPU_HAS_SVE2 1) + # SVE2 implies SVE and Neon DotProd. + set(CPU_HAS_SVE 1) + set(CPU_HAS_NEON_DOTPROD 1) + # Impose the constraint that SVE2 implies Neon I8MM. 
+ set(CPU_HAS_NEON_I8MM 1) + endif() else() - set(ARM_ARGS -fPIC -flax-vector-conversions) - endif() + if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") + find_package(NEON_DOTPROD) + find_package(NEON_I8MM) + find_package(SVE) + find_package(SVE2) + else() + message(STATUS "Compile time feature detection unsupported on this platform") + endif() + endif() + + if(CPU_HAS_NEON_DOTPROD) + # Neon DotProd is mandatory from Armv8.4. + message(STATUS "Found Neon DotProd") + set(ARM_ARGS -O3 -march=armv8.2-a+dotprod) + add_definitions(-DHAVE_NEON_DOTPROD=1) + endif() + if(CPU_HAS_NEON_I8MM) + # Neon I8MM is mandatory from Armv8.6. + message(STATUS "Found Neon I8MM") + # Impose the constraint that Neon I8MM implies Neon DotProd. + if(NOT CPU_HAS_NEON_DOTPROD) + message(FATAL_ERROR "Unsupported AArch64 feature combination (Neon I8MM without Neon DotProd)") + endif() + set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm) + add_definitions(-DHAVE_NEON_I8MM=1) + endif() + if(CPU_HAS_SVE) + message(STATUS "Found SVE") + # Impose the constraint that SVE implies Neon I8MM. + if(NOT CPU_HAS_NEON_I8MM) + message(FATAL_ERROR "Unsupported AArch64 feature combination (SVE without Neon I8MM)") + endif() + set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve) + add_definitions(-DHAVE_SVE=1) + endif() + if(CPU_HAS_SVE2) + message(STATUS "Found SVE2") + # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod + set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2) + add_definitions(-DHAVE_SVE2=1) + endif() + set(ARM_ARGS ${ARM_ARGS} -fPIC) + # Do not allow implicit vector type conversions in Clang builds (this + # is already the default in GCC builds). + check_cxx_compiler_flag(-flax-vector-conversions=none CC_HAS_FLAX_VEC_CONV_NONE) + if(CC_HAS_FLAX_VEC_CONV_NONE) + set(ARM_ARGS ${ARM_ARGS} -flax-vector-conversions=none) + endif() + if(CPU_HAS_SVE) + set(SVE_HEADER_TEST " +#ifndef __ARM_NEON_SVE_BRIDGE +#error 1 +#endif +#include <arm_sve.h> +#include <arm_neon_sve_bridge.h> +int main() { return 0; }") + set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) + # CMAKE_REQUIRED_FLAGS requires a space-delimited string, whereas + # ARM_ARGS is defined and used elsewhere as a ;-list. + foreach(ARM_ARG ${ARM_ARGS}) + string(APPEND CMAKE_REQUIRED_FLAGS " ${ARM_ARG}") + endforeach() + check_c_source_compiles("${SVE_HEADER_TEST}" SVE_HEADER_C_TEST_COMPILED) + check_cxx_source_compiles("${SVE_HEADER_TEST}" SVE_HEADER_CXX_TEST_COMPILED) + set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) + if(SVE_HEADER_C_TEST_COMPILED AND SVE_HEADER_CXX_TEST_COMPILED) + add_definitions(-DHAVE_SVE_BRIDGE=1) + set(HAVE_SVE_BRIDGE 1) + endif() + endif() endif() if(ENABLE_PIC) list(APPEND ARM_ARGS -DPIC) @@ -334,9 +416,11 @@ if (CC_HAS_FAST_MATH) add_definitions(-ffast-math) endif() - check_cxx_compiler_flag(-mstackrealign CC_HAS_STACK_REALIGN) - if (CC_HAS_STACK_REALIGN) - add_definitions(-mstackrealign) + if (NOT (ARM64 OR CROSS_COMPILE_ARM64)) + check_cxx_compiler_flag(-mstackrealign CC_HAS_STACK_REALIGN) + if (CC_HAS_STACK_REALIGN) + add_definitions(-mstackrealign) + endif() endif() # Disable exceptions. Reduce executable size, increase compability. 
check_cxx_compiler_flag(-fno-exceptions CC_HAS_FNO_EXCEPTIONS_FLAG) @@ -558,6 +642,21 @@ add_definitions(-DDETAILED_CU_STATS) endif(DETAILED_CU_STATS) +option(ENABLE_ALPHA "Enable alpha encoding in x265" OFF) +if(ENABLE_ALPHA) + add_definitions(-DENABLE_ALPHA) +endif() + +option(ENABLE_MULTIVIEW "Enable Multi-view encoding in HEVC" OFF) +if(ENABLE_MULTIVIEW) + add_definitions(-DENABLE_MULTIVIEW) +endif() + +option(ENABLE_SCC_EXT "Enable screen content coding extension in HEVC" OFF) +if(ENABLE_SCC_EXT)
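Note: the HAVE_NEON_DOTPROD / -march=armv8.2-a+dotprod plumbing added above exists so that kernels can use the UDOT/SDOT instructions. The sketch below is illustrative only (the function name and shape are hypothetical, not taken from x265) and shows why SAD-style sums map well onto the DotProd extension:

    #include <stdint.h>
    #include <stddef.h>
    #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
    #include <arm_neon.h>

    // Sum of absolute differences over a 16-wide block: UDOT against a vector
    // of ones widens and accumulates four bytes per 32-bit lane in one step.
    uint32_t sad_16xh(const uint8_t *a, intptr_t astride,
                      const uint8_t *b, intptr_t bstride, int h)
    {
        uint32x4_t acc = vdupq_n_u32(0);
        const uint8x16_t ones = vdupq_n_u8(1);
        for (int y = 0; y < h; y++)
        {
            uint8x16_t pa = vld1q_u8(a + y * astride);
            uint8x16_t pb = vld1q_u8(b + y * bstride);
            acc = vdotq_u32(acc, vabdq_u8(pa, pb), ones);
        }
        return vaddvq_u32(acc);
    }
    #else
    // Portable fallback so the file still builds without the extension.
    uint32_t sad_16xh(const uint8_t *a, intptr_t astride,
                      const uint8_t *b, intptr_t bstride, int h)
    {
        uint32_t sum = 0;
        for (int y = 0; y < h; y++)
            for (int x = 0; x < 16; x++)
            {
                int d = a[y * astride + x] - b[y * bstride + x];
                sum += (uint32_t)(d < 0 ? -d : d);
            }
        return sum;
    }
    #endif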
View file
x265_3.6.tar.gz/source/abrEncApp.cpp -> x265_4.0.tar.gz/source/abrEncApp.cpp
Changed
@@ -63,6 +63,7 @@ m_passEnci->init(ret); } + m_numInputViews = m_passEnc0->m_param->numViews; if (!allocBuffers()) { x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n"); @@ -76,7 +77,11 @@ bool AbrEncoder::allocBuffers() { +#if ENABLE_MULTIVIEW + m_inputPicBuffer = X265_MALLOC(x265_picture**, MAX_VIEWS); +#else m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes); +#endif m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes); m_picWriteCnt = new ThreadSafeIntegerm_numEncodes; @@ -89,21 +94,48 @@ m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes); m_readFlag = X265_MALLOC(int*, m_numEncodes); - for (uint8_t pass = 0; pass < m_numEncodes; pass++) +#if ENABLE_MULTIVIEW + if (m_passEnc0->m_param->numViews > 1) { - m_inputPicBufferpass = X265_MALLOC(x265_picture*, m_queueSize); - for (uint32_t idx = 0; idx < m_queueSize; idx++) + for (uint8_t pass = 0; pass < m_numInputViews; pass++) { - m_inputPicBufferpassidx = x265_picture_alloc(); - x265_picture_init(m_passEncpass->m_param, m_inputPicBufferpassidx); + m_inputPicBufferpass = X265_MALLOC(x265_picture*, m_queueSize); + for (uint32_t idx = 0; idx < m_queueSize; idx++) + { + m_inputPicBufferpassidx = x265_picture_alloc(); + x265_picture_init(m_passEnc0->m_param, m_inputPicBufferpassidx); + } + if (pass == 0) + { + CHECKED_MALLOC_ZERO(m_analysisBufferpass, x265_analysis_data, m_queueSize); + m_picIdxReadCntpass = new ThreadSafeIntegerm_queueSize; + m_analysisWritepass = new ThreadSafeIntegerm_queueSize; + m_analysisReadpass = new ThreadSafeIntegerm_queueSize; + m_readFlagpass = X265_MALLOC(int, m_queueSize); + } } + } + else + { +#endif + for (uint8_t pass = 0; pass < m_numEncodes; pass++) + { + m_inputPicBufferpass = X265_MALLOC(x265_picture*, m_queueSize); + for (uint32_t idx = 0; idx < m_queueSize; idx++) + { + m_inputPicBufferpassidx = x265_picture_alloc(); + x265_picture_init(m_passEncpass->m_param, m_inputPicBufferpassidx); + } - CHECKED_MALLOC_ZERO(m_analysisBufferpass, x265_analysis_data, m_queueSize); - m_picIdxReadCntpass = new ThreadSafeIntegerm_queueSize; - m_analysisWritepass = new ThreadSafeIntegerm_queueSize; - m_analysisReadpass = new ThreadSafeIntegerm_queueSize; - m_readFlagpass = X265_MALLOC(int, m_queueSize); + CHECKED_MALLOC_ZERO(m_analysisBufferpass, x265_analysis_data, m_queueSize); + m_picIdxReadCntpass = new ThreadSafeIntegerm_queueSize; + m_analysisWritepass = new ThreadSafeIntegerm_queueSize; + m_analysisReadpass = new ThreadSafeIntegerm_queueSize; + m_readFlagpass = X265_MALLOC(int, m_queueSize); + } +#if ENABLE_MULTIVIEW } +#endif return true; fail: return false; @@ -112,15 +144,37 @@ void AbrEncoder::destroy() { x265_cleanup(); /* Free library singletons */ - for (uint8_t pass = 0; pass < m_numEncodes; pass++) +#if ENABLE_MULTIVIEW + for (uint8_t pass = 0; pass < m_numInputViews; pass++) { for (uint32_t index = 0; index < m_queueSize; index++) { X265_FREE(m_inputPicBufferpassindex->planes0); x265_picture_free(m_inputPicBufferpassindex); } + X265_FREE(m_inputPicBufferpass); + if (pass == 0) + { + X265_FREE(m_analysisBufferpass); + X265_FREE(m_readFlagpass); + delete m_picIdxReadCntpass; + delete m_analysisWritepass; + delete m_analysisReadpass; + m_passEncpass->destroy(); + delete m_passEncpass; + } + } +#else + for (uint8_t pass = 0; pass < m_numEncodes; pass++) + { + for (uint32_t index = 0; index < m_queueSize; index++) + { + X265_FREE(m_inputPicBufferpassindex->planes0); + x265_picture_free(m_inputPicBufferpassindex); + } X265_FREE(m_inputPicBufferpass); + 
X265_FREE(m_analysisBufferpass); X265_FREE(m_readFlagpass); delete m_picIdxReadCntpass; @@ -129,6 +183,7 @@ m_passEncpass->destroy(); delete m_passEncpass; } +#endif X265_FREE(m_inputPicBuffer); X265_FREE(m_analysisBuffer); X265_FREE(m_readFlag); @@ -150,8 +205,11 @@ m_id = id; m_cliopt = cliopt; m_parent = parent; - if(!(m_cliopt.enableScaler && m_id)) - m_input = m_cliopt.input; + if (!(m_cliopt.enableScaler && m_id)) + { + for (int view = 0; view < m_cliopt.param->numViews; view++) + m_inputview = m_cliopt.inputview; + } m_param = cliopt.param; m_inputOver = false; m_lastIdx = -1; @@ -206,6 +264,7 @@ { x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n"); m_ret = 2; + m_reader = NULL; return -1; } @@ -402,7 +461,7 @@ } - bool PassEncoder::readPicture(x265_picture *dstPic) + bool PassEncoder::readPicture(x265_picture* dstPic, int view) { /*Check and wait if there any input frames to read*/ int ipread = m_parent->m_picReadCntm_id.get(); @@ -480,7 +539,7 @@ } - x265_picture *srcPic = (x265_picture*)(m_parent->m_inputPicBufferm_idreadPos); + x265_picture* srcPic = (m_param->numViews > 1) ? (x265_picture*)(m_parent->m_inputPicBufferviewreadPos) : (x265_picture*)(m_parent->m_inputPicBufferm_idreadPos); x265_picture *pic = (x265_picture*)(dstPic); pic->colorSpace = srcPic->colorSpace; @@ -499,6 +558,8 @@ pic->planes0 = srcPic->planes0; pic->planes1 = srcPic->planes1; pic->planes2 = srcPic->planes2; + pic->planes3 = srcPic->planes3; + pic->format = srcPic->format; if (isAbrLoad) pic->analysisData = *analysisData; return true; @@ -529,11 +590,17 @@ x265_log(m_param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s in %s\n", strerror(errno), profileName); - x265_picture pic_orig, pic_out; - x265_picture *pic_in = &pic_orig; + x265_picture pic_origMAX_VIEWS; + x265_picture *pic_inMAX_VIEWS; + for (int view = 0; view < m_param->numViews; view++) + pic_inview = &pic_origview; /* Allocate recon picture if analysis save/load is enabled */ std::priority_queue<int64_t>* pts_queue = m_cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL; - x265_picture *pic_recon = (m_cliopt.recon || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out : NULL; + x265_picture* pic_reconMAX_LAYERS; + x265_picture pic_outMAX_LAYERS; + + for (int i = 0; i < m_param->numLayers; i++) + pic_reconi = (m_cliopt.reconi || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_outi : NULL; uint32_t inFrameCount = 0; uint32_t outFrameCount = 0; x265_nal *p_nal; @@ -544,7 +611,7 @@ uint8_t *rpuPayload = NULL; int inputPicNum = 1; x265_picture picField1, picField2;
View file
x265_3.6.tar.gz/source/abrEncApp.h -> x265_4.0.tar.gz/source/abrEncApp.h
Changed
@@ -42,6 +42,7 @@ { public: uint8_t m_numEncodes; + uint8_t m_numInputViews; // Number of inputs for multiview-extension PassEncoder **m_passEnc; uint32_t m_queueSize; ThreadSafeInteger m_numActiveEncodes; @@ -86,7 +87,7 @@ x265_picture **m_outputRecon; CLIOptions m_cliopt; - InputFile* m_input; + InputFile* m_inputMAX_VIEWS; const char* m_reconPlayCmd; FILE* m_qpfile; FILE* m_zoneFile; @@ -102,7 +103,7 @@ void startThreads(); void copyInfo(x265_analysis_data *src); - bool readPicture(x265_picture*); + bool readPicture(x265_picture*, int view); void destroy(); private: @@ -142,7 +143,7 @@ public: PassEncoder *m_parentEnc; int m_id; - InputFile* m_input; + InputFile* m_inputMAX_VIEWS; int m_threadActive; Reader(int id, PassEncoder *parentEnc);
View file
x265_4.0.tar.gz/source/cmake/FindNEON_DOTPROD.cmake
Added
@@ -0,0 +1,21 @@ +include(FindPackageHandleStandardArgs) + +# Check if Armv8.4 Neon DotProd is supported by the Arm CPU +if(APPLE) + execute_process(COMMAND sysctl -a + COMMAND grep "hw.optional.arm.FEAT_DotProd: 1" + OUTPUT_VARIABLE has_dot_product + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +else() + execute_process(COMMAND cat /proc/cpuinfo + COMMAND grep Features + COMMAND grep asimddp + OUTPUT_VARIABLE has_dot_product + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + +if(has_dot_product) + set(CPU_HAS_NEON_DOTPROD 1) +endif()
View file
x265_4.0.tar.gz/source/cmake/FindNEON_I8MM.cmake
Added
@@ -0,0 +1,21 @@ +include(FindPackageHandleStandardArgs) + +# Check if Armv8.6 Neon I8MM is supported by the Arm CPU +if(APPLE) + execute_process(COMMAND sysctl -a + COMMAND grep "hw.optional.arm.FEAT_I8MM: 1" + OUTPUT_VARIABLE has_i8mm + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +else() + execute_process(COMMAND cat /proc/cpuinfo + COMMAND grep Features + COMMAND grep i8mm + OUTPUT_VARIABLE has_i8mm + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + +if(has_i8mm) + set(CPU_HAS_NEON_I8MM 1) +endif()
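Note: these two Find modules probe the build host at configure time (sysctl on macOS, /proc/cpuinfo elsewhere). For reference, the equivalent run-time check on AArch64 Linux reads the hwcap bits; a small sketch, not part of the patch:

    #include <stdio.h>
    #if defined(__aarch64__) && defined(__linux__)
    #include <sys/auxv.h>
    #include <asm/hwcap.h>

    int main(void)
    {
        unsigned long hw  = getauxval(AT_HWCAP);
        unsigned long hw2 = getauxval(AT_HWCAP2);
        printf("DotProd: %d\n", (hw & HWCAP_ASIMDDP) != 0);
    #if defined(HWCAP2_I8MM)
        printf("I8MM:    %d\n", (hw2 & HWCAP2_I8MM) != 0);
    #else
        (void)hw2;   // older kernel headers may not define HWCAP2_I8MM
    #endif
        return 0;
    }
    #else
    int main(void) { printf("not an AArch64 Linux build\n"); return 0; }
    #endif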
View file
x265_3.6.tar.gz/source/common/CMakeLists.txt -> x265_4.0.tar.gz/source/common/CMakeLists.txt
Changed
@@ -103,22 +103,57 @@ add_definitions(-DAUTO_VECTORIZE=1) endif() - set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h) + set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h) + set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp) + set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp) + set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp) + set(C_SRCS_SVE2 sao-prim-sve2.cpp) enable_language(ASM) # add ARM assembly/intrinsic files here - set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S sad-a-common.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S) - set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S) - set(A_SRCS_SVE2 mc-a-sve2.S sad-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S) + set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S dct.S) + set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S) + set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ssd-a-sve2.S) + set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S) set(VEC_PRIMITIVES) set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set") set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set") - foreach(SRC ${C_SRCS}) + set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension") + foreach(SRC ${C_SRCS_NEON}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) endforeach() + + if(CPU_HAS_NEON_I8MM) + foreach(SRC ${C_SRCS_NEON_I8MM}) + set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + endforeach() + endif() + + if(CPU_HAS_NEON_DOTPROD) + foreach(SRC ${C_SRCS_NEON_DOTPROD}) + set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + endforeach() + endif() + + if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE) + foreach(SRC ${C_SRCS_SVE}) + set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + endforeach() + endif() + + if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE) + foreach(SRC ${C_SRCS_SVE2}) + set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + endforeach() + endif() + source_group(Assembly FILES ${ASM_PRIMITIVES}) + + if(AARCH64_WARNINGS_AS_ERRORS) + set_source_files_properties(${ASM_PRIMITIVES} PROPERTIES COMPILE_FLAGS -Werror) + endif() endif(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) if(POWER)
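Note: the new filter-neon-i8mm.cpp unit compiled above relies on the Armv8.6 USDOT family, which multiplies unsigned pixels by signed coefficients directly. A sketch of the basic idiom follows; the function name and weights are illustrative, not x265 code:

    #include <stdint.h>
    #include <stdio.h>
    #if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
    #include <arm_neon.h>

    // Weighted sum of 16 unsigned pixels with signed 8-bit weights: each
    // 32-bit lane of the USDOT accumulator collects four products.
    static int32_t weighted_sum_16(const uint8_t *pix, const int8_t *w)
    {
        int32x4_t acc = vusdotq_s32(vdupq_n_s32(0), vld1q_u8(pix), vld1q_s8(w));
        return vaddvq_s32(acc);
    }
    #else
    static int32_t weighted_sum_16(const uint8_t *pix, const int8_t *w)
    {
        int32_t s = 0;
        for (int i = 0; i < 16; i++)
            s += pix[i] * w[i];
        return s;
    }
    #endif

    int main(void)
    {
        uint8_t pix[16];
        int8_t w[16];
        for (int i = 0; i < 16; i++) { pix[i] = (uint8_t)(i * 3); w[i] = (int8_t)(i - 8); }
        printf("%d\n", weighted_sum_16(pix, w));
        return 0;
    }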
View file
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.cpp -> x265_4.0.tar.gz/source/common/aarch64/arm64-utils.cpp
Changed
@@ -3,7 +3,6 @@ #include "arm64-utils.h" #include <arm_neon.h> -#define COPY_16(d,s) *(uint8x16_t *)(d) = *(uint8x16_t *)(s) namespace X265_NS { @@ -11,53 +10,58 @@ void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride) { - uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7; - uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; - - a0 = *(uint8x8_t *)(src + 0 * sstride); - a1 = *(uint8x8_t *)(src + 1 * sstride); - a2 = *(uint8x8_t *)(src + 2 * sstride); - a3 = *(uint8x8_t *)(src + 3 * sstride); - a4 = *(uint8x8_t *)(src + 4 * sstride); - a5 = *(uint8x8_t *)(src + 5 * sstride); - a6 = *(uint8x8_t *)(src + 6 * sstride); - a7 = *(uint8x8_t *)(src + 7 * sstride); - - b0 = vtrn1_u32(a0, a4); - b1 = vtrn1_u32(a1, a5); - b2 = vtrn1_u32(a2, a6); - b3 = vtrn1_u32(a3, a7); - b4 = vtrn2_u32(a0, a4); - b5 = vtrn2_u32(a1, a5); - b6 = vtrn2_u32(a2, a6); - b7 = vtrn2_u32(a3, a7); - - a0 = vtrn1_u16(b0, b2); - a1 = vtrn1_u16(b1, b3); - a2 = vtrn2_u16(b0, b2); - a3 = vtrn2_u16(b1, b3); - a4 = vtrn1_u16(b4, b6); - a5 = vtrn1_u16(b5, b7); - a6 = vtrn2_u16(b4, b6); - a7 = vtrn2_u16(b5, b7); - - b0 = vtrn1_u8(a0, a1); - b1 = vtrn2_u8(a0, a1); - b2 = vtrn1_u8(a2, a3); - b3 = vtrn2_u8(a2, a3); - b4 = vtrn1_u8(a4, a5); - b5 = vtrn2_u8(a4, a5); - b6 = vtrn1_u8(a6, a7); - b7 = vtrn2_u8(a6, a7); - - *(uint8x8_t *)(dst + 0 * dstride) = b0; - *(uint8x8_t *)(dst + 1 * dstride) = b1; - *(uint8x8_t *)(dst + 2 * dstride) = b2; - *(uint8x8_t *)(dst + 3 * dstride) = b3; - *(uint8x8_t *)(dst + 4 * dstride) = b4; - *(uint8x8_t *)(dst + 5 * dstride) = b5; - *(uint8x8_t *)(dst + 6 * dstride) = b6; - *(uint8x8_t *)(dst + 7 * dstride) = b7; + uint8x8_t a0 = vld1_u8(src + 0 * sstride); + uint8x8_t a1 = vld1_u8(src + 1 * sstride); + uint8x8_t a2 = vld1_u8(src + 2 * sstride); + uint8x8_t a3 = vld1_u8(src + 3 * sstride); + uint8x8_t a4 = vld1_u8(src + 4 * sstride); + uint8x8_t a5 = vld1_u8(src + 5 * sstride); + uint8x8_t a6 = vld1_u8(src + 6 * sstride); + uint8x8_t a7 = vld1_u8(src + 7 * sstride); + + uint32x2_t b0 = vtrn1_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4)); + uint32x2_t b1 = vtrn1_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5)); + uint32x2_t b2 = vtrn1_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6)); + uint32x2_t b3 = vtrn1_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7)); + uint32x2_t b4 = vtrn2_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4)); + uint32x2_t b5 = vtrn2_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5)); + uint32x2_t b6 = vtrn2_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6)); + uint32x2_t b7 = vtrn2_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7)); + + uint16x4_t c0 = vtrn1_u16(vreinterpret_u16_u32(b0), + vreinterpret_u16_u32(b2)); + uint16x4_t c1 = vtrn1_u16(vreinterpret_u16_u32(b1), + vreinterpret_u16_u32(b3)); + uint16x4_t c2 = vtrn2_u16(vreinterpret_u16_u32(b0), + vreinterpret_u16_u32(b2)); + uint16x4_t c3 = vtrn2_u16(vreinterpret_u16_u32(b1), + vreinterpret_u16_u32(b3)); + uint16x4_t c4 = vtrn1_u16(vreinterpret_u16_u32(b4), + vreinterpret_u16_u32(b6)); + uint16x4_t c5 = vtrn1_u16(vreinterpret_u16_u32(b5), + vreinterpret_u16_u32(b7)); + uint16x4_t c6 = vtrn2_u16(vreinterpret_u16_u32(b4), + vreinterpret_u16_u32(b6)); + uint16x4_t c7 = vtrn2_u16(vreinterpret_u16_u32(b5), + vreinterpret_u16_u32(b7)); + + uint8x8_t d0 = vtrn1_u8(vreinterpret_u8_u16(c0), vreinterpret_u8_u16(c1)); + uint8x8_t d1 = vtrn2_u8(vreinterpret_u8_u16(c0), vreinterpret_u8_u16(c1)); + uint8x8_t d2 = vtrn1_u8(vreinterpret_u8_u16(c2), vreinterpret_u8_u16(c3)); + uint8x8_t d3 = 
vtrn2_u8(vreinterpret_u8_u16(c2), vreinterpret_u8_u16(c3)); + uint8x8_t d4 = vtrn1_u8(vreinterpret_u8_u16(c4), vreinterpret_u8_u16(c5)); + uint8x8_t d5 = vtrn2_u8(vreinterpret_u8_u16(c4), vreinterpret_u8_u16(c5)); + uint8x8_t d6 = vtrn1_u8(vreinterpret_u8_u16(c6), vreinterpret_u8_u16(c7)); + uint8x8_t d7 = vtrn2_u8(vreinterpret_u8_u16(c6), vreinterpret_u8_u16(c7)); + + vst1_u8(dst + 0 * dstride, d0); + vst1_u8(dst + 1 * dstride, d1); + vst1_u8(dst + 2 * dstride, d2); + vst1_u8(dst + 3 * dstride, d3); + vst1_u8(dst + 4 * dstride, d4); + vst1_u8(dst + 5 * dstride, d5); + vst1_u8(dst + 6 * dstride, d6); + vst1_u8(dst + 7 * dstride, d7); } @@ -67,97 +71,171 @@ void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride) { - uint16x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aA, aB, aC, aD, aE, aF; - uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, bA, bB, bC, bD, bE, bF; - uint16x8_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF; - uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, dA, dB, dC, dD, dE, dF; - - a0 = *(uint16x8_t *)(src + 0 * sstride); - a1 = *(uint16x8_t *)(src + 1 * sstride); - a2 = *(uint16x8_t *)(src + 2 * sstride); - a3 = *(uint16x8_t *)(src + 3 * sstride); - a4 = *(uint16x8_t *)(src + 4 * sstride); - a5 = *(uint16x8_t *)(src + 5 * sstride); - a6 = *(uint16x8_t *)(src + 6 * sstride); - a7 = *(uint16x8_t *)(src + 7 * sstride); - a8 = *(uint16x8_t *)(src + 8 * sstride); - a9 = *(uint16x8_t *)(src + 9 * sstride); - aA = *(uint16x8_t *)(src + 10 * sstride); - aB = *(uint16x8_t *)(src + 11 * sstride); - aC = *(uint16x8_t *)(src + 12 * sstride); - aD = *(uint16x8_t *)(src + 13 * sstride); - aE = *(uint16x8_t *)(src + 14 * sstride); - aF = *(uint16x8_t *)(src + 15 * sstride); - - b0 = vtrn1q_u64(a0, a8); - b1 = vtrn1q_u64(a1, a9); - b2 = vtrn1q_u64(a2, aA); - b3 = vtrn1q_u64(a3, aB); - b4 = vtrn1q_u64(a4, aC); - b5 = vtrn1q_u64(a5, aD); - b6 = vtrn1q_u64(a6, aE); - b7 = vtrn1q_u64(a7, aF); - b8 = vtrn2q_u64(a0, a8); - b9 = vtrn2q_u64(a1, a9); - bA = vtrn2q_u64(a2, aA); - bB = vtrn2q_u64(a3, aB); - bC = vtrn2q_u64(a4, aC); - bD = vtrn2q_u64(a5, aD); - bE = vtrn2q_u64(a6, aE); - bF = vtrn2q_u64(a7, aF); - - c0 = vtrn1q_u32(b0, b4); - c1 = vtrn1q_u32(b1, b5); - c2 = vtrn1q_u32(b2, b6); - c3 = vtrn1q_u32(b3, b7); - c4 = vtrn2q_u32(b0, b4); - c5 = vtrn2q_u32(b1, b5); - c6 = vtrn2q_u32(b2, b6); - c7 = vtrn2q_u32(b3, b7); - c8 = vtrn1q_u32(b8, bC); - c9 = vtrn1q_u32(b9, bD); - cA = vtrn1q_u32(bA, bE); - cB = vtrn1q_u32(bB, bF); - cC = vtrn2q_u32(b8, bC); - cD = vtrn2q_u32(b9, bD); - cE = vtrn2q_u32(bA, bE); - cF = vtrn2q_u32(bB, bF); - - d0 = vtrn1q_u16(c0, c2); - d1 = vtrn1q_u16(c1, c3); - d2 = vtrn2q_u16(c0, c2); - d3 = vtrn2q_u16(c1, c3); - d4 = vtrn1q_u16(c4, c6); - d5 = vtrn1q_u16(c5, c7); - d6 = vtrn2q_u16(c4, c6); - d7 = vtrn2q_u16(c5, c7); - d8 = vtrn1q_u16(c8, cA); - d9 = vtrn1q_u16(c9, cB); - dA = vtrn2q_u16(c8, cA); - dB = vtrn2q_u16(c9, cB); - dC = vtrn1q_u16(cC, cE); - dD = vtrn1q_u16(cD, cF); - dE = vtrn2q_u16(cC, cE); - dF = vtrn2q_u16(cD, cF); - - *(uint16x8_t *)(dst + 0 * dstride) = vtrn1q_u8(d0, d1); - *(uint16x8_t *)(dst + 1 * dstride) = vtrn2q_u8(d0, d1); - *(uint16x8_t *)(dst + 2 * dstride) = vtrn1q_u8(d2, d3); - *(uint16x8_t *)(dst + 3 * dstride) = vtrn2q_u8(d2, d3); - *(uint16x8_t *)(dst + 4 * dstride) = vtrn1q_u8(d4, d5); - *(uint16x8_t *)(dst + 5 * dstride) = vtrn2q_u8(d4, d5); - *(uint16x8_t *)(dst + 6 * dstride) = vtrn1q_u8(d6, d7); - *(uint16x8_t *)(dst + 7 * dstride) = vtrn2q_u8(d6, d7); - *(uint16x8_t *)(dst + 8 
* dstride) = vtrn1q_u8(d8, d9);
View file
x265_3.6.tar.gz/source/common/aarch64/arm64-utils.h -> x265_4.0.tar.gz/source/common/aarch64/arm64-utils.h
Changed
@@ -1,6 +1,7 @@ #ifndef __ARM64_UTILS_H__ #define __ARM64_UTILS_H__ +#include <stdint.h> namespace X265_NS {
View file
x265_3.6.tar.gz/source/common/aarch64/asm-primitives.cpp -> x265_4.0.tar.gz/source/common/aarch64/asm-primitives.cpp
Changed
@@ -39,15 +39,9 @@ p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \ p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \ p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu) -#define LUMA_TU_TYPED_NEON(prim, fncdef, fname) \ - p.cuBLOCK_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \ - p.cuBLOCK_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \ - p.cuBLOCK_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \ - p.cuBLOCK_64x64.prim = fncdef PFX(fname ## _64x64_ ## neon) #define LUMA_TU_TYPED_CAN_USE_SVE(prim, fncdef, fname) \ p.cuBLOCK_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve) #define ALL_LUMA_TU(prim, fname, cpu) ALL_LUMA_TU_TYPED(prim, , fname, cpu) -#define LUMA_TU_NEON(prim, fname) LUMA_TU_TYPED_NEON(prim, , fname) #define LUMA_TU_CAN_USE_SVE(prim, fname) LUMA_TU_TYPED_CAN_USE_SVE(prim, , fname) #define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \ @@ -76,50 +70,6 @@ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu) -#define LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, fncdef, fname, cpu) \ - p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## cpu); \ - p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## cpu); \ - p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## cpu) -#define LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, fncdef, fname, cpu) \ - p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \ - p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \ - p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \ - p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \ - p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## cpu); \ - p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \ - p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## cpu); \ - p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \ - p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \ - p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \ - p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \ - p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \ - p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## cpu); \ - p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## cpu); \ - p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \ - p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \ - p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \ - p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu); \ - p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \ - p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \ - p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \ - p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu) -#define LUMA_PU_TYPED_NEON_1(prim, fncdef, fname) \ - p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \ - p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \ - p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \ - p.puLUMA_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \ - p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \ - p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \ - p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## neon); \ - p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## neon); \ - p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## neon); \ - p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \ - p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \ - p.puLUMA_16x4.prim = fncdef PFX(fname ## 
_16x4_ ## neon); \ - p.puLUMA_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \ - p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## neon); \ - p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## neon); \ - p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon) #define LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## sve); \ @@ -130,20 +80,6 @@ p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## sve); \ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## sve); \ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve) -#define LUMA_PU_TYPED_NEON_2(prim, fncdef, fname) \ - p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \ - p.puLUMA_8x4.prim = fncdef PFX(fname ## _8x4_ ## neon); \ - p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \ - p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \ - p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## neon); \ - p.puLUMA_8x16.prim = fncdef PFX(fname ## _8x16_ ## neon); \ - p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \ - p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \ - p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \ - p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## neon); \ - p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \ - p.puLUMA_8x32.prim = fncdef PFX(fname ## _8x32_ ## neon); \ - p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## neon) #define LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, fncdef, fname, cpu) \ p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \ p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \ @@ -157,10 +93,6 @@ p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu) -#define LUMA_PU_TYPED_NEON_3(prim, fncdef, fname) \ - p.puLUMA_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \ - p.puLUMA_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \ - p.puLUMA_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon) #define LUMA_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \ p.puLUMA_8x8.prim = fncdef PFX(fname ## _8x8_ ## sve2); \ p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## sve2); \ @@ -184,22 +116,6 @@ p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## sve2); \ p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## sve2); \ p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## sve2) -#define LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \ - p.puLUMA_4x4.prim = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \ - p.puLUMA_8x8.prim = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \ - p.puLUMA_16x16.prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \ - p.puLUMA_8x4.prim = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \ - p.puLUMA_4x8.prim = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \ - p.puLUMA_16x8.prim = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \ - p.puLUMA_8x16.prim = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \ - p.puLUMA_16x32.prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \ - p.puLUMA_16x12.prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \ - p.puLUMA_12x16.prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \ - p.puLUMA_16x4.prim = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \ - p.puLUMA_4x16.prim = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \ - p.puLUMA_24x32.prim = fncdef PFX(filterPixelToShort 
## _24x32_ ## neon); \ - p.puLUMA_8x32.prim = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \ - p.puLUMA_16x64.prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon) #define LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \ p.puLUMA_32x32.prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \ p.puLUMA_32x16.prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \ @@ -211,17 +127,29 @@ p.puLUMA_64x48.prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \ p.puLUMA_64x16.prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \ p.puLUMA_48x64.prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve) +#define LUMA_PU_TYPED_MULTIPLE_16(prim, fncdef, fname, cpu) \ + p.puLUMA_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \ + p.puLUMA_32x32.prim = fncdef PFX(fname ## _32x32_ ## cpu); \ + p.puLUMA_64x64.prim = fncdef PFX(fname ## _64x64_ ## cpu); \ + p.puLUMA_16x8.prim = fncdef PFX(fname ## _16x8_ ## cpu); \ + p.puLUMA_16x32.prim = fncdef PFX(fname ## _16x32_ ## cpu); \ + p.puLUMA_32x16.prim = fncdef PFX(fname ## _32x16_ ## cpu); \ + p.puLUMA_64x32.prim = fncdef PFX(fname ## _64x32_ ## cpu); \ + p.puLUMA_32x64.prim = fncdef PFX(fname ## _32x64_ ## cpu); \ + p.puLUMA_16x12.prim = fncdef PFX(fname ## _16x12_ ## cpu); \ + p.puLUMA_16x4.prim = fncdef PFX(fname ## _16x4_ ## cpu); \ + p.puLUMA_32x24.prim = fncdef PFX(fname ## _32x24_ ## cpu); \ + p.puLUMA_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \ + p.puLUMA_64x48.prim = fncdef PFX(fname ## _64x48_ ## cpu); \ + p.puLUMA_48x64.prim = fncdef PFX(fname ## _48x64_ ## cpu); \ + p.puLUMA_64x16.prim = fncdef PFX(fname ## _64x16_ ## cpu); \ + p.puLUMA_16x64.prim = fncdef PFX(fname ## _16x64_ ## cpu) #define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu) -#define LUMA_PU_MULTIPLE_ARCHS_1(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, , fname, cpu) -#define LUMA_PU_MULTIPLE_ARCHS_2(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, , fname, cpu) -#define LUMA_PU_NEON_1(prim, fname) LUMA_PU_TYPED_NEON_1(prim, , fname) #define LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname) -#define LUMA_PU_NEON_2(prim, fname) LUMA_PU_TYPED_NEON_2(prim, , fname) #define LUMA_PU_MULTIPLE_ARCHS_3(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, , fname, cpu) -#define LUMA_PU_NEON_3(prim, fname) LUMA_PU_TYPED_NEON_3(prim, , fname) #define LUMA_PU_CAN_USE_SVE2(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE2(prim, , fname) -#define LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, ) #define LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, ) +#define LUMA_PU_MULTIPLE_16(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_16(prim, , fname, cpu) #define ALL_LUMA_PU_T(prim, fname) \ @@ -276,37 +204,9 @@ p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \ p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \ p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu) -#define CHROMA_420_PU_TYPED_NEON_1(prim, fncdef, fname) \ - p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim = fncdef PFX(fname ## _4x2_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_6x8.prim = fncdef PFX(fname ## _6x8_ ## neon); \ - 
p.chromaX265_CSP_I420.puCHROMA_420_12x16.prim = fncdef PFX(fname ## _12x16_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_32x24.prim = fncdef PFX(fname ## _32x24_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim = fncdef PFX(fname ## _32x8_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim = fncdef PFX(fname ## _8x32_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim = fncdef PFX(fname ## _8x8_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_2x4.prim = fncdef PFX(fname ## _2x4_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_8x4.prim = fncdef PFX(fname ## _8x4_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_16x8.prim = fncdef PFX(fname ## _16x8_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_8x16.prim = fncdef PFX(fname ## _8x16_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_16x32.prim = fncdef PFX(fname ## _16x32_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_8x6.prim = fncdef PFX(fname ## _8x6_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_8x2.prim = fncdef PFX(fname ## _8x2_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_2x8.prim = fncdef PFX(fname ## _2x8_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_16x12.prim = fncdef PFX(fname ## _16x12_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_16x4.prim = fncdef PFX(fname ## _16x4_ ## neon) #define CHROMA_420_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \ p.chromaX265_CSP_I420.puCHROMA_420_32x32.prim = fncdef PFX(fname ## _32x32_ ## sve); \ p.chromaX265_CSP_I420.puCHROMA_420_32x16.prim = fncdef PFX(fname ## _32x16_ ## sve) -#define CHROMA_420_PU_TYPED_NEON_2(prim, fncdef, fname) \ - p.chromaX265_CSP_I420.puCHROMA_420_4x4.prim = fncdef PFX(fname ## _4x4_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_4x2.prim = fncdef PFX(fname ## _4x2_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_4x8.prim = fncdef PFX(fname ## _4x8_ ## neon); \ - p.chromaX265_CSP_I420.puCHROMA_420_4x16.prim = fncdef PFX(fname ## _4x16_ ## neon) #define CHROMA_420_PU_TYPED_MULTIPLE_ARCHS(prim, fncdef, fname, cpu) \ p.chromaX265_CSP_I420.puCHROMA_420_8x8.prim = fncdef PFX(fname ## _8x8_ ## cpu); \ p.chromaX265_CSP_I420.puCHROMA_420_16x16.prim = fncdef PFX(fname ## _16x16_ ## cpu); \ @@ -328,23 +228,6 @@ p.chromaX265_CSP_I420.puCHROMA_420_24x32.prim = fncdef PFX(fname ## _24x32_ ## cpu); \ p.chromaX265_CSP_I420.puCHROMA_420_32x8.prim = fncdef PFX(fname ## _32x8_ ## cpu); \ p.chromaX265_CSP_I420.puCHROMA_420_8x32.prim = fncdef PFX(fname ## _8x32_ ## cpu)
View file
x265_3.6.tar.gz/source/common/aarch64/asm.S -> x265_4.0.tar.gz/source/common/aarch64/asm.S
Changed
@@ -72,6 +72,16 @@ #define PFX_C(name) JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name) +// Alignment of stack arguments of size less than 8 bytes. +#ifdef __APPLE__ +#define STACK_ARG_ALIGNMENT 4 +#else +#define STACK_ARG_ALIGNMENT 8 +#endif + +// Get offset from SP of stack argument at index `idx`. +#define STACK_ARG_OFFSET(idx) (idx * STACK_ARG_ALIGNMENT) + #ifdef __APPLE__ .macro endfunc ELF .size \name, . - \name @@ -184,4 +194,19 @@ vtrn \t3, \t4, \s3, \s4 .endm -#endif \ No newline at end of file + +.macro push_vec_regs + stp d8, d9, [sp,#-16]! + stp d10, d11, [sp,#-16]! + stp d12, d13, [sp,#-16]! + stp d14, d15, [sp,#-16]! +.endm + +.macro pop_vec_regs + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 +.endm + +#endif
View file
x265_3.6.tar.gz/source/common/aarch64/blockcopy8-sve.S -> x265_4.0.tar.gz/source/common/aarch64/blockcopy8-sve.S
Changed
@@ -112,7 +112,7 @@ lsl x3, x3, #1 movrel x11, xtn_xtn2_table ld1 {v31.16b}, x11 -.loop_csp32_sve: +.Loop_csp32_sve: sub w12, w12, #1 .rept 4 ld1 {v0.8h-v3.8h}, x2, x3 @@ -124,7 +124,7 @@ st1 {v0.16b-v1.16b}, x0, x1 st1 {v2.16b-v3.16b}, x0, x1 .endr - cbnz w12, .loop_csp32_sve + cbnz w12, .Loop_csp32_sve ret .vl_gt_16_blockcopy_sp_32_32: cmp x9, #48 @@ -199,7 +199,7 @@ bgt .vl_gt_16_blockcopy_ps_32_32 lsl x1, x1, #1 mov w12, #4 -.loop_cps32_sve: +.Loop_cps32_sve: sub w12, w12, #1 .rept 4 ld1 {v16.16b-v17.16b}, x2, x3 @@ -215,7 +215,7 @@ st1 {v0.8h-v3.8h}, x0, x1 st1 {v4.8h-v7.8h}, x0, x1 .endr - cbnz w12, .loop_cps32_sve + cbnz w12, .Loop_cps32_sve ret .vl_gt_16_blockcopy_ps_32_32: cmp x9, #48 @@ -248,7 +248,7 @@ lsl x1, x1, #1 sub x1, x1, #64 mov w12, #16 -.loop_cps64_sve: +.Loop_cps64_sve: sub w12, w12, #1 .rept 4 ld1 {v16.16b-v19.16b}, x2, x3 @@ -263,7 +263,7 @@ st1 {v0.8h-v3.8h}, x0, #64 st1 {v4.8h-v7.8h}, x0, x1 .endr - cbnz w12, .loop_cps64_sve + cbnz w12, .Loop_cps64_sve ret .vl_gt_16_blockcopy_ps_64_64: cmp x9, #48 @@ -338,13 +338,13 @@ lsl x1, x1, #1 lsl x3, x3, #1 mov w12, #4 -.loop_css32_sve: +.Loop_css32_sve: sub w12, w12, #1 .rept 8 ld1 {v0.8h-v3.8h}, x2, x3 st1 {v0.8h-v3.8h}, x0, x1 .endr - cbnz w12, .loop_css32_sve + cbnz w12, .Loop_css32_sve ret .vl_gt_16_blockcopy_ss_32_32: cmp x9, #48 @@ -379,7 +379,7 @@ lsl x3, x3, #1 sub x3, x3, #64 mov w12, #8 -.loop_css64_sve: +.Loop_css64_sve: sub w12, w12, #1 .rept 8 ld1 {v0.8h-v3.8h}, x2, #64 @@ -387,7 +387,7 @@ st1 {v0.8h-v3.8h}, x0, #64 st1 {v4.8h-v7.8h}, x0, x1 .endr - cbnz w12, .loop_css64_sve + cbnz w12, .Loop_css64_sve ret .vl_gt_16_blockcopy_ss_64_64: cmp x9, #48 @@ -474,13 +474,13 @@ lsl x1, x1, #1 lsl x3, x3, #1 mov w12, #8 -.loop_css32x64_sve: +.Loop_css32x64_sve: sub w12, w12, #1 .rept 8 ld1 {v0.8h-v3.8h}, x2, x3 st1 {v0.8h-v3.8h}, x0, x1 .endr - cbnz w12, .loop_css32x64_sve + cbnz w12, .Loop_css32x64_sve ret .vl_gt_16_blockcopy_ss_32_64: cmp x9, #48 @@ -570,7 +570,7 @@ bgt .vl_gt_16_blockcopy_ps_32_64 lsl x1, x1, #1 mov w12, #8 -.loop_cps32x64_sve: +.Loop_cps32x64_sve: sub w12, w12, #1 .rept 4 ld1 {v16.16b-v17.16b}, x2, x3 @@ -586,7 +586,7 @@ st1 {v0.8h-v3.8h}, x0, x1 st1 {v4.8h-v7.8h}, x0, x1 .endr - cbnz w12, .loop_cps32x64_sve + cbnz w12, .Loop_cps32x64_sve ret .vl_gt_16_blockcopy_ps_32_64: cmp x9, #48 @@ -730,13 +730,13 @@ rdvl x9, #1 cmp x9, #16 bgt .vl_gt_16_blockcopy_pp_32xN_\h -.loop_sve_32x\h\(): +.Loop_sve_32x\h\(): sub w12, w12, #1 .rept 8 ld1 {v0.16b-v1.16b}, x2, x3 st1 {v0.16b-v1.16b}, x0, x1 .endr - cbnz w12, .loop_sve_32x\h + cbnz w12, .Loop_sve_32x\h ret .vl_gt_16_blockcopy_pp_32xN_\h: ptrue p0.b, vl32 @@ -765,13 +765,13 @@ rdvl x9, #1 cmp x9, #16 bgt .vl_gt_16_blockcopy_pp_64xN_\h -.loop_sve_64x\h\(): +.Loop_sve_64x\h\(): sub w12, w12, #1 .rept 4 ld1 {v0.16b-v3.16b}, x2, x3 st1 {v0.16b-v3.16b}, x0, x1 .endr - cbnz w12, .loop_sve_64x\h + cbnz w12, .Loop_sve_64x\h ret .vl_gt_16_blockcopy_pp_64xN_\h: cmp x9, #48 @@ -856,7 +856,7 @@ bgt .vl_gt_16_cpy2Dto1D_shl_16x16 cpy2Dto1D_shl_start_sve mov w12, #4 -.loop_cpy2Dto1D_shl_16_sve: +.Loop_cpy2Dto1D_shl_16_sve: sub w12, w12, #1 .rept 4 ld1 {v2.16b-v3.16b}, x1, x2 @@ -864,7 +864,7 @@ sshl v3.8h, v3.8h, v0.8h st1 {v2.16b-v3.16b}, x0, #32 .endr - cbnz w12, .loop_cpy2Dto1D_shl_16_sve + cbnz w12, .Loop_cpy2Dto1D_shl_16_sve ret .vl_gt_16_cpy2Dto1D_shl_16x16: ptrue p0.h, vl16 @@ -885,7 +885,7 @@ bgt .vl_gt_16_cpy2Dto1D_shl_32x32 cpy2Dto1D_shl_start_sve mov w12, #16 -.loop_cpy2Dto1D_shl_32_sve: +.Loop_cpy2Dto1D_shl_32_sve: sub w12, w12, #1 .rept 2 ld1 {v2.16b-v5.16b}, 
x1, x2 @@ -895,7 +895,7 @@ sshl v5.8h, v5.8h, v0.8h st1 {v2.16b-v5.16b}, x0, #64 .endr - cbnz w12, .loop_cpy2Dto1D_shl_32_sve + cbnz w12, .Loop_cpy2Dto1D_shl_32_sve ret .vl_gt_16_cpy2Dto1D_shl_32x32: cmp x9, #48 @@ -931,7 +931,7 @@ cpy2Dto1D_shl_start_sve mov w12, #32 sub x2, x2, #64 -.loop_cpy2Dto1D_shl_64_sve: +.Loop_cpy2Dto1D_shl_64_sve: sub w12, w12, #1 .rept 2 ld1 {v2.16b-v5.16b}, x1, #64 @@ -947,7 +947,7 @@
View file
x265_3.6.tar.gz/source/common/aarch64/blockcopy8.S -> x265_4.0.tar.gz/source/common/aarch64/blockcopy8.S
Changed
@@ -86,7 +86,7 @@ lsl x3, x3, #1 movrel x11, xtn_xtn2_table ld1 {v31.16b}, x11 -.loop_csp32: +.Loop_csp32: sub w12, w12, #1 .rept 4 ld1 {v0.8h-v3.8h}, x2, x3 @@ -98,7 +98,7 @@ st1 {v0.16b-v1.16b}, x0, x1 st1 {v2.16b-v3.16b}, x0, x1 .endr - cbnz w12, .loop_csp32 + cbnz w12, .Loop_csp32 ret endfunc @@ -108,7 +108,7 @@ sub x3, x3, #64 movrel x11, xtn_xtn2_table ld1 {v31.16b}, x11 -.loop_csp64: +.Loop_csp64: sub w12, w12, #1 .rept 4 ld1 {v0.8h-v3.8h}, x2, #64 @@ -119,7 +119,7 @@ tbl v3.16b, {v6.16b,v7.16b}, v31.16b st1 {v0.16b-v3.16b}, x0, x1 .endr - cbnz w12, .loop_csp64 + cbnz w12, .Loop_csp64 ret endfunc @@ -168,7 +168,7 @@ function PFX(blockcopy_ps_32x32_neon) lsl x1, x1, #1 mov w12, #4 -.loop_cps32: +.Loop_cps32: sub w12, w12, #1 .rept 4 ld1 {v16.16b-v17.16b}, x2, x3 @@ -184,7 +184,7 @@ st1 {v0.8h-v3.8h}, x0, x1 st1 {v4.8h-v7.8h}, x0, x1 .endr - cbnz w12, .loop_cps32 + cbnz w12, .Loop_cps32 ret endfunc @@ -192,7 +192,7 @@ lsl x1, x1, #1 sub x1, x1, #64 mov w12, #16 -.loop_cps64: +.Loop_cps64: sub w12, w12, #1 .rept 4 ld1 {v16.16b-v19.16b}, x2, x3 @@ -207,7 +207,7 @@ st1 {v0.8h-v3.8h}, x0, #64 st1 {v4.8h-v7.8h}, x0, x1 .endr - cbnz w12, .loop_cps64 + cbnz w12, .Loop_cps64 ret endfunc @@ -252,13 +252,13 @@ lsl x1, x1, #1 lsl x3, x3, #1 mov w12, #4 -.loop_css32: +.Loop_css32: sub w12, w12, #1 .rept 8 ld1 {v0.8h-v3.8h}, x2, x3 st1 {v0.8h-v3.8h}, x0, x1 .endr - cbnz w12, .loop_css32 + cbnz w12, .Loop_css32 ret endfunc @@ -268,7 +268,7 @@ lsl x3, x3, #1 sub x3, x3, #64 mov w12, #8 -.loop_css64: +.Loop_css64: sub w12, w12, #1 .rept 8 ld1 {v0.8h-v3.8h}, x2, #64 @@ -276,7 +276,7 @@ st1 {v0.8h-v3.8h}, x0, #64 st1 {v4.8h-v7.8h}, x0, x1 .endr - cbnz w12, .loop_css64 + cbnz w12, .Loop_css64 ret endfunc @@ -321,13 +321,13 @@ lsl x1, x1, #1 lsl x3, x3, #1 mov w12, #8 -.loop_css32x64: +.Loop_css32x64: sub w12, w12, #1 .rept 8 ld1 {v0.8h-v3.8h}, x2, x3 st1 {v0.8h-v3.8h}, x0, x1 .endr - cbnz w12, .loop_css32x64 + cbnz w12, .Loop_css32x64 ret endfunc @@ -376,7 +376,7 @@ function PFX(blockcopy_ps_32x64_neon) lsl x1, x1, #1 mov w12, #8 -.loop_cps32x64: +.Loop_cps32x64: sub w12, w12, #1 .rept 4 ld1 {v16.16b-v17.16b}, x2, x3 @@ -392,7 +392,7 @@ st1 {v0.8h-v3.8h}, x0, x1 st1 {v4.8h-v7.8h}, x0, x1 .endr - cbnz w12, .loop_cps32x64 + cbnz w12, .Loop_cps32x64 ret endfunc @@ -443,7 +443,7 @@ lsl x3, x3, #1 movrel x11, xtn_xtn2_table ld1 {v31.16b}, x11 -.loop_csp32x64: +.Loop_csp32x64: sub w12, w12, #1 .rept 4 ld1 {v0.8h-v3.8h}, x2, x3 @@ -455,7 +455,7 @@ st1 {v0.16b-v1.16b}, x0, x1 st1 {v2.16b-v3.16b}, x0, x1 .endr - cbnz w12, .loop_csp32x64 + cbnz w12, .Loop_csp32x64 ret endfunc @@ -595,13 +595,13 @@ function PFX(blockcopy_pp_8x64_neon) mov w12, #4 -.loop_pp_8x64: +.Loop_pp_8x64: sub w12, w12, #1 .rept 16 ld1 {v0.4h}, x2, x3 st1 {v0.4h}, x0, x1 .endr - cbnz w12, .loop_pp_8x64 + cbnz w12, .Loop_pp_8x64 ret endfunc @@ -623,13 +623,13 @@ .macro blockcopy_pp_16xN1_neon h function PFX(blockcopy_pp_16x\h\()_neon) mov w12, #\h / 8 -.loop_16x\h\(): +.Loop_16x\h\(): .rept 8 ld1 {v0.8h}, x2, x3 st1 {v0.8h}, x0, x1 .endr sub w12, w12, #1 - cbnz w12, .loop_16x\h + cbnz w12, .Loop_16x\h ret endfunc .endm @@ -651,38 +651,38 @@ function PFX(blockcopy_pp_12x32_neon) sub x1, x1, #8 mov w12, #4 -.loop_pp_12x32: +.Loop_pp_12x32: sub w12, w12, #1 .rept 8 ld1 {v0.16b}, x2, x3 str d0, x0, #8
View file
x265_4.0.tar.gz/source/common/aarch64/dct-prim-sve.cpp
Added
@@ -0,0 +1,491 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * Jonathan Wright <jonathan.wright@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "dct-prim.h" +#include "neon-sve-bridge.h" +#include <arm_neon.h> + + +namespace +{ +using namespace X265_NS; + +// First four elements (duplicated) of rows 1, 3, 5 and 7 in g_t8 (8x8 DCT +// matrix.) +const int16_t t8_odd48 = +{ + { 89, 75, 50, 18, 89, 75, 50, 18 }, + { 75, -18, -89, -50, 75, -18, -89, -50 }, + { 50, -89, 18, 75, 50, -89, 18, 75 }, + { 18, -50, 75, -89, 18, -50, 75, -89 }, +}; + +template<int shift> +static inline void partialButterfly8_sve(const int16_t *src, int16_t *dst) +{ + const int line = 8; + + int16x8_t Oline / 2; + int32x4_t EEline / 2; + int32x4_t EOline / 2; + + for (int i = 0; i < line; i += 2) + { + int16x8_t s_lo = vcombine_s16(vld1_s16(src + i * line), + vld1_s16(src + (i + 1) * line)); + int16x8_t s_hi = vcombine_s16( + vrev64_s16(vld1_s16(src + i * line + 4)), + vrev64_s16(vld1_s16(src + (i + 1) * line + 4))); + + int32x4_t E0 = vaddl_s16(vget_low_s16(s_lo), vget_low_s16(s_hi)); + int32x4_t E1 = vaddl_s16(vget_high_s16(s_lo), vget_high_s16(s_hi)); + + Oi / 2 = vsubq_s16(s_lo, s_hi); + + int32x4_t t0 = vreinterpretq_s32_s64( + vzip1q_s64(vreinterpretq_s64_s32(E0), vreinterpretq_s64_s32(E1))); + int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64( + vzip2q_s64(vreinterpretq_s64_s32(E0), vreinterpretq_s64_s32(E1)))); + + EEi / 2 = vaddq_s32(t0, t1); + EOi / 2 = vsubq_s32(t0, t1); + } + + int16_t *d = dst; + + int32x4_t c0 = vld1q_s32(t8_even0); + int32x4_t c2 = vld1q_s32(t8_even1); + int32x4_t c4 = vld1q_s32(t8_even2); + int32x4_t c6 = vld1q_s32(t8_even3); + int16x8_t c1 = vld1q_s16(t8_odd0); + int16x8_t c3 = vld1q_s16(t8_odd1); + int16x8_t c5 = vld1q_s16(t8_odd2); + int16x8_t c7 = vld1q_s16(t8_odd3); + + for (int j = 0; j < line; j += 4) + { + // O + int64x2_t t01 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 0, c1); + int64x2_t t23 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 1, c1); + int32x4_t t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23)); + int16x4_t res1 = vrshrn_n_s32(t0123, shift); + vst1_s16(d + 1 * line, res1); + + t01 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 0, c3); + t23 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 1, c3); + t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23)); + int16x4_t res3 = vrshrn_n_s32(t0123, shift); + vst1_s16(d + 3 * line, res3); + + t01 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 0, c5); + t23 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 1, c5); + t0123 
= vcombine_s32(vmovn_s64(t01), vmovn_s64(t23)); + int16x4_t res5 = vrshrn_n_s32(t0123, shift); + vst1_s16(d + 5 * line, res5); + + t01 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 0, c7); + t23 = x265_sdotq_s16(vdupq_n_s64(0), Oj / 2 + 1, c7); + t0123 = vcombine_s32(vmovn_s64(t01), vmovn_s64(t23)); + int16x4_t res7 = vrshrn_n_s32(t0123, shift); + vst1_s16(d + 7 * line, res7); + + // EE and EO + int32x4_t t0 = vpaddq_s32(EEj / 2 + 0, EEj / 2 + 1); + int32x4_t t1 = vmulq_s32(c0, t0); + int16x4_t res0 = vrshrn_n_s32(t1, shift); + vst1_s16(d + 0 * line, res0); + + int32x4_t t2 = vmulq_s32(c2, EOj / 2 + 0); + int32x4_t t3 = vmulq_s32(c2, EOj / 2 + 1); + int16x4_t res2 = vrshrn_n_s32(vpaddq_s32(t2, t3), shift); + vst1_s16(d + 2 * line, res2); + + int32x4_t t4 = vmulq_s32(c4, EEj / 2 + 0); + int32x4_t t5 = vmulq_s32(c4, EEj / 2 + 1); + int16x4_t res4 = vrshrn_n_s32(vpaddq_s32(t4, t5), shift); + vst1_s16(d + 4 * line, res4); + + int32x4_t t6 = vmulq_s32(c6, EOj / 2 + 0); + int32x4_t t7 = vmulq_s32(c6, EOj / 2 + 1); + int16x4_t res6 = vrshrn_n_s32(vpaddq_s32(t6, t7), shift); + vst1_s16(d + 6 * line, res6); + + d += 4; + } +} + +template<int shift> +static inline void partialButterfly16_sve(const int16_t *src, int16_t *dst) +{ + const int line = 16; + + int16x8_t Oline; + int16x8_t EOline / 2; + int32x4_t EEEline; + int32x4_t EEOline; + + for (int i = 0; i < line; i += 2) + { + int16x8_t s0_lo = vld1q_s16(src + i * line); + int16x8_t s0_hi = rev16(vld1q_s16(src + i * line + 8)); + + int16x8_t s1_lo = vld1q_s16(src + (i + 1) * line); + int16x8_t s1_hi = rev16(vld1q_s16(src + (i + 1) * line + 8)); + + int32x4_t E02; + E00 = vaddl_s16(vget_low_s16(s0_lo), vget_low_s16(s0_hi)); + E01 = vaddl_s16(vget_high_s16(s0_lo), vget_high_s16(s0_hi)); + + int32x4_t E12; + E10 = vaddl_s16(vget_low_s16(s1_lo), vget_low_s16(s1_hi)); + E11 = vaddl_s16(vget_high_s16(s1_lo), vget_high_s16(s1_hi)); + + Oi + 0 = vsubq_s16(s0_lo, s0_hi); + Oi + 1 = vsubq_s16(s1_lo, s1_hi); + + int16x4_t EO_lo = vmovn_s32(vsubq_s32(E00, rev32(E01))); + int16x4_t EO_hi = vmovn_s32(vsubq_s32(E10, rev32(E11))); + EOi / 2 = vcombine_s16(EO_lo, EO_hi); + + int32x4_t EE0 = vaddq_s32(E00, rev32(E01)); + int32x4_t EE1 = vaddq_s32(E10, rev32(E11)); + + int32x4_t t0 = vreinterpretq_s32_s64( + vzip1q_s64(vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1))); + int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64( + vzip2q_s64(vreinterpretq_s64_s32(EE0), + vreinterpretq_s64_s32(EE1)))); + + EEEi / 2 = vaddq_s32(t0, t1); + EEOi / 2 = vsubq_s32(t0, t1); + } + + for (int i = 0; i < line; i += 4) + { + for (int k = 1; k < 16; k += 2) + { + int16x8_t c0_c4 = vld1q_s16(&g_t16k0); + + int64x2_t t0 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, Oi + 0); + int64x2_t t1 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, Oi + 1); + int64x2_t t2 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, Oi + 2); + int64x2_t t3 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, Oi + 3); + + int32x4_t t01 = vcombine_s32(vmovn_s64(t0), vmovn_s64(t1)); + int32x4_t t23 = vcombine_s32(vmovn_s64(t2), vmovn_s64(t3)); + int16x4_t res = vrshrn_n_s32(vpaddq_s32(t01, t23), shift); + vst1_s16(dst + k * line, res); + }
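For reference, the even/odd "partial butterfly" decomposition that the SVE kernels above vectorise fits in a few lines of scalar C. The sketch below is not x265's code; it is the plain-C arithmetic the SIMD version reproduces, using the standard HEVC coefficients that also appear in the t8_odd table (rows 89/75/50/18 and so on) together with the even-part constants 64, 83 and 36.

#include <stdint.h>

// Scalar reference for the 8-point forward partial butterfly: E/O are the
// mirrored sums and differences, EE/EO split the even half again, and the
// odd outputs are 4-tap dot products against the t8_odd rows.
static void partialButterfly8_ref(const int16_t *src, int16_t *dst, int shift)
{
    static const int16_t oddCoef[4][4] =
    {
        { 89,  75,  50,  18 }, { 75, -18, -89, -50 },
        { 50, -89,  18,  75 }, { 18, -50,  75, -89 },
    };
    const int line = 8;
    const int add = 1 << (shift - 1);            // rounding term, done by vrshrn above

    for (int j = 0; j < line; j++, src += line, dst++)
    {
        int E[4], O[4];
        for (int k = 0; k < 4; k++)
        {
            E[k] = src[k] + src[7 - k];
            O[k] = src[k] - src[7 - k];
        }
        int EE[2] = { E[0] + E[3], E[1] + E[2] };
        int EO[2] = { E[0] - E[3], E[1] - E[2] };

        dst[0 * line] = (int16_t)((64 * EE[0] + 64 * EE[1] + add) >> shift);
        dst[4 * line] = (int16_t)((64 * EE[0] - 64 * EE[1] + add) >> shift);
        dst[2 * line] = (int16_t)((83 * EO[0] + 36 * EO[1] + add) >> shift);
        dst[6 * line] = (int16_t)((36 * EO[0] - 83 * EO[1] + add) >> shift);

        for (int r = 0; r < 4; r++)
        {
            int sum = oddCoef[r][0] * O[0] + oddCoef[r][1] * O[1] +
                      oddCoef[r][2] * O[2] + oddCoef[r][3] * O[3];
            dst[(2 * r + 1) * line] = (int16_t)((sum + add) >> shift);
        }
    }
}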
View file
x265_3.6.tar.gz/source/common/aarch64/dct-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/dct-prim.cpp
Changed
@@ -5,36 +5,35 @@ #include <arm_neon.h> +#define X265_PRAGMA(text) _Pragma(#text) +#if defined(__clang__) +#define X265_PRAGMA_UNROLL(n) X265_PRAGMA(unroll(n)) +#elif defined(__GNUC__) +#define X265_PRAGMA_UNROLL(n) X265_PRAGMA(GCC unroll (n)) +#else +#define X265_PRAGMA_UNROLL(n) +#endif + +extern "C" void PFX(dct16_neon)(const int16_t *src, int16_t *dst, intptr_t srcStride); +extern "C" void PFX(idct16_neon)(const int16_t *src, int16_t *dst, intptr_t dstStride); namespace { using namespace X265_NS; - -static int16x8_t rev16(const int16x8_t a) +static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3) { - static const int8x16_t tbl = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1}; - return vqtbx1q_u8(a, a, tbl); -} + int32x2_t s0, s1, s2, s3; -static int32x4_t rev32(const int32x4_t a) -{ - static const int8x16_t tbl = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; - return vqtbx1q_u8(a, a, tbl); -} + s0 = vtrn1_s32(vreinterpret_s32_s16(x0), vreinterpret_s32_s16(x2)); + s1 = vtrn1_s32(vreinterpret_s32_s16(x1), vreinterpret_s32_s16(x3)); + s2 = vtrn2_s32(vreinterpret_s32_s16(x0), vreinterpret_s32_s16(x2)); + s3 = vtrn2_s32(vreinterpret_s32_s16(x1), vreinterpret_s32_s16(x3)); -static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3) -{ - int16x4_t s0, s1, s2, s3; - s0 = vtrn1_s32(x0, x2); - s1 = vtrn1_s32(x1, x3); - s2 = vtrn2_s32(x0, x2); - s3 = vtrn2_s32(x1, x3); - - x0 = vtrn1_s16(s0, s1); - x1 = vtrn2_s16(s0, s1); - x2 = vtrn1_s16(s2, s3); - x3 = vtrn2_s16(s2, s3); + x0 = vtrn1_s16(vreinterpret_s16_s32(s0), vreinterpret_s16_s32(s1)); + x1 = vtrn2_s16(vreinterpret_s16_s32(s0), vreinterpret_s16_s32(s1)); + x2 = vtrn1_s16(vreinterpret_s16_s32(s2), vreinterpret_s16_s32(s3)); + x3 = vtrn2_s16(vreinterpret_s16_s32(s2), vreinterpret_s16_s32(s3)); } @@ -111,13 +110,13 @@ int64x2_t vcost_sum_1 = vdupq_n_s64(0); for (int y = 0; y < MLS_CG_SIZE; y++) { - int16x4_t in = *(int16x4_t *)&m_resiDctCoeffblkPos; + int16x4_t in = vld1_s16(&m_resiDctCoeffblkPos); int32x4_t mul = vmull_s16(in, in); int64x2_t cost0, cost1; cost0 = vshll_n_s32(vget_low_s32(mul), scaleBits); cost1 = vshll_high_n_s32(mul, scaleBits); - *(int64x2_t *)&costUncodedblkPos + 0 = cost0; - *(int64x2_t *)&costUncodedblkPos + 2 = cost1; + vst1q_s64(&costUncodedblkPos + 0, cost0); + vst1q_s64(&costUncodedblkPos + 2, cost1); vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0); vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1); blkPos += trSize; @@ -143,8 +142,9 @@ int32x4_t vpsy = vdupq_n_s32(*psyScale); for (int y = 0; y < MLS_CG_SIZE; y++) { - int32x4_t signCoef = vmovl_s16(*(int16x4_t *)&m_resiDctCoeffblkPos); - int32x4_t predictedCoef = vsubq_s32(vmovl_s16(*(int16x4_t *)&m_fencDctCoeffblkPos), signCoef); + int32x4_t signCoef = vmovl_s16(vld1_s16(&m_resiDctCoeffblkPos)); + int32x4_t fencCoef = vmovl_s16(vld1_s16(&m_fencDctCoeffblkPos)); + int32x4_t predictedCoef = vsubq_s32(fencCoef, signCoef); int64x2_t cost0, cost1; cost0 = vmull_s32(vget_low_s32(signCoef), vget_low_s32(signCoef)); cost1 = vmull_high_s32(signCoef, signCoef); @@ -160,8 +160,8 @@ } cost0 = vsubq_s64(cost0, neg0); cost1 = vsubq_s64(cost1, neg1); - *(int64x2_t *)&costUncodedblkPos + 0 = cost0; - *(int64x2_t *)&costUncodedblkPos + 2 = cost1; + vst1q_s64(&costUncodedblkPos + 0, cost0); + vst1q_s64(&costUncodedblkPos + 2, cost1); vcost_sum_0 = vaddq_s64(vcost_sum_0, cost0); vcost_sum_1 = vaddq_s64(vcost_sum_1, cost1); @@ -188,8 +188,9 @@ int i = 0; for (; (i + 8) <= numCoeff; i += 8) { - int16x8_t in = 
*(int16x8_t *)&quantCoeffi; - vcount = vaddq_s16(vcount, vtstq_s16(in, in)); + int16x8_t in = vld1q_s16(&quantCoeffi); + uint16x8_t tst = vtstq_s16(in, in); + vcount = vaddq_s16(vcount, vreinterpretq_s16_u16(tst)); } for (; i < numCoeff; i++) { @@ -209,9 +210,10 @@ int j = 0; for (; (j + 8) <= trSize; j += 8) { - int16x8_t in = *(int16x8_t *)&residualj; - *(int16x8_t *)&coeffj = in; - vcount = vaddq_s16(vcount, vtstq_s16(in, in)); + int16x8_t in = vld1q_s16(&residualj); + vst1q_s16(&coeffj, in); + uint16x8_t tst = vtstq_s16(in, in); + vcount = vaddq_s16(vcount, vreinterpretq_s16_u16(tst)); } for (; j < trSize; j++) { @@ -225,200 +227,396 @@ return numSig - vaddvq_s16(vcount); } - -static void partialButterfly16(const int16_t *src, int16_t *dst, int shift, int line) +template<int shift> +static inline void partialButterfly16_neon(const int16_t *src, int16_t *dst) { - int j, k; - int32x4_t E2, O2; - int32x4_t EE, EO; - int32x2_t EEE, EEO; - const int add = 1 << (shift - 1); - const int32x4_t _vadd = {add, 0}; + const int line = 16; - for (j = 0; j < line; j++) + int16x8_t Oline; + int32x4_t EOline; + int32x4_t EEEline; + int32x4_t EEOline; + + for (int i = 0; i < line; i += 2) { - int16x8_t in0 = *(int16x8_t *)src; - int16x8_t in1 = rev16(*(int16x8_t *)&src8); + int16x8_t s0_lo = vld1q_s16(src + i * line); + int16x8_t s0_hi = rev16(vld1q_s16(src + i * line + 8)); - E0 = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1)); - O0 = vsubl_s16(vget_low_s16(in0), vget_low_s16(in1)); - E1 = vaddl_high_s16(in0, in1); - O1 = vsubl_high_s16(in0, in1); + int16x8_t s1_lo = vld1q_s16(src + (i + 1) * line); + int16x8_t s1_hi = rev16(vld1q_s16(src + (i + 1) * line + 8)); - for (k = 1; k < 16; k += 2) - { - int32x4_t c0 = vmovl_s16(*(int16x4_t *)&g_t16k0); - int32x4_t c1 = vmovl_s16(*(int16x4_t *)&g_t16k4); + int32x4_t E02; + E00 = vaddl_s16(vget_low_s16(s0_lo), vget_low_s16(s0_hi)); + E01 = vaddl_s16(vget_high_s16(s0_lo), vget_high_s16(s0_hi)); - int32x4_t res = _vadd; - res = vmlaq_s32(res, c0, O0); - res = vmlaq_s32(res, c1, O1); - dstk * line = (int16_t)(vaddvq_s32(res) >> shift); - } + int32x4_t E12; + E10 = vaddl_s16(vget_low_s16(s1_lo), vget_low_s16(s1_hi)); + E11 = vaddl_s16(vget_high_s16(s1_lo), vget_high_s16(s1_hi)); + + Oi + 0 = vsubq_s16(s0_lo, s0_hi); + Oi + 1 = vsubq_s16(s1_lo, s1_hi); + + int32x4_t EE0 = vaddq_s32(E00, rev32(E01)); + int32x4_t EE1 = vaddq_s32(E10, rev32(E11)); + EOi + 0 = vsubq_s32(E00, rev32(E01)); + EOi + 1 = vsubq_s32(E10, rev32(E11)); + + int32x4_t t0 = vreinterpretq_s32_s64( + vzip1q_s64(vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1))); + int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64(vzip2q_s64( + vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1)))); - /* EE and EO */ - EE = vaddq_s32(E0, rev32(E1)); - EO = vsubq_s32(E0, rev32(E1)); - for (k = 2; k < 16; k += 4) + EEEi / 2 = vaddq_s32(t0, t1); + EEOi / 2 = vsubq_s32(t0, t1); + } +
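The X265_PRAGMA_UNROLL macro introduced at the top of this hunk hides the compiler-specific unroll pragmas behind one name. A small self-contained illustration of the idiom (sumRow16 is just an example function, not part of x265):

#define X265_PRAGMA(text) _Pragma(#text)
#if defined(__clang__)
#define X265_PRAGMA_UNROLL(n) X265_PRAGMA(unroll(n))        // clang: #pragma unroll(n)
#elif defined(__GNUC__)
#define X265_PRAGMA_UNROLL(n) X265_PRAGMA(GCC unroll (n))   // GCC: #pragma GCC unroll (n)
#else
#define X265_PRAGMA_UNROLL(n)                               // other compilers: expands to nothing
#endif

// Example: request unrolling of a short fixed-count loop without breaking
// builds on compilers that do not understand the pragma.
static inline int sumRow16(const short *row)
{
    int sum = 0;
    X265_PRAGMA_UNROLL(4)
    for (int i = 0; i < 16; i++)
        sum += row[i];
    return sum;
}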
View file
x265_3.6.tar.gz/source/common/aarch64/dct-prim.h -> x265_4.0.tar.gz/source/common/aarch64/dct-prim.h
Changed
@@ -6,11 +6,51 @@
 #include "primitives.h"
 #include "contexts.h" // costCoeffNxN_c
 #include "threading.h" // CLZ
+#include <arm_neon.h>
 
 namespace X265_NS
 {
+// First two columns of the 4x4 dct transform matrix, duplicated to 4x4 to allow
+// processing two lines at once.
+const int32_t t8_even[4][4] =
+{
+    { 64,  64, 64,  64 },
+    { 83,  36, 83,  36 },
+    { 64, -64, 64, -64 },
+    { 36, -83, 36, -83 },
+};
+
+const uint8_t rev16_tbl[16] =
+{
+    14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+};
+
+const uint8_t rev32_tbl[16] =
+{
+    12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
+};
+
+static inline int16x8_t rev16(const int16x8_t a)
+{
+    const uint8x16_t tbl = vld1q_u8(rev16_tbl);
+    const int8x16_t a_s8 = vreinterpretq_s8_s16(a);
+
+    return vreinterpretq_s16_s8(vqtbx1q_s8(a_s8, a_s8, tbl));
+}
+
+static inline int32x4_t rev32(const int32x4_t a)
+{
+    const uint8x16_t tbl = vld1q_u8(rev32_tbl);
+    const int8x16_t a_s8 = vreinterpretq_s8_s32(a);
+
+    return vreinterpretq_s32_s8(vqtbx1q_s8(a_s8, a_s8, tbl));
+}
+
 // x265 private namespace
 void setupDCTPrimitives_neon(EncoderPrimitives &p);
+#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
+void setupDCTPrimitives_sve(EncoderPrimitives &p);
+#endif
 };
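The table-driven rev16()/rev32() helpers added above reverse the lane order of a whole vector via a TBL lookup. As a standalone sanity sketch (not part of x265), the same int16 reversal can also be expressed with vrev64q plus vextq, which makes the intended lane order easy to check:

#include <arm_neon.h>
#include <assert.h>

// Reverse all eight int16 lanes; produces the same result as the
// table-lookup rev16() helper above.
static inline int16x8_t rev16_alt(int16x8_t a)
{
    int16x8_t half_rev = vrev64q_s16(a);     // reverse lanes within each 64-bit half
    return vextq_s16(half_rev, half_rev, 4); // then swap the two halves
}

int main(void)
{
    const int16_t in[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    int16_t out[8];
    vst1q_s16(out, rev16_alt(vld1q_s16(in)));
    for (int i = 0; i < 8; i++)
        assert(out[i] == in[7 - i]);         // fully reversed lane order
    return 0;
}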
View file
x265_4.0.tar.gz/source/common/aarch64/dct.S
Added
@@ -0,0 +1,883 @@ +/***************************************************************************** + * Copyright (C) 2021 MulticoreWare, Inc + * + * Authors: Min Chen <min.chen@multicorewareinc.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +// Functions in this file: +// ***** luma_vpp ***** + +#include "asm.S" + +#ifdef __APPLE__ +.section __RODATA,__rodata +#else +.section .rodata +#endif + +.align 4 + +.text +.set idct16_shift_1, 7 +.set idct16_shift_2, 12-(BIT_DEPTH-8) + +.set dct16_shift_1, 3+(BIT_DEPTH-8) +.set dct16_shift_2, 10 + +.align 4 +// NOTE: Hardcoded due to asm syntax issue, don't reorder! +tbl_const_idct_0: + .hword 64, 83, 36, 89, 75, 50, 18, 0 // v0 + .hword 90, 87, 80, 70, 57, 43, 25, 9 // v1 +// .hword 0=64, 1=83, 2=36, 3=89, 4=75, 5=50, 6=18, 7=00 +// .hword 0=90, 1=87, 2=80, 3=70, 4=57, 5=43, 6=25, 7= 9 + + .hword 64, 83, 64, 36 // v0 + .hword 64, 36,-64,-83 + .hword 64,-36,-64, 83 // v1 + .hword 64,-83, 64,-36 + + .hword 89, 75, 50, 18 // v2 + .hword 75,-18,-89,-50 + .hword 50,-89, 18, 75 // v3 + .hword 18,-50, 75,-89 + + .hword 90,+87,+80,+70, +57,+43,+25,+ 9 // v4 + .hword 87,+57, +9,-43, -80,-90,-70,-25 // v5 + .hword 80, +9,-70,-87, -25,+57,+90,+43 // v6 + .hword 70,-43,-87, +9, +90,+25,-80,-57 // v7 + .hword 57,-80,-25,+90, - 9,-87,+43,+70 // v8 + .hword 43,-90,+57,+25, -87,+70,+ 9,-80 // v9 + .hword 25,-70,+90,-80, +43,+ 9,-57,+87 // v16 + .hword 9,-25,+43,-57, +70,-80,+87,-90 // v17 + + .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 // v18 + +tbl_const_dct_0: + // EE + .hword 64,+64,+64,+64 // v16 + .hword 83,+36,-36,-83 // v17 + .hword 64,-64,-64,+64 // v18 + .hword 36,-83,+83,-36 // v19 + + // EO + .hword 89,+75,+50,+18 // v20 + .hword 75,-18,-89,-50 // v21 + .hword 50,-89,+18,+75 // v22 + .hword 18,-50,+75,-89 // v23 + + // O + .hword 90,+87,+80,+70,+57,+43,+25, +9 // v24 + .hword 87,+57, +9,-43,-80,-90,-70,-25 // v25 + .hword 80, +9,-70,-87,-25,+57,+90,+43 // v26 + .hword 70,-43,-87, +9,+90,+25,-80,-57 // v27 + .hword 57,-80,-25,+90, -9,-87,+43,+70 // v28 + .hword 43,-90,+57,+25,-87,+70, +9,-80 // v29 + .hword 25,-70,+90,-80,+43, +9,-57,+87 // v30 + .hword 9,-25,+43,-57,+70,-80,+87,-90 // v31 + + .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 // v0 +// .byte 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9 // v1 + + .word 64, 83, 36, 89, 75, 50, 18, 0 // v0, v1 + .word 90, 87, 80, 70, 57, 43, 25, 9 // v2, v3 + + +// ***** idct 16x16 ***** +// void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride) +function PFX(idct16_neon) +// Register map +// x0 = src +// x1 = dst +// x2 = dstStride +// x8 = tbl_const_idct_0 + + stp d8, d9, 
sp,#-16! + sub sp, sp, #(16*16*2) + + adr x8, tbl_const_idct_0 + ldp q0, q1, x8 + + mov x5, sp + mov w4, #16 + + // Pass1 +5: + ldr d16, x0, #(0*16*2) + ldr d17, x0, #(2*16*2) + ldr d18, x0, #(4*16*2) + ldr d19, x0, #(6*16*2) + ldr d20, x0, #(8*16*2) + ldr d21, x0, #(10*16*2) + ldr d22, x0, #(12*16*2) + ldr d23, x0, #(14*16*2) + +// EEE0 = 64*src0*16+i + 64*src 8*16+i; +// EEE1 = 64*src0*16+i - 64*src 8*16+i; +// EEO0 = 83*src4*16+i + 36*src12*16+i; +// EEO1 = 36*src4*16+i - 83*src12*16+i; + smull v24.4s, v16.4h, v0.h0 // EEE0 = 64*0 + smull v26.4s, v18.4h, v0.h1 // EEO0 = 83*4 + mov v25.16b, v24.16b // EEE1 = 64*0 + smull v27.4s, v18.4h, v0.h2 // EEO1 = 36*4 + +// EO0 = 89*src 2*16+i + 75*src 6*16+i + 50*src10*16+i + 18*src14*16+i; +// EO1 = 75*src 2*16+i - 18*src 6*16+i - 89*src10*16+i - 50*src14*16+i; +// EO2 = 50*src 2*16+i - 89*src 6*16+i + 18*src10*16+i + 75*src14*16+i; +// EO3 = 18*src 2*16+i - 50*src 6*16+i + 75*src10*16+i - 89*src14*16+i; + smull v28.4s, v17.4h, v0.h3 // EO0 = 89*2 + smull v29.4s, v17.4h, v0.h4 // EO1 = 75*2 + smull v30.4s, v17.4h, v0.h5 // EO2 = 50*2 + smull v31.4s, v17.4h, v0.h6 // EO3 = 18*2 + + smlal v28.4s, v19.4h, v0.h4 // EO0 = 89*2+75*6 + smlsl v29.4s, v19.4h, v0.h6 // EO1 = 75*2-18*6 + smlsl v30.4s, v19.4h, v0.h3 // EO2 = 50*2-89*6 + smlsl v31.4s, v19.4h, v0.h5 // EO3 = 18*2-50*6 + + ldr d16, x0, #(1*16*2) + ldr d17, x0, #(3*16*2) + ldr d18, x0, #(5*16*2) + ldr d19, x0, #(7*16*2) + + orr v2.8b, v20.8b, v21.8b + orr v2.8b, v2.8b, v22.8b + orr v2.8b, v2.8b, v23.8b + orr v3.8b, v18.8b, v19.8b + mov x6, v2.d0 + mov x7, v3.d0 + +// O0 = 90*src 1*16+i + 87*src 3*16+i + 80*src 5*16+i + 70*src 7*16+i + 57*src 9*16+i + 43*src11*16+i + 25*src13*16+i + 9*src15*16+i; +// O1 = 87*src 1*16+i + 57*src 3*16+i + 9*src 5*16+i - 43*src 7*16+i - 80*src 9*16+i - 90*src11*16+i - 70*src13*16+i - 25*src15*16+i; +// O2 = 80*src 1*16+i + 9*src 3*16+i - 70*src 5*16+i - 87*src 7*16+i - 25*src 9*16+i + 57*src11*16+i + 90*src13*16+i + 43*src15*16+i; +// O3 = 70*src 1*16+i - 43*src 3*16+i - 87*src 5*16+i + 9*src 7*16+i + 90*src 9*16+i + 25*src11*16+i - 80*src13*16+i - 57*src15*16+i; +// O4 = 57*src 1*16+i - 80*src 3*16+i - 25*src 5*16+i + 90*src 7*16+i - 9*src 9*16+i - 87*src11*16+i + 43*src13*16+i + 70*src15*16+i; +// O5 = 43*src 1*16+i - 90*src 3*16+i + 57*src 5*16+i + 25*src 7*16+i - 87*src 9*16+i + 70*src11*16+i + 9*src13*16+i - 80*src15*16+i; +// O6 = 25*src 1*16+i - 70*src 3*16+i + 90*src 5*16+i - 80*src 7*16+i + 43*src 9*16+i + 9*src11*16+i - 57*src13*16+i + 87*src15*16+i; +// O7 = 9*src 1*16+i - 25*src 3*16+i + 43*src 5*16+i - 57*src 7*16+i + 70*src 9*16+i - 80*src11*16+i + 87*src13*16+i - 90*src15*16+i; + smull v2.4s, v16.4h, v1.h0 // v2 = O0 = 90*1 + smull v3.4s, v16.4h, v1.h1 // v3 = O1 = 87*1 + smull v4.4s, v16.4h, v1.h2 // v4 = O2 = 80*1 + smull v5.4s, v16.4h, v1.h3 // v5 = O3 = 70*1 + smull v6.4s, v16.4h, v1.h4 // v6 = O4 = 57*1 + smull v7.4s, v16.4h, v1.h5 // v7 = O5 = 43*1 + smull v8.4s, v16.4h, v1.h6 // v8 = O6 = 25*1 + smull v9.4s, v16.4h, v1.h7 // v9 = O7 = 9*1 + + smlal v2.4s, v17.4h, v1.h1 // v2 = O0 = 90*1+87*3 + smlal v3.4s, v17.4h, v1.h4 // v3 = O1 = 87*1+57*3 + smlal v4.4s, v17.4h, v1.h7 // v4 = O2 = 80*1+ 9*3 + smlsl v5.4s, v17.4h, v1.h5 // v5 = O3 = 70*1-43*3 + smlsl v6.4s, v17.4h, v1.h2 // v6 = O4 = 57*1-80*3 + smlsl v7.4s, v17.4h, v1.h0 // v7 = O5 = 43*1-90*3 + smlsl v8.4s, v17.4h, v1.h3 // v8 = O6 = 25*1-70*3 + smlsl v9.4s, v17.4h, v1.h6 // v9 = O7 = 9*1-25*3 + + //cmp x7, #0 + //beq 1f + cbz x7, 1f + + smlal v2.4s, v18.4h, v1.h2 // v2 = O0 = 
90*1+87*3+80*5
+ smlal v3.4s, v18.4h, v1.h[7] // v3 = O1 = 87*1+57*3+ 9*5
+ smlsl v4.4s, v18.4h, v1.h[3] // v4 = O2 = 80*1+ 9*3-70*5
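Both passes of the 16x16 transforms in this file end by rounding, shifting and narrowing their 32-bit accumulators back to 16 bits, using the .set shift constants defined at the top (7 for the first inverse pass, 12 - (BIT_DEPTH - 8) for the second). A minimal scalar sketch of that descale step, assuming a saturating narrow as on the SIMD side:

#include <stdint.h>

// Round, shift and clamp one 32-bit accumulator to int16_t, as each
// transform pass above does before storing its intermediate or final rows.
static inline int16_t descale(int32_t sum, int shift)
{
    int32_t v = (sum + (1 << (shift - 1))) >> shift;
    if (v < INT16_MIN) v = INT16_MIN;
    if (v > INT16_MAX) v = INT16_MAX;
    return (int16_t)v;
}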
View file
x265_4.0.tar.gz/source/common/aarch64/filter-neon-dotprod.cpp
Added
@@ -0,0 +1,1131 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "filter-neon-dotprod.h" + +#if !HIGH_BIT_DEPTH +#include "mem-neon.h" +#include <arm_neon.h> + +namespace { +static const uint8_t dotprod_permute_tbl48 = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static const uint8_t dot_prod_merge_block_tbl48 = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter, + const int32x4_t constant, const uint8x16x3_t tbl) +{ + // Transform sample range from uint8_t to int8_t for signed dot product. + int8x16_t samples_s8 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute input samples for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val0); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val1); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val2); + + int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0); + int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0); + dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1); + dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1); + + // Narrow and combine. + int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo), + vmovn_s32(dotprod_hi)); + return vqrshrun_n_s16(dotprod, IF_FILTER_PREC); +} + +void inline init_sample_permute(uint8x8_t *samples, const uint8x16x3_t tbl, + int8x16_t *d) +{ + // Transform sample range from uint8_t to int8_t for signed dot product. 
+ int8x8_t samples_s84; + samples_s80 = vreinterpret_s8_u8(vsub_u8(samples0, vdup_n_u8(128))); + samples_s81 = vreinterpret_s8_u8(vsub_u8(samples1, vdup_n_u8(128))); + samples_s82 = vreinterpret_s8_u8(vsub_u8(samples2, vdup_n_u8(128))); + samples_s83 = vreinterpret_s8_u8(vsub_u8(samples3, vdup_n_u8(128))); + + // Permute input samples for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + d0 = vqtbl1q_s8(vcombine_s8(samples_s80, vdup_n_s8(0)), tbl.val0); + d1 = vqtbl1q_s8(vcombine_s8(samples_s81, vdup_n_s8(0)), tbl.val0); + d2 = vqtbl1q_s8(vcombine_s8(samples_s82, vdup_n_s8(0)), tbl.val0); + d3 = vqtbl1q_s8(vcombine_s8(samples_s83, vdup_n_s8(0)), tbl.val0); +} + +uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter, + const int32x4_t constant, + const uint8x16x3_t tbl, + int8x16_t &perm_samples_0) +{ + // Transform sample range from uint8_t to int8_t for signed dot product. + int8x16_t samples_s8 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute input samples for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // Already in perm_samples_0. + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val1); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val2); + + int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0); + int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0); + dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1); + dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1); + + // Save for re-use in next iteration. + perm_samples_0 = perm_samples_2; + + // Narrow and combine. + int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo), + vmovn_s32(dotprod_hi)); + return vqrshrun_n_s16(dotprod, IF_FILTER_PREC); +} + +int16x4_t inline filter8_4_ps(uint8x16_t samples, const int8x8_t filter, + const uint8x16x3_t tbl) +{ + // Transform sample range from uint8_t to int8_t for signed dot product. + int8x16_t samples_s8 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute input samples for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val0); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val1); + + // Correction accounting for sample range transform cancels to 0. + int32x4_t constant = vdupq_n_s32(0); + int32x4_t dotprod = vdotq_lane_s32(constant, perm_samples_0, filter, 0); + dotprod = vdotq_lane_s32(dotprod, perm_samples_1, filter, 1); + + // Narrow. + return vmovn_s32(dotprod); +} + +int16x8_t inline filter8_8_ps(uint8x16_t samples, const int8x8_t filter, + const uint8x16x3_t tbl) +{ + // Transform sample range from uint8_t to int8_t for signed dot product. + int8x16_t samples_s8 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute input samples for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val0); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val1); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val2); + + // Correction accounting for sample range transform cancels to 0. 
+ int32x4_t constant = vdupq_n_s32(0); + int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0); + int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0); + dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1); + dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1); + + // Narrow and combine. + return vcombine_s16(vmovn_s32(dotprod_lo), vmovn_s32(dotprod_hi)); +} + +int16x8_t inline filter8_8_ps_reuse(uint8x16_t samples, const int8x8_t filter, + const uint8x16x3_t tbl, + int8x16_t &perm_samples_0) +{ + // Transform sample range from uint8_t to int8_t for signed dot product. + int8x16_t samples_s8 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute input samples for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // Already in perm_samples_0. + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val1); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val2); + + // Correction accounting for sample range transform cancels to 0. + int32x4_t constant = vdupq_n_s32(0); + int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0); + int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0); + dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1); + dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1); + + // Save for re-use in next iteration. + perm_samples_0 = perm_samples_2; + + // Narrow and combine. + return vcombine_s16(vmovn_s32(dotprod_lo), vmovn_s32(dotprod_hi)); +} + +uint8x8_t inline filter4_8_pp(uint8x16_t samples, const int8x8_t filter,
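The dot-product kernels above subtract 128 from every unsigned sample so that the signed SDOT instruction can be used. The bias this introduces is a constant, 128 times the sum of the filter taps, so it is simply folded into the accumulator's initial value (the ps variants note that for them it cancels to zero). A scalar sketch of the equivalence, with illustrative function names:

#include <stdint.h>

// Plain 8-tap dot product over unsigned samples.
static int filter8_scalar(const uint8_t *s, const int8_t *c)
{
    int sum = 0;
    for (int i = 0; i < 8; i++)
        sum += s[i] * c[i];
    return sum;
}

// Same result using the range-shift trick from the kernels above: shift the
// samples into int8_t range for the signed dot product, and start the
// accumulator from the known bias instead.
static int filter8_range_shifted(const uint8_t *s, const int8_t *c)
{
    int bias = 0;
    for (int i = 0; i < 8; i++)
        bias += 128 * c[i];                // known at setup time; becomes 'constant'
    int sum = bias;
    for (int i = 0; i < 8; i++)
        sum += (s[i] - 128) * c[i];        // values now fit the signed 8-bit range
    return sum;
}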
View file
x265_4.0.tar.gz/source/common/aarch64/filter-neon-dotprod.h
Added
@@ -0,0 +1,37 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H +#define X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H + +#if defined(HAVE_NEON_DOTPROD) + +#include "primitives.h" + +namespace X265_NS { +void setupFilterPrimitives_neon_dotprod(EncoderPrimitives &p); +} + +#endif // defined(HAVE_NEON_DOTPROD) + +#endif // X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
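setupFilterPrimitives_neon_dotprod() is only declared when HAVE_NEON_DOTPROD is defined, so callers install it conditionally on top of the baseline Neon tables. A hypothetical wiring sketch; the wrapper name, the baseline setup function and the CPU feature bit are illustrative placeholders, not x265's actual identifiers:

#include "primitives.h"
#include "filter-neon-dotprod.h"

namespace X265_NS {
// Assumed baseline AArch64 setup function (expected to live in filter-prim.h).
void setupFilterPrimitives_neon(EncoderPrimitives &p);

enum { CPU_FEATURE_NEON_DOTPROD = 1 << 0 };      // placeholder feature bit

void setupAarch64FilterPrimitives(EncoderPrimitives &p, uint32_t cpuFeatures)
{
    setupFilterPrimitives_neon(p);               // baseline Neon kernels first
#if defined(HAVE_NEON_DOTPROD)
    if (cpuFeatures & CPU_FEATURE_NEON_DOTPROD)
        setupFilterPrimitives_neon_dotprod(p);   // then override the hot paths
#endif
}
}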
View file
x265_4.0.tar.gz/source/common/aarch64/filter-neon-i8mm.cpp
Added
@@ -0,0 +1,1412 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#if defined(HAVE_NEON_I8MM) +#include "filter-neon-i8mm.h" +#if !HIGH_BIT_DEPTH + +#include "mem-neon.h" + +#include <arm_neon.h> + +namespace { +static const uint8_t dotprod_permute_tbl48 = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static const uint8_t matmul_permute_tbl232 = { + // Permute for luma filter 3. + { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9, + 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 }, + // Permute for luma filter 1. + { 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10, + 5, 6, 7, 8, 9, 10, 11, 12, 7, 8, 9, 10, 11, 12, 13, 14 } +}; + +static const int8_t matmul_luma_filter216 = { + { -1, 4, -10, 58, 17, -5, 1, 0, 0, -1, 4, -10, 58, 17, -5, 1 }, + { 1, -5, 17, 58, -10, 4, -1, 0, 0, 1, -5, 17, 58, -10, 4, -1 } +}; + +static const uint8_t dot_prod_merge_block_tbl48 = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter, + const uint8x16x3_t tbl) +{ + // Permute input samples for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val0); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t perm_S2 = vqtbl1q_u8(samples, tbl.val2); + + int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0); + dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1); + int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0); + dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_S2, filter, 1); + + // Narrow and combine. 
+ int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo), + vmovn_s32(dotprod_hi)); + return vqrshrun_n_s16(dotprod, IF_FILTER_PREC); +} + +void inline init_sample_permute(uint8x8_t *samples, const uint8x16x3_t tbl, + uint8x16_t *d) +{ + // Permute input samples for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + d0 = vqtbl1q_u8(vcombine_u8(samples0, vdup_n_u8(0)), tbl.val0); + d1 = vqtbl1q_u8(vcombine_u8(samples1, vdup_n_u8(0)), tbl.val0); + d2 = vqtbl1q_u8(vcombine_u8(samples2, vdup_n_u8(0)), tbl.val0); + d3 = vqtbl1q_u8(vcombine_u8(samples3, vdup_n_u8(0)), tbl.val0); +} + +uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter, + const uint8x16x3_t tbl, uint8x16_t &perm_s0) +{ + // Permute input samples for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // Already in perm_s0. + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val2); + + int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0); + dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1); + int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0); + dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1); + + // Save for re-use in next iteration. + perm_s0 = perm_s2; + + // Narrow and combine. + int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo), + vmovn_s32(dotprod_hi)); + return vqrshrun_n_s16(dotprod, IF_FILTER_PREC); +} + +uint8x8_t inline filter8_8_pp_matmul(uint8x16_t samples, const int8x16_t filter, + const uint8x16x2_t tbl) +{ + // Permute input samples for 8x2 by 2x8 matrix multiply. + uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val0); + uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1); + + int32x4_t matmul_lo = vusmmlaq_s32(vdupq_n_s32(0), perm_s0, filter); + int32x4_t matmul_hi = vusmmlaq_s32(vdupq_n_s32(0), perm_s1, filter); + + // Narrow and combine. + int16x8_t matmul = vcombine_s16(vmovn_s32(matmul_lo), vmovn_s32(matmul_hi)); + return vqrshrun_n_s16(matmul, IF_FILTER_PREC); +} + +int16x4_t inline filter8_4_ps(uint8x16_t samples, const int8x8_t filter, + const int16x8_t constant, const uint8x16x3_t tbl) +{ + // Permute input samples for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val0); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1); + + int32x4_t dotprod = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0); + dotprod = vusdotq_lane_s32(dotprod, perm_s1, filter, 1); + + // Narrow. + return vadd_s16(vmovn_s32(dotprod), vget_low_s16(constant)); +} + +int16x8_t inline filter8_8_ps(uint8x16_t samples, const int8x8_t filter, + const int16x8_t constant, const uint8x16x3_t tbl) +{ + // Permute input samples for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val0); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t perm_S2 = vqtbl1q_u8(samples, tbl.val2); + + int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0); + dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1); + int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0); + dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_S2, filter, 1); + + // Narrow and combine. + int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo), + vmovn_s32(dotprod_hi)); + return vaddq_s16(dotprod, constant); +} + +int16x8_t inline filter8_8_ps_reuse(uint8x16_t samples, const int8x8_t filter, + const int16x8_t constant, + const uint8x16x3_t tbl, uint8x16_t &perm_s0) +{ + // Permute input samples for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // Already in perm_s0. + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val1); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val2); + + int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0); + dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1); + int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0); + dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1); + + // Save for re-use in next iteration. + perm_s0 = perm_s2; + + // Narrow and combine. + int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo), + vmovn_s32(dotprod_hi)); + return vaddq_s16(dotprod, constant); +} + +int16x8_t inline filter8_8_ps_matmul(uint8x16_t samples, const int8x16_t filter,
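The _matmul variants in this file feed two permuted sample windows and a 2x8 copy of the filter into vusmmlaq_s32. A scalar model of what that instruction computes, assuming the standard USMMLA semantics (an unsigned 2x8 matrix times a signed 8x2 matrix, accumulated into a 2x2 block of int32 lanes):

#include <stdint.h>

// Scalar model of USMMLA as used by filter8_8_pp_matmul(): each unsigned
// 8-byte row is dotted with each signed 8-byte filter row, and the four dot
// products accumulate into a 2x2 result (the four int32 lanes of the vector).
static void usmmla_model(int32_t acc[2][2],
                         const uint8_t a[2][8],  // permuted samples
                         const int8_t  b[2][8])  // filter rows
{
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++)
            for (int k = 0; k < 8; k++)
                acc[i][j] += a[i][k] * b[j][k];
}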
View file
x265_4.0.tar.gz/source/common/aarch64/filter-neon-i8mm.h
Added
@@ -0,0 +1,37 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_FILTER_NEON_I8MM_H +#define X265_FILTER_NEON_I8MM_H + +#if defined(HAVE_NEON_I8MM) + +#include "primitives.h" + +namespace X265_NS { +void setupFilterPrimitives_neon_i8mm(EncoderPrimitives &p); +} + +#endif // defined(HAVE_NEON_I8MM) + +#endif // X265_FILTER_NEON_I8MM_H
View file
x265_3.6.tar.gz/source/common/aarch64/filter-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/filter-prim.cpp
Changed
@@ -1,37 +1,2114 @@ #if HAVE_NEON #include "filter-prim.h" +#include "mem-neon.h" + #include <arm_neon.h> -namespace +namespace { +void inline filter4_s16x8(int coeffIdx, const int16x8_t *s, const int16x4_t f, + const int32x4_t c, int32x4_t &d0, int32x4_t &d1) +{ + if (coeffIdx == 4) + { + // { -4, 36, 36, -4 } + int16x8_t t0 = vaddq_s16(s1, s2); + int16x8_t t1 = vaddq_s16(s0, s3); + d0 = vmlal_n_s16(c, vget_low_s16(t0), 36); + d0 = vmlsl_n_s16(d0, vget_low_s16(t1), 4); + + d1 = vmlal_n_s16(c, vget_high_s16(t0), 36); + d1 = vmlsl_n_s16(d1, vget_high_s16(t1), 4); + } + else + { + d0 = vmlal_lane_s16(c, vget_low_s16(s0), f, 0); + d0 = vmlal_lane_s16(d0, vget_low_s16(s1), f, 1); + d0 = vmlal_lane_s16(d0, vget_low_s16(s2), f, 2); + d0 = vmlal_lane_s16(d0, vget_low_s16(s3), f, 3); + + d1 = vmlal_lane_s16(c, vget_high_s16(s0), f, 0); + d1 = vmlal_lane_s16(d1, vget_high_s16(s1), f, 1); + d1 = vmlal_lane_s16(d1, vget_high_s16(s2), f, 2); + d1 = vmlal_lane_s16(d1, vget_high_s16(s3), f, 3); + } +} + +template<int coeffIdx> +void inline filter8_s16x4(const int16x4_t *s, const int32x4_t c, int32x4_t &d) +{ + if (coeffIdx == 1) + { + // { -1, 4, -10, 58, 17, -5, 1, 0 } + d = vsubl_s16(s6, s0); + d = vaddq_s32(d, c); + d = vmlal_n_s16(d, s1, 4); + d = vmlsl_n_s16(d, s2, 10); + d = vmlal_n_s16(d, s3, 58); + d = vmlal_n_s16(d, s4, 17); + d = vmlsl_n_s16(d, s5, 5); + } + else if (coeffIdx == 2) + { + // { -1, 4, -11, 40, 40, -11, 4, -1 } + int32x4_t t0 = vaddl_s16(s3, s4); + int32x4_t t1 = vaddl_s16(s2, s5); + int32x4_t t2 = vaddl_s16(s1, s6); + int32x4_t t3 = vaddl_s16(s0, s7); + + d = vmlaq_n_s32(c, t0, 40); + d = vmlaq_n_s32(d, t1, -11); + d = vmlaq_n_s32(d, t2, 4); + d = vmlaq_n_s32(d, t3, -1); + } + else + { + // { 0, 1, -5, 17, 58, -10, 4, -1 } + d = vsubl_s16(s1, s7); + d = vaddq_s32(d, c); + d = vmlal_n_s16(d, s6, 4); + d = vmlsl_n_s16(d, s5, 10); + d = vmlal_n_s16(d, s4, 58); + d = vmlal_n_s16(d, s3, 17); + d = vmlsl_n_s16(d, s2, 5); + } +} + +template<int coeffIdx> +void inline filter8_s16x8(const int16x8_t *s, const int32x4_t c, int32x4_t &d0, + int32x4_t &d1) +{ + if (coeffIdx == 1) + { + // { -1, 4, -10, 58, 17, -5, 1, 0 } + d0 = vsubl_s16(vget_low_s16(s6), vget_low_s16(s0)); + d0 = vaddq_s32(d0, c); + d0 = vmlal_n_s16(d0, vget_low_s16(s1), 4); + d0 = vmlsl_n_s16(d0, vget_low_s16(s2), 10); + d0 = vmlal_n_s16(d0, vget_low_s16(s3), 58); + d0 = vmlal_n_s16(d0, vget_low_s16(s4), 17); + d0 = vmlsl_n_s16(d0, vget_low_s16(s5), 5); + + d1 = vsubl_s16(vget_high_s16(s6), vget_high_s16(s0)); + d1 = vaddq_s32(d1, c); + d1 = vmlal_n_s16(d1, vget_high_s16(s1), 4); + d1 = vmlsl_n_s16(d1, vget_high_s16(s2), 10); + d1 = vmlal_n_s16(d1, vget_high_s16(s3), 58); + d1 = vmlal_n_s16(d1, vget_high_s16(s4), 17); + d1 = vmlsl_n_s16(d1, vget_high_s16(s5), 5); + } + else if (coeffIdx == 2) + { + // { -1, 4, -11, 40, 40, -11, 4, -1 } + int32x4_t t0 = vaddl_s16(vget_low_s16(s3), vget_low_s16(s4)); + int32x4_t t1 = vaddl_s16(vget_low_s16(s2), vget_low_s16(s5)); + int32x4_t t2 = vaddl_s16(vget_low_s16(s1), vget_low_s16(s6)); + int32x4_t t3 = vaddl_s16(vget_low_s16(s0), vget_low_s16(s7)); + + d0 = vmlaq_n_s32(c, t0, 40); + d0 = vmlaq_n_s32(d0, t1, -11); + d0 = vmlaq_n_s32(d0, t2, 4); + d0 = vmlaq_n_s32(d0, t3, -1); + + int32x4_t t4 = vaddl_s16(vget_high_s16(s3), vget_high_s16(s4)); + int32x4_t t5 = vaddl_s16(vget_high_s16(s2), vget_high_s16(s5)); + int32x4_t t6 = vaddl_s16(vget_high_s16(s1), vget_high_s16(s6)); + int32x4_t t7 = vaddl_s16(vget_high_s16(s0), vget_high_s16(s7)); + + d1 = vmlaq_n_s32(c, t4, 40); + d1 = 
vmlaq_n_s32(d1, t5, -11); + d1 = vmlaq_n_s32(d1, t6, 4); + d1 = vmlaq_n_s32(d1, t7, -1); + } + else + { + // { 0, 1, -5, 17, 58, -10, 4, -1 } + d0 = vsubl_s16(vget_low_s16(s1), vget_low_s16(s7)); + d0 = vaddq_s32(d0, c); + d0 = vmlal_n_s16(d0, vget_low_s16(s6), 4); + d0 = vmlsl_n_s16(d0, vget_low_s16(s5), 10); + d0 = vmlal_n_s16(d0, vget_low_s16(s4), 58); + d0 = vmlal_n_s16(d0, vget_low_s16(s3), 17); + d0 = vmlsl_n_s16(d0, vget_low_s16(s2), 5); + + d1 = vsubl_s16(vget_high_s16(s1), vget_high_s16(s7)); + d1 = vaddq_s32(d1, c); + d1 = vmlal_n_s16(d1, vget_high_s16(s6), 4); + d1 = vmlsl_n_s16(d1, vget_high_s16(s5), 10); + d1 = vmlal_n_s16(d1, vget_high_s16(s4), 58); + d1 = vmlal_n_s16(d1, vget_high_s16(s3), 17); + d1 = vmlsl_n_s16(d1, vget_high_s16(s2), 5); + } +} + +template<int width, int height> +void interp4_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, + intptr_t dstStride, int coeffIdx) +{ + const int N_TAPS = 4; + src -= (N_TAPS / 2 - 1) * srcStride; + + const int16x4_t filter = vld1_s16(X265_NS::g_chromaFiltercoeffIdx); + + // Zero constant in order to use filter helper functions (optimised away). + const int32x4_t c = vdupq_n_s32(0); + + if (width == 12) + { + const int16_t *s = src; + int16_t *d = dst; + + int16x8_t in7; + load_s16x8xn<3>(s, srcStride, in); + s += 3 * srcStride; + + for (int row = 0; (row + 4) <= height; row += 4) + { + load_s16x8xn<4>(s, srcStride, in + 3); + + int32x4_t sum_lo4; + int32x4_t sum_hi4; + filter4_s16x8(coeffIdx, in + 0, filter, c, sum_lo0, sum_hi0); + filter4_s16x8(coeffIdx, in + 1, filter, c, sum_lo1, sum_hi1); + filter4_s16x8(coeffIdx, in + 2, filter, c, sum_lo2, sum_hi2); + filter4_s16x8(coeffIdx, in + 3, filter, c, sum_lo3, sum_hi3); + + int16x8_t sum4; + sum0 = vcombine_s16(vshrn_n_s32(sum_lo0, IF_FILTER_PREC), + vshrn_n_s32(sum_hi0, IF_FILTER_PREC)); + sum1 = vcombine_s16(vshrn_n_s32(sum_lo1, IF_FILTER_PREC), + vshrn_n_s32(sum_hi1, IF_FILTER_PREC)); + sum2 = vcombine_s16(vshrn_n_s32(sum_lo2, IF_FILTER_PREC), + vshrn_n_s32(sum_hi2, IF_FILTER_PREC)); + sum3 = vcombine_s16(vshrn_n_s32(sum_lo3, IF_FILTER_PREC), + vshrn_n_s32(sum_hi3, IF_FILTER_PREC)); + + store_s16x8xn<4>(d, dstStride, sum); + + in0 = in4; + in1 = in5; + in2 = in6; + + s += 4 * srcStride; + d += 4 * dstStride; + } + + src += 8; + dst += 8; + s = src;
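The coeffIdx == 4 branch of filter4_s16x8() at the top of this hunk exploits the symmetry of the { -4, 36, 36, -4 } chroma filter so that only two multiplies are needed per output. The same factoring in scalar form:

#include <stdint.h>

// -4*s0 + 36*s1 + 36*s2 - 4*s3  ==  36*(s1 + s2) - 4*(s0 + s3)
static inline int32_t filter4_symmetric(int16_t s0, int16_t s1,
                                        int16_t s2, int16_t s3)
{
    return 36 * (s1 + s2) - 4 * (s0 + s3);
}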
View file
x265_3.6.tar.gz/source/common/aarch64/fun-decls.h -> x265_4.0.tar.gz/source/common/aarch64/fun-decls.h
Changed
@@ -69,6 +69,24 @@ ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \ ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__) +#define FUNCDEF_PU_MULT_16(ret, name, cpu, ...) \ + ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _16x8_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _16x4_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _32x8_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \ + ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__) + #define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \ FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \ ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \ @@ -113,23 +131,8 @@ FUNCDEF_CHROMA_PU(void, blockcopy_pp, cpu, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \ FUNCDEF_PU(void, blockcopy_sp, cpu, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); \ FUNCDEF_PU(void, blockcopy_ps, cpu, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); \ - FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ - FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \ - FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ - FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \ - FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ - FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \ - FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \ FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \ FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \ - FUNCDEF_CHROMA_PU(void, interp_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ - FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ - FUNCDEF_CHROMA_PU(void, interp_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \ - FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \ - FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ - FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \ - FUNCDEF_CHROMA_PU(void, 
interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ - FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \ FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \ FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \ FUNCDEF_PU(void, pixel_avg_pp, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \ @@ -154,70 +157,74 @@ DECLS(sve); DECLS(sve2); +FUNCDEF_PU_MULT_16(int, pixel_sad, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t); +FUNCDEF_PU_MULT_16(void, sad_x3, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); +FUNCDEF_PU_MULT_16(void, sad_x4, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); +FUNCDEF_PU(sse_t, pixel_sse_pp, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t); -void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift); +void PFX(pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)); -uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride); -uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride); -uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride); -uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride); +uint64_t PFX(pixel_var_8x8_neon(const pixel* pix, intptr_t stride)); +uint64_t PFX(pixel_var_16x16_neon(const pixel* pix, intptr_t stride)); +uint64_t PFX(pixel_var_32x32_neon(const pixel* pix, intptr_t stride)); +uint64_t PFX(pixel_var_64x64_neon(const pixel* pix, intptr_t stride)); -void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); -void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); -void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); -void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); +void PFX(getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)); +void PFX(getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)); +void PFX(getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)); +void PFX(getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)); -void x265_scale1D_128to64_neon(pixel *dst, const pixel *src); -void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride); +void PFX(scale1D_128to64_neon(pixel *dst, const pixel *src)); +void PFX(scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride)); -int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, 
intptr_t stride_pix2); -int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); +int PFX(pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, 
intptr_t stride_pix2)); +int PFX(pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); +int PFX(pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)); -int x265_pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2); -int x265_pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2); 
-int x265_pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2); -int x265_pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2); -int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2); -int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2); -int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2); +int PFX(pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)); +int PFX(pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)); +int PFX(pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)); +int PFX(pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)); +int PFX(pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)); +int PFX(pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)); +int PFX(pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)); uint32_t PFX(quant_neon)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff); uint32_t PFX(nquant_neon)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff); -void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift); -void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift); +void PFX(dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)); +void PFX(dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)); -void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24); +void PFX(ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums24)); int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); int PFX(psyCost_8x8_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); @@ -226,30 +233,28 @@ int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize); uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase); -uint64_t x265_pixel_var_8x8_sve2(const pixel* pix, intptr_t stride); -uint64_t x265_pixel_var_16x16_sve2(const pixel* pix, intptr_t stride); -uint64_t x265_pixel_var_32x32_sve2(const pixel* pix, intptr_t stride); -uint64_t x265_pixel_var_64x64_sve2(const pixel* pix, intptr_t stride); - -void x265_getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); -void x265_getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); +uint64_t PFX(pixel_var_8x8_sve2(const pixel* pix, intptr_t stride)); +uint64_t PFX(pixel_var_16x16_sve2(const pixel* pix, intptr_t stride)); +uint64_t 
PFX(pixel_var_32x32_sve2(const pixel* pix, intptr_t stride)); +uint64_t PFX(pixel_var_64x64_sve2(const pixel* pix, intptr_t stride)); -void x265_scale1D_128to64_sve2(pixel *dst, const pixel *src); -void x265_scale2D_64to32_sve2(pixel* dst, const pixel* src, intptr_t stride); +void PFX(getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)); +void PFX(getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)); -int x265_pixel_satd_4x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2); -int x265_pixel_satd_8x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
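The hunks above replace the hand-written x265_* prototypes with the PFX() macro so that one set of declarations serves every bit-depth build. A rough sketch of the token pasting involved (from memory of source/common/common.h; the exact helper-macro names are an assumption, not part of this diff):

    // Sketch only: PFX() pastes the per-build namespace onto the symbol name.
    #define PFX3(prefix, name) prefix ## _ ## name
    #define PFX2(prefix, name) PFX3(prefix, name)   // extra level forces expansion
    #define PFX(name)          PFX2(X265_NS, name)  // X265_NS: x265, x265_10bit, ...

    // In an 8-bit build:
    //   int PFX(pixel_sa8d_8x8_neon)(const pixel*, intptr_t, const pixel*, intptr_t);
    // expands to
    //   int x265_pixel_sa8d_8x8_neon(const pixel*, intptr_t, const pixel*, intptr_t);
    // while the 10/12-bit libraries get their own prefixed symbols from the same header.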
View file
x265_3.6.tar.gz/source/common/aarch64/intrapred-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/intrapred-prim.cpp
Changed
@@ -2,7 +2,7 @@ #include "primitives.h" -#if 1 +#if HAVE_NEON #include "arm64-utils.h" #include <arm_neon.h> @@ -12,6 +12,52 @@ { +template<int tuSize> +void intraFilter_neon(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */ +{ + const int tuSize2 = tuSize << 1; + pixel topLeft = samples0, topLast = samplestuSize2, leftLast = samplestuSize2 + tuSize2; + + uint16x8_t two_vec = vdupq_n_u16(2); +#if !HIGH_BIT_DEPTH + { + for(int i = 0; i < tuSize2 + tuSize2; i+=8) + { + uint16x8_t sample1 = vmovl_u8(vld1_u8(&samplesi)); + uint16x8_t sample2 = vmovl_u8(vld1_u8(&samplesi-1)); + uint16x8_t sample3 = vmovl_u8(vld1_u8(&samplesi+1)); + + uint16x8_t result1 = vaddq_u16(vshlq_n_u16(sample1,1), sample2 ); + uint16x8_t result2 = vaddq_u16(sample3, two_vec); + uint16x8_t result3 = vaddq_u16(result1,result2); + vst1_u8(&filteredi , vmovn_u16(vshrq_n_u16(result3, 2))); + } + } +#else + { + for(int i = 0; i < tuSize2 + tuSize2; i+=8) + { + uint16x8_t sample1 = vld1q_u16(&samplesi); + uint16x8_t sample2 = vld1q_u16(&samplesi-1); + uint16x8_t sample3 = vld1q_u16(&samplesi+1); + + uint16x8_t result1 = vaddq_u16(vshlq_n_u16(sample1,1), sample2 ); + uint16x8_t result2 = vaddq_u16(sample3, two_vec); + uint16x8_t result3 = vaddq_u16(result1,result2); + vst1q_u16(&filteredi , vshrq_n_u16(result3, 2)); + } + } +#endif + // filtering top + filteredtuSize2 = topLast; + + // filtering top-left + filtered0 = ((topLeft << 1) + samples1 + samplestuSize2 + 1 + 2) >> 2; + + // filtering left + filteredtuSize2 + 1 = ((samplestuSize2 + 1 << 1) + topLeft + samplestuSize2 + 2 + 2) >> 2; + filteredtuSize2 + tuSize2 = leftLast; +} template<int width> void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter) @@ -105,30 +151,42 @@ { if (width >= 8 && sizeof(pixel) == 1) { - const int16x8_t f0 = vdupq_n_s16(32 - fraction); - const int16x8_t f1 = vdupq_n_s16(fraction); + // We have to cast to the 'real' type so that this block + // will compile for both low and high bitdepth. + const uint8_t *ref_u8 = (const uint8_t *)ref + offset; + uint8_t *dst_u8 = (uint8_t *)dst; + + // f0 and f1 are unsigned (fraction is in range 0, 31). + const uint8x8_t f0 = vdup_n_u8(32 - fraction); + const uint8x8_t f1 = vdup_n_u8(fraction); for (int x = 0; x < width; x += 8) { - uint8x8_t in0 = *(uint8x8_t *)&refoffset + x; - uint8x8_t in1 = *(uint8x8_t *)&refoffset + x + 1; - int16x8_t lo = vmlaq_s16(vdupq_n_s16(16), vmovl_u8(in0), f0); - lo = vmlaq_s16(lo, vmovl_u8(in1), f1); - lo = vshrq_n_s16(lo, 5); - *(uint8x8_t *)&dsty * dstStride + x = vmovn_u16(lo); + uint8x8_t in0 = vld1_u8(ref_u8 + x); + uint8x8_t in1 = vld1_u8(ref_u8 + x + 1); + uint16x8_t lo = vmlal_u8(vdupq_n_u16(16), in0, f0); + lo = vmlal_u8(lo, in1, f1); + uint8x8_t res = vshrn_n_u16(lo, 5); + vst1_u8(dst_u8 + y * dstStride + x, res); } } else if (width >= 4 && sizeof(pixel) == 2) { - const int32x4_t f0 = vdupq_n_s32(32 - fraction); - const int32x4_t f1 = vdupq_n_s32(fraction); + // We have to cast to the 'real' type so that this block + // will compile for both low and high bitdepth. + const uint16_t *ref_u16 = (const uint16_t *)ref + offset; + uint16_t *dst_u16 = (uint16_t *)dst; + + // f0 and f1 are unsigned (fraction is in range 0, 31). 
+ const uint16x4_t f0 = vdup_n_u16(32 - fraction); + const uint16x4_t f1 = vdup_n_u16(fraction); for (int x = 0; x < width; x += 4) { - uint16x4_t in0 = *(uint16x4_t *)&refoffset + x; - uint16x4_t in1 = *(uint16x4_t *)&refoffset + x + 1; - int32x4_t lo = vmlaq_s32(vdupq_n_s32(16), vmovl_u16(in0), f0); - lo = vmlaq_s32(lo, vmovl_u16(in1), f1); - lo = vshrq_n_s32(lo, 5); - *(uint16x4_t *)&dsty * dstStride + x = vmovn_u32(lo); + uint16x4_t in0 = vld1_u16(ref_u16 + x); + uint16x4_t in1 = vld1_u16(ref_u16 + x + 1); + uint32x4_t lo = vmlal_u16(vdupq_n_u32(16), in0, f0); + lo = vmlal_u16(lo, in1, f1); + uint16x4_t res = vshrn_n_u32(lo, 5); + vst1_u16(dst_u16 + y * dstStride + x, res); } } else @@ -176,6 +234,7 @@ } } +#endif template<int log2Size> void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) { @@ -220,14 +279,285 @@ } } } + +template<int log2Size> +void planar_pred_neon(pixel * dst, intptr_t dstStride, const pixel * srcPix, int /*dirMode*/, int /*bFilter*/) +{ + const int blkSize = 1 << log2Size; + + const pixel* above = srcPix + 1; + const pixel* left = srcPix + (2 * blkSize + 1); + + switch (blkSize) { + case 8: + { + const uint16_t log2SizePlusOne = log2Size + 1; + uint16x8_t blkSizeVec = vdupq_n_u16(blkSize); + uint16x8_t topRight = vdupq_n_u16(aboveblkSize); + uint16_t bottomLeft = leftblkSize; + uint16x8_t oneVec = vdupq_n_u16(1); + uint16x8_t blkSizeSubOneVec = vdupq_n_u16(blkSize - 1); + + for (int y = 0; y < blkSize; y++) { + // (blkSize - 1 - y) + uint16x8_t vlkSizeYVec = vdupq_n_u16(blkSize - 1 - y); + // (y + 1) * bottomLeft + uint16x8_t bottomLeftYVec = vdupq_n_u16((y + 1) * bottomLeft); + // lefty + uint16x8_t leftYVec = vdupq_n_u16(lefty); + + for (int x = 0; x < blkSize; x += 8) { + int idx = y * dstStride + x; + uint16x8_t xvec = { (uint16_t)(x + 0), (uint16_t)(x + 1), + (uint16_t)(x + 2), (uint16_t)(x + 3), + (uint16_t)(x + 4), (uint16_t)(x + 5), + (uint16_t)(x + 6), (uint16_t)(x + 7) }; + + // (blkSize - 1 - y) * abovex + uint16x8_t aboveVec = { (uint16_t)(abovex + 0), + (uint16_t)(abovex + 1), + (uint16_t)(abovex + 2), + (uint16_t)(abovex + 3), + (uint16_t)(abovex + 4), + (uint16_t)(abovex + 5), + (uint16_t)(abovex + 6), + (uint16_t)(abovex + 7) }; + + aboveVec = vmulq_u16(aboveVec, vlkSizeYVec); + + // (blkSize - 1 - x) * lefty + uint16x8_t first = vsubq_u16(blkSizeSubOneVec, xvec); + first = vmulq_u16(first, leftYVec); + + // (x + 1) * topRight + uint16x8_t second = vaddq_u16(xvec, oneVec); + second = vmulq_u16(second, topRight); + + uint16x8_t resVec = vaddq_u16(first, second); + resVec = vaddq_u16(resVec, aboveVec); + resVec = vaddq_u16(resVec, bottomLeftYVec); + resVec = vaddq_u16(resVec, blkSizeVec); + resVec = vshrq_n_u16(resVec, log2SizePlusOne); + + for (int i = 0; i < 8; i++) + dstidx + i = (pixel)resVeci; + } +} + } + break; + case 4:
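The planar_pred_neon specializations added here vectorize HEVC planar intra prediction. As a reference for what each lane computes, a scalar sketch of the same formula (it also appears in the comments of the new intrapred.S below); pixel is x265's 8- or 16-bit sample type:

    template<int log2Size>
    void planar_pred_ref(pixel* dst, intptr_t dstStride, const pixel* srcPix)
    {
        const int blkSize = 1 << log2Size;
        const pixel* above = srcPix + 1;                // top reference row
        const pixel* left  = srcPix + 2 * blkSize + 1;  // left reference column
        const pixel topRight   = above[blkSize];
        const pixel bottomLeft = left[blkSize];

        for (int y = 0; y < blkSize; y++)
            for (int x = 0; x < blkSize; x++)
                dst[y * dstStride + x] = (pixel)(((blkSize - 1 - x) * left[y] +
                                                  (blkSize - 1 - y) * above[x] +
                                                  (x + 1) * topRight +
                                                  (y + 1) * bottomLeft +
                                                  blkSize) >> (log2Size + 1));
    }

The case 8 block above evaluates exactly this expression for eight x positions at a time in uint16x8_t arithmetic before narrowing the result back to pixels.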
View file
x265_4.0.tar.gz/source/common/aarch64/intrapred.S
Added
@@ -0,0 +1,171 @@ +/***************************************************************************** + * Copyright (C) 2021 MulticoreWare, Inc + * + * Authors: Min Chen <min.chen@multicorewareinc.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +// Functions in this file: +// ***** luma_vpp ***** + +#include "asm.S" + +#ifdef __APPLE__ +.section __RODATA,__rodata +#else +.section .rodata +#endif + +.align 4 + +.text + +.align 4 +tbl_const_1to8_7to0: + .byte 1, 2, 3, 4, 5, 6, 7, 8 + .byte 7, 6, 5, 4, 3, 2, 1, 0 + .byte 9, 10, 11, 12, 13, 14, 15, 16 + .byte 15, 14, 13, 12, 11, 10, 9, 8 + +// ***** planar_pred ***** +// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/) +function PFX(intra_pred_planar8_neon) +// Register map +// x0 = dst +// x1 = dstStride +// x2 = *srcPix +// x3 = leftx +// x4 = tmp +// v0 = above7:0 +// v1 = left7:0 +// v2 = topRight = rep(aboveblkSize) +// v3 = bottomLeft = rep(leftblkSize) +// v4 = const8 7 6 5 4 3 2 1 +// v5 = const7 6 5 4 3 2 1 0 + +//{ +// const int blkSize = 1 << log2Size; +// const pixel* above = srcPix + 1; +// const pixel* left = srcPix + (2 * blkSize + 1); +// pixel topRight = aboveblkSize; +// pixel bottomLeft = leftblkSize; +// for (int y = 0; y < blkSize; y++) +// for (int x = 0; x < blkSize; x++) +// dsty * dstStride + x = (pixel) (((blkSize - 1 - x) * lefty + (blkSize - 1 -y) * abovex + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1)); +//} + + ldurb w3, x2, #(1+8) // topRight + ldurb w4, x2, #(2*8+1+8) // bottomLeft + dup v2.8b, w3 // v2 = topRight_b + dup v3.8h, w4 // v3 = bottomLeft_h + ldr x3, x2, #(2*8+1) // x3 = leftx_b + ldr d0, x2, #1 // v0 = abovex_b + + adr x4, tbl_const_1to8_7to0 + ldr d4, x4 // v4 = const_b8 7 6 5 4 3 2 1 + ldr d5, x4, #8 // v5 = const_b7 6 5 4 3 2 1 0 + + ushll v6.8h, v0.8b, #3 // v6 = 8 * abovex + usubw v0.8h, v3.8h, v0.8b // v0 = bottomLeft - abovex + + umlal v6.8h, v4.8b, v2.8b // v6 = 8 * abovex + (x + 1) * topRight + + mov w4, #8 + +1: + dup v1.8b, w3 + lsr x3, x3, #8 + add v6.8h, v6.8h, v0.8h // v6 = (blkSize - 1 -y=0) * abovex + (x + 1) * topRight + (y=0 + 1) * bottomLeft + mov v3.16b, v6.16b + umlal v3.8h, v5.8b, v1.8b // v3 = (blkSize - 1 - x) * lefty=0 + (blkSize - 1 -y=0) * abovex + (x + 1) * topRight + (y=0 + 1) * bottomLeft + rshrn v3.8b, v3.8h, #4 + sub w4, w4, #1 + st1 {v3.8b}, x0, x1 + cbnz w4, 1b + + ret +endfunc + +// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/) +function PFX(intra_pred_planar16_neon) +// Register map +// x0 = dst +// x1 = dstStride +// x2 = *srcPix +// x3 = 
leftx +// x4 = tmp +// v0 = above7:0 +// v1 = left7:0 +// v2 = topRight = rep(aboveblkSize) +// v3 = bottomLeft = rep(leftblkSize) +// v4 = const16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 +// v5 = const15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + +//{ +// const int blkSize = 1 << log2Size; +// const pixel* above = srcPix + 1; +// const pixel* left = srcPix + (2 * blkSize + 1); +// pixel topRight = aboveblkSize; +// pixel bottomLeft = leftblkSize; +// for (int y = 0; y < blkSize; y++) +// for (int x = 0; x < blkSize; x++) +// dsty * dstStride + x = (pixel) (((blkSize - 1 - x) * lefty + (blkSize - 1 -y) * abovex + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1)); +//} + + ldurb w3, x2, #(1+16) // topRight + ldurb w4, x2, #(2*16+1+16) // bottomLeft + ldr q0, x2, #(2*16+1) // v0 = leftx_b + ldr q1, x2, #1 // v1 = abovex_b + dup v2.16b, w3 // v2 = topRight_b + dup v3.8h, w4 // v3 = bottomLeft_h + + adr x4, tbl_const_1to8_7to0 + ld2 {v4.2d, v5.2d}, x4 // v4 = const_b16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 + ext v5.16b, v5.16b, v5.16b, #8 // v5 = const_b15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + + ushll v16.8h, v1.8b, #4 // v16,v17 = 16 * abovex + ushll2 v17.8h, v1.16b, #4 + usubw v6.8h, v3.8h, v1.8b // v6,v7 = bottomLeft - abovex + usubw2 v7.8h, v3.8h, v1.16b + + umlal v16.8h, v4.8b, v2.8b // v16,v17 = 16 * abovex + (x + 1) * topRight + umlal2 v17.8h, v4.16b, v2.16b + + mov w4, #16 + +1: + dup v1.16b, v0.b0 // v1 = leftx_b + ext v0.16b, v0.16b, v0.16b, #1 + + add v16.8h, v16.8h, v6.8h // v16,v17 = (blkSize - 1 -y=0) * abovex + (x + 1) * topRight + (y=0 + 1) * bottomLeft + add v17.8h, v17.8h, v7.8h + + mov v18.16b, v16.16b + mov v19.16b, v17.16b + + umlal v18.8h, v5.8b, v1.8b // v3 = (blkSize - 1 - x) * lefty=0 + (blkSize - 1 -y=0) * abovex + (x + 1) * topRight + (y=0 + 1) * bottomLeft + umlal2 v19.8h, v5.16b, v1.16b + rshrn v18.8b, v18.8h, #5 + rshrn2 v18.16b, v19.8h, #5 + st1 {v18.16b}, x0, x1 + sub w4, w4, #1 + cbnz w4, 1b + + ret +endfunc
View file
x265_3.6.tar.gz/source/common/aarch64/loopfilter-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/loopfilter-prim.cpp
Changed
@@ -1,3 +1,4 @@ +#include "common.h" #include "loopfilter-prim.h" #define PIXEL_MIN 0 @@ -11,15 +12,10 @@ { -/* get the sign of input variable (TODO: this is a dup, make common) */ -static inline int8_t signOf(int x) -{ - return (x >> 31) | ((int)((((uint32_t) - x)) >> 31)); -} - static inline int8x8_t sign_diff_neon(const uint8x8_t in0, const uint8x8_t in1) { - int16x8_t in = vsubl_u8(in0, in1); + int16x8_t in = vreinterpretq_s16_u16(vsubl_u8(in0, in1)); + return vmovn_s16(vmaxq_s16(vminq_s16(in, vdupq_n_s16(1)), vdupq_n_s16(-1))); } @@ -28,12 +24,13 @@ int x = 0; for (; (x + 8) <= endX; x += 8) { - *(int8x8_t *)&dstx = sign_diff_neon(*(uint8x8_t *)&src1x, *(uint8x8_t *)&src2x); + int8x8_t sign = sign_diff_neon(vld1_u8(src1 + x), vld1_u8(src2 + x)); + vst1_s8(dst + x, sign); } for (; x < endX; x++) { - dstx = signOf(src1x - src2x); + dstx = x265_signOf(src1x - src2x); } } @@ -56,21 +53,20 @@ int8x8x2_t shifter; shifter.val10 = signLeft0; static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6}; - int8x8_t tbl = *(int8x8_t *)offsetEo; + int8x8_t tbl = vld1_s8(offsetEo); for (; (x + 8) <= width; x += 8) { - uint8x8_t in = *(uint8x8_t *)&recx; - vsignRight = sign_diff_neon(in, *(uint8x8_t *)&recx + 1); + uint8x8_t in = vld1_u8(rec + x); + vsignRight = sign_diff_neon(in, vld1_u8(rec + x + 1)); shifter.val0 = vneg_s8(vsignRight); int8x8_t tmp = shifter.val0; int8x8_t edge = vtbl2_s8(shifter, index); int8x8_t vedgeType = vadd_s8(vadd_s8(vsignRight, edge), vdup_n_s8(2)); shifter.val10 = tmp7; int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType)); - t1 = vaddw_u8(t1, in); - t1 = vmaxq_s16(t1, vdupq_n_s16(0)); - t1 = vminq_s16(t1, vdupq_n_s16(255)); - *(uint8x8_t *)&recx = vmovn_u16(t1); + t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1), + in)); + vst1_u8(rec + x, vqmovun_s16(t1)); } signLeft0 = shifter.val10; } @@ -93,22 +89,26 @@ if (width >= 8) { - int8x8_t tbl = *(int8x8_t *)offsetEo; + int8x8_t tbl = vld1_s8(offsetEo); + const int8x8_t c = vdup_n_s8(2); + for (; (x + 8) <= width; x += 8) { - uint8x8_t in0 = *(uint8x8_t *)&recx; - uint8x8_t in1 = *(uint8x8_t *)&recx + stride; + uint8x8_t in0 = vld1_u8(rec + x); + uint8x8_t in1 = vld1_u8(rec + x + stride); int8x8_t vsignDown = sign_diff_neon(in0, in1); - int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1x), vdup_n_s8(2)); - *(int8x8_t *)&upBuff1x = vneg_s8(vsignDown); + int8x8_t vsignUp = vld1_s8(upBuff1 + x); + int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c); + vst1_s8(upBuff1 + x, vneg_s8(vsignDown)); int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType)); - t1 = vaddw_u8(t1, in0); - *(uint8x8_t *)&recx = vqmovun_s16(t1); + t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1), + in0)); + vst1_u8(rec + x, vqmovun_s16(t1)); } } for (; x < width; x++) { - signDown = signOf(recx - recx + stride); + signDown = x265_signOf(recx - recx + stride); edgeType = signDown + upBuff1x + 2; upBuff1x = -signDown; recx = x265_clip(recx + offsetEoedgeType); @@ -126,25 +126,26 @@ int x = 0; if (width >= 8) { - int8x8_t tbl = *(int8x8_t *)offsetEo; + int8x8_t tbl = vld1_s8(offsetEo); + const int8x8_t c = vdup_n_s8(2); + for (; (x + 8) <= width; x += 8) { - uint8x8_t in0 = *(uint8x8_t *)&recx; - uint8x8_t in1 = *(uint8x8_t *)&recx + stride; + uint8x8_t in0 = vld1_u8(rec + x); + uint8x8_t in1 = vld1_u8(rec + x + stride); int8x8_t vsignDown = sign_diff_neon(in0, in1); - int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1x), vdup_n_s8(2)); - *(int8x8_t *)&upBuff1x = vneg_s8(vsignDown); + int8x8_t vsignUp = 
vld1_s8(upBuff1 + x); + int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c); + vst1_s8(upBuff1 + x, vneg_s8(vsignDown)); int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType)); - t1 = vaddw_u8(t1, in0); - t1 = vmaxq_s16(t1, vdupq_n_s16(0)); - t1 = vminq_s16(t1, vdupq_n_s16(255)); - *(uint8x8_t *)&recx = vmovn_u16(t1); - + t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1), + in0)); + vst1_u8(rec + x, vqmovun_s16(t1)); } } for (; x < width; x++) { - signDown = signOf(recx - recx + stride); + signDown = x265_signOf(recx - recx + stride); edgeType = signDown + upBuff1x + 2; upBuff1x = -signDown; recx = x265_clip(recx + offsetEoedgeType); @@ -157,11 +158,11 @@ { int x; - if (abs(buff1 - bufft) < 16) + if (abs(static_cast<int>(buff1 - bufft)) < 16) { for (x = 0; x < width; x++) { - int8_t signDown = signOf(recx - recx + stride + 1); + int8_t signDown = x265_signOf(recx - recx + stride + 1); int edgeType = signDown + buff1x + 2; bufftx + 1 = -signDown; recx = x265_clip(recx + offsetEoedgeType);; @@ -169,24 +170,26 @@ } else { - int8x8_t tbl = *(int8x8_t *)offsetEo; + int8x8_t tbl = vld1_s8(offsetEo); + const int8x8_t c = vdup_n_s8(2); + x = 0; for (; (x + 8) <= width; x += 8) { - uint8x8_t in0 = *(uint8x8_t *)&recx; - uint8x8_t in1 = *(uint8x8_t *)&recx + stride + 1; + uint8x8_t in0 = vld1_u8(rec + x); + uint8x8_t in1 = vld1_u8(rec + x + stride + 1); int8x8_t vsignDown = sign_diff_neon(in0, in1); - int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1x), vdup_n_s8(2)); - *(int8x8_t *)&bufftx + 1 = vneg_s8(vsignDown); + int8x8_t vsignUp = vld1_s8(buff1 + x); + int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c); + vst1_s8(bufft + x + 1, vneg_s8(vsignDown)); int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType)); - t1 = vaddw_u8(t1, in0); - t1 = vmaxq_s16(t1, vdupq_n_s16(0)); - t1 = vminq_s16(t1, vdupq_n_s16(255)); - *(uint8x8_t *)&recx = vmovn_u16(t1); + t1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(t1), + in0)); + vst1_u8(rec + x, vqmovun_s16(t1)); } for (; x < width; x++) { - int8_t signDown = signOf(recx - recx + stride + 1); + int8_t signDown = x265_signOf(recx - recx + stride + 1); int edgeType = signDown + buff1x + 2; bufftx + 1 = -signDown; recx = x265_clip(recx + offsetEoedgeType);; @@ -200,26 +203,25 @@ { int8_t signDown; int8_t edgeType; - int8x8_t tbl = *(int8x8_t *)offsetEo; + int8x8_t tbl = vld1_s8(offsetEo); + const int8x8_t c = vdup_n_s8(2); int x = startX + 1;
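Besides switching to explicit vld1/vst1 loads and stores, this patch drops the file-local signOf() duplicate in favour of the shared x265_signOf() (hence the new common.h include). The branchless form being removed is worth spelling out, since sign_diff_neon() mirrors it vector-wide; a small reference sketch:

    /* Same expression as the deleted local helper: returns -1, 0 or +1. */
    static inline int8_t signOf_ref(int x)
    {
        /* x >> 31              -> -1 when x < 0, 0 otherwise (arithmetic shift)
         * (uint32_t)(-x) >> 31 ->  1 when x > 0, 0 otherwise                   */
        return (int8_t)((x >> 31) | (int)(((uint32_t)-x) >> 31));
    }
    /* signOf_ref(-7) == -1, signOf_ref(0) == 0, signOf_ref(7) == 1.
     * sign_diff_neon() gets the same -1/0/+1 per lane by clamping the widened
     * difference of two uint8x8_t inputs to the range [-1, 1]. */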
View file
x265_3.6.tar.gz/source/common/aarch64/mc-a-sve2.S -> x265_4.0.tar.gz/source/common/aarch64/mc-a-sve2.S
Changed
@@ -219,7 +219,7 @@ mov x11, #0 whilelt p0.b, x11, x10 mov w12, #8 -.loop_gt_32_pixel_avg_pp_48x64: +.Loop_gt_32_pixel_avg_pp_48x64: sub w12, w12, #1 .rept 8 ld1b {z0.b}, p0/z, x2 @@ -230,7 +230,7 @@ st1b {z0.b}, p0, x0 add x0, x0, x1 .endr - cbnz w12, .loop_gt_32_pixel_avg_pp_48x64 + cbnz w12, .Loop_gt_32_pixel_avg_pp_48x64 ret endfunc @@ -339,7 +339,7 @@ mov w12, #\h / 2 ptrue p0.b, vl16 ptrue p2.h, vl6 -.loop_sve2_addavg_6x\h\(): +.Loop_sve2_addavg_6x\h\(): sub w12, w12, #1 ld1b {z0.b}, p0/z, x0 ld1b {z1.b}, p0/z, x1 @@ -359,7 +359,7 @@ add x2, x2, x5 st1b {z2.h}, p2, x2 add x2, x2, x5 - cbnz w12, .loop_sve2_addavg_6x\h + cbnz w12, .Loop_sve2_addavg_6x\h ret endfunc .endm @@ -398,7 +398,7 @@ function PFX(addAvg_8x\h\()_sve2) mov w12, #\h / 2 ptrue p0.b, vl16 -.loop_sve2_addavg_8x\h\(): +.Loop_sve2_addavg_8x\h\(): sub w12, w12, #1 ld1b {z0.b}, p0/z, x0 ld1b {z1.b}, p0/z, x1 @@ -418,7 +418,7 @@ add x2, x2, x5 st1b {z2.h}, p0, x2 add x2, x2, x5 - cbnz w12, .loop_sve2_addavg_8x\h + cbnz w12, .Loop_sve2_addavg_8x\h ret endfunc .endm @@ -440,7 +440,7 @@ bgt .vl_gt_16_addAvg_12x\h ptrue p0.b, vl16 ptrue p1.b, vl8 -.loop_sve2_addavg_12x\h\(): +.Loop_sve2_addavg_12x\h\(): sub w12, w12, #1 ld1b {z0.b}, p0/z, x0 ld1b {z1.b}, p0/z, x1 @@ -457,13 +457,13 @@ st1b {z0.h}, p0, x2 st1b {z2.h}, p1, x2, #1, mul vl add x2, x2, x5 - cbnz w12, .loop_sve2_addavg_12x\h + cbnz w12, .Loop_sve2_addavg_12x\h ret .vl_gt_16_addAvg_12x\h\(): mov x10, #24 mov x11, #0 whilelt p0.b, x11, x10 -.loop_sve2_gt_16_addavg_12x\h\(): +.Loop_sve2_gt_16_addavg_12x\h\(): sub w12, w12, #1 ld1b {z0.b}, p0/z, x0 ld1b {z1.b}, p0/z, x1 @@ -476,7 +476,7 @@ add z2.b, z2.b, #0x80 st1b {z0.h}, p0, x2 add x2, x2, x5 - cbnz w12, .loop_sve2_gt_16_addavg_12x\h + cbnz w12, .Loop_sve2_gt_16_addavg_12x\h ret endfunc .endm @@ -491,7 +491,7 @@ cmp x9, #16 bgt .vl_gt_16_addAvg_16x\h ptrue p0.b, vl16 -.loop_eq_16_sve2_addavg_16x\h\(): +.Loop_eq_16_sve2_addavg_16x\h\(): sub w12, w12, #1 ld1b {z0.b}, p0/z, x0 ld1b {z1.b}, p0/z, x1 @@ -508,13 +508,13 @@ st1b {z0.h}, p0, x2 st1b {z2.h}, p0, x2, #1, mul vl add x2, x2, x5 - cbnz w12, .loop_eq_16_sve2_addavg_16x\h + cbnz w12, .Loop_eq_16_sve2_addavg_16x\h ret .vl_gt_16_addAvg_16x\h\(): cmp x9, #32 bgt .vl_gt_32_addAvg_16x\h ptrue p0.b, vl32 -.loop_gt_16_sve2_addavg_16x\h\(): +.Loop_gt_16_sve2_addavg_16x\h\(): sub w12, w12, #1 ld1b {z0.b}, p0/z, x0 ld1b {z1.b}, p0/z, x1 @@ -525,13 +525,13 @@ add z0.b, z0.b, #0x80 st1b {z0.h}, p1, x2 add x2, x2, x5 - cbnz w12, .loop_gt_16_sve2_addavg_16x\h + cbnz w12, .Loop_gt_16_sve2_addavg_16x\h ret .vl_gt_32_addAvg_16x\h\(): mov x10, #48 mov x11, #0 whilelt p0.b, x11, x10 -.loop_gt_32_sve2_addavg_16x\h\(): +.Loop_gt_32_sve2_addavg_16x\h\(): sub w12, w12, #1 ld1b {z0.b}, p0/z, x0 add x0, x0, x3, lsl #1 @@ -541,7 +541,7 @@ add z0.b, z0.b, #0x80 st1b {z0.h}, p0, x2 add x2, x2, x5 - cbnz w12, .loop_gt_32_sve2_addavg_16x\h + cbnz w12, .Loop_gt_32_sve2_addavg_16x\h ret endfunc .endm @@ -561,7 +561,7 @@ cmp x9, #16 bgt .vl_gt_16_addAvg_24x\h addAvg_start -.loop_eq_16_sve2_addavg_24x\h\(): +.Loop_eq_16_sve2_addavg_24x\h\(): sub w12, w12, #1 ld1 {v0.16b-v2.16b}, x0, x3 ld1 {v3.16b-v5.16b}, x1, x4 @@ -572,14 +572,14 @@ sqxtun v1.8b, v1.8h sqxtun v2.8b, v2.8h st1 {v0.8b-v2.8b}, x2, x5 - cbnz w12, .loop_eq_16_sve2_addavg_24x\h + cbnz w12, .Loop_eq_16_sve2_addavg_24x\h ret .vl_gt_16_addAvg_24x\h\(): cmp x9, #48 bgt .vl_gt_48_addAvg_24x\h ptrue p0.b, vl32 ptrue p1.b, vl16 -.loop_gt_16_sve2_addavg_24x\h\(): +.Loop_gt_16_sve2_addavg_24x\h\(): sub w12, w12, #1 ld1b {z0.b}, p0/z, x0 ld1b 
{z1.b}, p1/z, x0, #1, mul vl @@ -596,13 +596,13 @@ st1b {z0.h}, p0, x2 st1b {z1.h}, p1, x2, #1, mul vl add x2, x2, x5 - cbnz w12, .loop_gt_16_sve2_addavg_24x\h + cbnz w12, .Loop_gt_16_sve2_addavg_24x\h ret .vl_gt_48_addAvg_24x\h\(): mov x10, #48 mov x11, #0 whilelt p0.b, x11, x10 -.loop_gt_48_sve2_addavg_24x\h\(): +.Loop_gt_48_sve2_addavg_24x\h\(): sub w12, w12, #1 ld1b {z0.b}, p0/z, x0 ld1b {z2.b}, p0/z, x1 @@ -613,7 +613,7 @@ add z0.b, z0.b, #0x80 st1b {z0.h}, p0, x2 add x2, x2, x5 - cbnz w12, .loop_gt_48_sve2_addavg_24x\h + cbnz w12, .Loop_gt_48_sve2_addavg_24x\h ret endfunc .endm @@ -628,7 +628,7 @@ cmp x9, #16 bgt .vl_gt_16_addAvg_32x\h ptrue p0.b, vl16 -.loop_eq_16_sve2_addavg_32x\h\(): +.Loop_eq_16_sve2_addavg_32x\h\(): sub w12, w12, #1 ld1b {z0.b}, p0/z, x0 ld1b {z1.b}, p0/z, x0, #1, mul vl @@ -657,13 +657,13 @@ st1b {z2.h}, p0, x2, #2, mul vl
View file
x265_3.6.tar.gz/source/common/aarch64/mc-a.S -> x265_4.0.tar.gz/source/common/aarch64/mc-a.S
Changed
@@ -283,7 +283,7 @@ addAvg_start mov w12, #\h / 2 sub x5, x5, #4 -.loop_addavg_6x\h: +.Loop_addavg_6x\h: sub w12, w12, #1 ld1 {v0.16b}, x0, x3 ld1 {v1.16b}, x1, x4 @@ -305,7 +305,7 @@ st1 {v0.h}2, x2, x5 str s1, x2, #4 st1 {v1.h}2, x2, x5 - cbnz w12, .loop_addavg_6x\h + cbnz w12, .Loop_addavg_6x\h ret endfunc .endm @@ -344,7 +344,7 @@ function PFX(addAvg_8x\h\()_neon) addAvg_start mov w12, #\h / 2 -.loop_addavg_8x\h: +.Loop_addavg_8x\h: sub w12, w12, #1 ld1 {v0.16b}, x0, x3 ld1 {v1.16b}, x1, x4 @@ -364,7 +364,7 @@ sqxtun v1.8b, v1.8h st1 {v0.8b}, x2, x5 st1 {v1.8b}, x2, x5 - cbnz w12, .loop_addavg_8x\h + cbnz w12, .Loop_addavg_8x\h ret endfunc .endm @@ -385,7 +385,7 @@ sub x4, x4, #16 sub x5, x5, #8 mov w12, #\h -.loop_addAvg_12X\h\(): +.Loop_addAvg_12X\h\(): sub w12, w12, #1 ld1 {v0.16b}, x0, #16 ld1 {v1.16b}, x1, #16 @@ -403,7 +403,7 @@ sqxtun v1.8b, v1.8h st1 {v0.8b}, x2, #8 st1 {v1.s}0, x2, x5 - cbnz w12, .loop_addAvg_12X\h + cbnz w12, .Loop_addAvg_12X\h ret endfunc .endm @@ -415,7 +415,7 @@ function PFX(addAvg_16x\h\()_neon) addAvg_start mov w12, #\h -.loop_addavg_16x\h: +.Loop_addavg_16x\h: sub w12, w12, #1 ld1 {v0.8h-v1.8h}, x0, x3 ld1 {v2.8h-v3.8h}, x1, x4 @@ -424,7 +424,7 @@ sqxtun v0.8b, v0.8h sqxtun2 v0.16b, v1.8h st1 {v0.16b}, x2, x5 - cbnz w12, .loop_addavg_16x\h + cbnz w12, .Loop_addavg_16x\h ret endfunc .endm @@ -441,7 +441,7 @@ function PFX(addAvg_24x\h\()_neon) addAvg_start mov w12, #\h -.loop_addavg_24x\h\(): +.Loop_addavg_24x\h\(): sub w12, w12, #1 ld1 {v0.16b-v2.16b}, x0, x3 ld1 {v3.16b-v5.16b}, x1, x4 @@ -452,7 +452,7 @@ sqxtun v1.8b, v1.8h sqxtun v2.8b, v2.8h st1 {v0.8b-v2.8b}, x2, x5 - cbnz w12, .loop_addavg_24x\h + cbnz w12, .Loop_addavg_24x\h ret endfunc .endm @@ -464,7 +464,7 @@ function PFX(addAvg_32x\h\()_neon) addAvg_start mov w12, #\h -.loop_addavg_32x\h\(): +.Loop_addavg_32x\h\(): sub w12, w12, #1 ld1 {v0.8h-v3.8h}, x0, x3 ld1 {v4.8h-v7.8h}, x1, x4 @@ -477,7 +477,7 @@ sqxtun v2.8b, v2.8h sqxtun v3.8b, v3.8h st1 {v0.8b-v3.8b}, x2, x5 - cbnz w12, .loop_addavg_32x\h + cbnz w12, .Loop_addavg_32x\h ret endfunc .endm @@ -494,7 +494,7 @@ sub x3, x3, #64 sub x4, x4, #64 mov w12, #64 -.loop_addavg_48x64: +.Loop_addavg_48x64: sub w12, w12, #1 ld1 {v0.8h-v3.8h}, x0, #64 ld1 {v4.8h-v7.8h}, x1, #64 @@ -513,7 +513,7 @@ sqxtun v2.8b, v20.8h sqxtun2 v2.16b, v21.8h st1 {v0.16b-v2.16b}, x2, x5 - cbnz w12, .loop_addavg_48x64 + cbnz w12, .Loop_addavg_48x64 ret endfunc @@ -523,7 +523,7 @@ mov w12, #\h sub x3, x3, #64 sub x4, x4, #64 -.loop_addavg_64x\h\(): +.Loop_addavg_64x\h\(): sub w12, w12, #1 ld1 {v0.8h-v3.8h}, x0, #64 ld1 {v4.8h-v7.8h}, x1, #64 @@ -546,7 +546,7 @@ sqxtun v3.8b, v22.8h sqxtun2 v3.16b, v23.8h st1 {v0.16b-v3.16b}, x2, x5 - cbnz w12, .loop_addavg_64x\h + cbnz w12, .Loop_addavg_64x\h ret endfunc .endm
View file
x265_4.0.tar.gz/source/common/aarch64/mem-neon.h
Added
@@ -0,0 +1,268 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_COMMON_AARCH64_MEM_NEON_H +#define X265_COMMON_AARCH64_MEM_NEON_H + +#include <arm_neon.h> +#include <cassert> +#include <stdint.h> + +// Load 4 bytes into the low half of a uint8x8_t, zero the upper half. +static uint8x8_t inline load_u8x4x1(const uint8_t *s) +{ + uint8x8_t ret = vdup_n_u8(0); + + ret = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t*)s, + vreinterpret_u32_u8(ret), 0)); + return ret; +} + +static uint8x8_t inline load_u8x4x2(const uint8_t *s, intptr_t stride) +{ + uint8x8_t ret = vdup_n_u8(0); + + ret = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t*)s, + vreinterpret_u32_u8(ret), 0)); + s += stride; + ret = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t*)s, + vreinterpret_u32_u8(ret), 1)); + + return ret; +} + +// Store 4 bytes from the low half of a uint8x8_t. +static void inline store_u8x4x1(uint8_t *d, const uint8x8_t s) +{ + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s), 0); +} + +// Store N blocks of 32-bits from (N / 2) D-Registers. 
+template<int N> +static void inline store_u8x4_strided_xN(uint8_t *d, intptr_t stride, + const uint8x8_t *s) +{ + assert(N % 2 == 0); + for (int i = 0; i < N / 2; ++i) + { + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(si), 0); + d += stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(si), 1); + d += stride; + } +} + +template<int N> +static void inline load_u8x8xn(const uint8_t *src, const intptr_t stride, + uint8x8_t *dst) +{ + for (int i = 0; i < N; ++i) + { + dsti = vld1_u8(src); + src += stride; + } +} + +template<int N> +static void inline load_u8x16xn(const uint8_t *src, const intptr_t stride, + uint8x16_t *dst) +{ + for (int i = 0; i < N; ++i) + { + dsti = vld1q_u8(src); + src += stride; + } +} + +template<int N> +static void inline store_u8x2xn(uint8_t *dst, intptr_t dst_stride, + const uint8x8_t *src) +{ + for (int i = 0; i < N; ++i) + { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(srci), 0); + dst += dst_stride; + } +} + +template<int N> +static void inline store_u8x4xn(uint8_t *dst, intptr_t dst_stride, + const uint8x8_t *src) +{ + for (int i = 0; i < N; ++i) + { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(srci), 0); + dst += dst_stride; + } +} + +template<int N> +static void inline store_u8x6xn(uint8_t *dst, intptr_t dst_stride, + const uint8x8_t *src) +{ + for (int i = 0; i < N; ++i) + { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(srci), 0); + vst1_lane_u16((uint16_t *)(dst + 4), vreinterpret_u16_u8(srci), 2); + dst += dst_stride; + } +} + +template<int N> +static void inline store_u8x8xn(uint8_t *dst, intptr_t dst_stride, + const uint8x8_t *src) +{ + for (int i = 0; i < N; ++i) + { + vst1_u8(dst, srci); + dst += dst_stride; + } +} + +template<int N, int M> +static void inline store_u8xnxm(uint8_t *dst, intptr_t dst_stride, + const uint8x8_t *src) +{ + switch (N) + { + case 2: return store_u8x2xn<M>(dst, dst_stride, src); + case 4: return store_u8x4xn<M>(dst, dst_stride, src); + case 6: return store_u8x6xn<M>(dst, dst_stride, src); + case 8: return store_u8x8xn<M>(dst, dst_stride, src); + } +} + +template<int N> +static void inline store_u8x16xn(uint8_t *dst, intptr_t dst_stride, + const uint8x16_t *src) +{ + for (int i = 0; i < N; ++i) + { + vst1q_u8(dst, srci); + dst += dst_stride; + } +} + +template<int N> +static void inline load_s16x4xn(const int16_t *src, const intptr_t stride, + int16x4_t *dst) +{ + for (int i = 0; i < N; ++i) + { + dsti = vld1_s16(src); + src += stride; + } +} + +template<int N> +static void inline load_s16x8xn(const int16_t *src, const intptr_t stride, + int16x8_t *dst) +{ + for (int i = 0; i < N; ++i) + { + dsti = vld1q_s16(src); + src += stride; + } +} + +template<int N> +static void inline store_s16x2xn(int16_t *dst, intptr_t dst_stride, + const int16x4_t *src) +{ + for (int i = 0; i < N; ++i) + { + vst1_lane_s32((int32_t*)dst, vreinterpret_s32_s16(srci), 0); + dst += dst_stride; + } +} + +template<int N>
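The new mem-neon.h gathers the strided load/store idioms that the Neon kernels previously open-coded. A minimal usage sketch, assuming a plain 8x4 byte-block copy (copy_8x4 is illustrative and not part of the patch):

    #include "mem-neon.h"

    static void copy_8x4(uint8_t* dst, intptr_t dstStride,
                         const uint8_t* src, intptr_t srcStride)
    {
        uint8x8_t rows[4];
        load_u8x8xn<4>(src, srcStride, rows);    // four strided 8-byte row loads
        store_u8x8xn<4>(dst, dstStride, rows);   // four strided 8-byte row stores
    }

The template parameter is the row count, so the small loops unroll fully; the narrower helpers (store_u8x2xn, store_u8x4xn, store_u8x6xn) cover the chroma and odd-width cases.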
View file
x265_4.0.tar.gz/source/common/aarch64/neon-sve-bridge.h
Added
@@ -0,0 +1,67 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * Jonathan Wright <jonathan.wright@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_COMMON_AARCH64_NEON_SVE_BRIDGE_H +#define X265_COMMON_AARCH64_NEON_SVE_BRIDGE_H + +#include <arm_neon.h> + +#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE +#include <arm_sve.h> +#include <arm_neon_sve_bridge.h> + +/* We can access instructions that are exclusive to the SVE or SVE2 instruction + * sets from a predominantly Neon context by making use of the Neon-SVE bridge + * intrinsics to reinterpret Neon vectors as SVE vectors - with the high part of + * the SVE vector (if it's longer than 128 bits) being "don't care". + * + * While sub-optimal on machines that have SVE vector length > 128-bit - as the + * remainder of the vector is unused - this approach is still beneficial when + * compared to a Neon-only implementation. */ + +static inline int32x4_t x265_vld1sh_s32(const int16_t *ptr) +{ + return svget_neonq_s32(svld1sh_s32(svptrue_pat_b32(SV_VL4), ptr)); +} + +static inline int64x2_t x265_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) +{ + return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc), + svset_neonq_s16(svundef_s16(), x), + svset_neonq_s16(svundef_s16(), y))); +} + +static inline int8x16_t x265_sve_mask(const int x, const int endX, + const int8x16_t in) +{ + // Use predicate to shift "unused lanes" outside of range -2, 2 + svbool_t svpred = svwhilelt_b8(x, endX); + svint8_t edge_type = svsel_s8(svpred, svset_neonq_s8(svundef_s8(), in), + svdup_n_s8(-3)); + return svget_neonq_s8(edge_type); +} + +#endif // defined(HAVE_SVE) && HAVE_SVE_BRIDGE + +#endif // X265_COMMON_AARCH64_NEON_SVE_BRIDGE_H
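As the header's own comment explains, these wrappers reinterpret Neon vectors as SVE vectors so that mostly-Neon kernels can reach SVE-only instructions such as SDOT with 64-bit accumulators. A minimal usage sketch, assuming a plain dot product of two int16 buffers whose length is a multiple of 8 (dot_s16 is illustrative, not a kernel from the patch):

    #include "neon-sve-bridge.h"

    static int64_t dot_s16(const int16_t* a, const int16_t* b, int n)
    {
        int64x2_t acc = vdupq_n_s64(0);
        for (int i = 0; i < n; i += 8)
            acc = x265_sdotq_s16(acc, vld1q_s16(a + i), vld1q_s16(b + i));
        return vaddvq_s64(acc);   // horizontal add of the two 64-bit lanes
    }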
View file
x265_3.6.tar.gz/source/common/aarch64/p2s-sve.S -> x265_4.0.tar.gz/source/common/aarch64/p2s-sve.S
Changed
@@ -204,7 +204,7 @@ #else p2s_start mov x9, #\h -.loop_filter_sve_P2S_32x\h: +.Loop_filter_sve_P2S_32x\h: sub x9, x9, #1 ld1 {v0.16b-v1.16b}, x0, x1 ushll v22.8h, v0.8b, #P2S_SHIFT @@ -216,7 +216,7 @@ add v24.8h, v24.8h, v31.8h add v25.8h, v25.8h, v31.8h st1 {v22.16b-v25.16b}, x2, x3 - cbnz x9, .loop_filter_sve_P2S_32x\h + cbnz x9, .Loop_filter_sve_P2S_32x\h ret #endif endfunc @@ -331,7 +331,7 @@ p2s_start sub x3, x3, #64 mov x9, #\h -.loop_filter_sve_P2S_64x\h: +.Loop_filter_sve_P2S_64x\h: sub x9, x9, #1 ld1 {v0.16b-v3.16b}, x0, x1 ushll v16.8h, v0.8b, #P2S_SHIFT @@ -352,7 +352,7 @@ add v23.8h, v23.8h, v31.8h st1 {v16.16b-v19.16b}, x2, #64 st1 {v20.16b-v23.16b}, x2, x3 - cbnz x9, .loop_filter_sve_P2S_64x\h + cbnz x9, .Loop_filter_sve_P2S_64x\h ret #endif endfunc @@ -422,7 +422,7 @@ p2s_start sub x3, x3, #64 mov x9, #64 -.loop_filterP2S_sve_48x64: +.Loop_filterP2S_sve_48x64: sub x9, x9, #1 ld1 {v0.16b-v2.16b}, x0, x1 ushll v16.8h, v0.8b, #P2S_SHIFT @@ -439,7 +439,7 @@ add v21.8h, v21.8h, v31.8h st1 {v16.16b-v19.16b}, x2, #64 st1 {v20.16b-v21.16b}, x2, x3 - cbnz x9, .loop_filterP2S_sve_48x64 + cbnz x9, .Loop_filterP2S_sve_48x64 ret #endif endfunc
View file
x265_3.6.tar.gz/source/common/aarch64/p2s.S -> x265_4.0.tar.gz/source/common/aarch64/p2s.S
Changed
@@ -262,7 +262,7 @@ function PFX(filterPixelToShort_32x\h\()_neon) p2s_start mov x9, #\h -.loop_filterP2S_32x\h: +.Loop_filterP2S_32x\h: sub x9, x9, #1 #if HIGH_BIT_DEPTH ld1 {v0.16b-v3.16b}, x0, x1 @@ -282,7 +282,7 @@ add v24.8h, v24.8h, v31.8h add v25.8h, v25.8h, v31.8h st1 {v22.16b-v25.16b}, x2, x3 - cbnz x9, .loop_filterP2S_32x\h + cbnz x9, .Loop_filterP2S_32x\h ret endfunc .endm @@ -302,7 +302,7 @@ #endif sub x3, x3, #64 mov x9, #\h -.loop_filterP2S_64x\h: +.Loop_filterP2S_64x\h: sub x9, x9, #1 #if HIGH_BIT_DEPTH ld1 {v0.16b-v3.16b}, x0, #64 @@ -336,7 +336,7 @@ add v23.8h, v23.8h, v31.8h st1 {v16.16b-v19.16b}, x2, #64 st1 {v20.16b-v23.16b}, x2, x3 - cbnz x9, .loop_filterP2S_64x\h + cbnz x9, .Loop_filterP2S_64x\h ret endfunc .endm @@ -353,7 +353,7 @@ #endif sub x3, x3, #64 mov x9, #64 -.loop_filterP2S_48x64: +.Loop_filterP2S_48x64: sub x9, x9, #1 #if HIGH_BIT_DEPTH ld1 {v0.16b-v3.16b}, x0, #64 @@ -381,6 +381,6 @@ add v21.8h, v21.8h, v31.8h st1 {v16.16b-v19.16b}, x2, #64 st1 {v20.16b-v21.16b}, x2, x3 - cbnz x9, .loop_filterP2S_48x64 + cbnz x9, .Loop_filterP2S_48x64 ret endfunc
View file
x265_3.6.tar.gz/source/common/aarch64/pixel-prim.cpp -> x265_4.0.tar.gz/source/common/aarch64/pixel-prim.cpp
Changed
@@ -7,6 +7,8 @@ #include "arm64-utils.h" #if HAVE_NEON +#include "mem-neon.h" + #include <arm_neon.h> using namespace X265_NS; @@ -24,26 +26,32 @@ sub = vsubq_s16(a, b); } -static inline void transpose_8h(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2) +static inline void transpose_8h_8h(int16x8_t &t1, int16x8_t &t2, + const int16x8_t s1, const int16x8_t s2) { t1 = vtrn1q_s16(s1, s2); t2 = vtrn2q_s16(s1, s2); } -static inline void transpose_4s(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2) +static inline void transpose_4s_8h(int16x8_t &t1, int16x8_t &t2, + const int16x8_t s1, const int16x8_t s2) { - t1 = vtrn1q_s32(s1, s2); - t2 = vtrn2q_s32(s1, s2); + int32x4_t tmp1 = vreinterpretq_s32_s16(s1); + int32x4_t tmp2 = vreinterpretq_s32_s16(s2); + + t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2)); + t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2)); } -#if (X265_DEPTH <= 10) -static inline void transpose_2d(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2) +static inline void transpose_2d_8h(int16x8_t &t1, int16x8_t &t2, + const int16x8_t s1, const int16x8_t s2) { - t1 = vtrn1q_s64(s1, s2); - t2 = vtrn2q_s64(s1, s2); -} -#endif + int64x2_t tmp1 = vreinterpretq_s64_s16(s1); + int64x2_t tmp2 = vreinterpretq_s64_s16(s2); + t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2)); + t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2)); +} static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2, int16x8_t a, int16x8_t b, int16x8_t c, int16x8_t d) @@ -73,29 +81,25 @@ SUMSUB_AB(v4 , v6 , v16, v18); SUMSUB_AB(v5 , v7 , v17, v19); - v0 = vtrn1q_s16(v4, v5); - v1 = vtrn2q_s16(v4, v5); - v2 = vtrn1q_s16(v6, v7); - v3 = vtrn2q_s16(v6, v7); + transpose_8h_8h(v0, v1, v4, v5); + transpose_8h_8h(v2, v3, v6, v7); SUMSUB_AB(v16, v17, v0, v1); SUMSUB_AB(v18, v19, v2, v3); - v0 = vtrn1q_s32(v16, v18); - v1 = vtrn2q_s32(v16, v18); - v2 = vtrn1q_s32(v17, v19); - v3 = vtrn2q_s32(v17, v19); + transpose_4s_8h(v0, v1, v16, v18); + transpose_4s_8h(v2, v3, v17, v19); - v0 = vabsq_s16(v0); - v1 = vabsq_s16(v1); - v2 = vabsq_s16(v2); - v3 = vabsq_s16(v3); + uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0)); + uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1)); + uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2)); + uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3)); - v0 = vmaxq_u16(v0, v1); - v1 = vmaxq_u16(v2, v3); + uint16x8_t max0 = vmaxq_u16(abs0, abs1); + uint16x8_t max1 = vmaxq_u16(abs2, abs3); - v0 = vaddq_u16(v0, v1); - return vaddlvq_u16(v0); + uint16x8_t sum = vaddq_u16(max0, max1); + return vaddlvq_u16(sum); } static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1) @@ -103,22 +107,19 @@ int16x8_t v2, v3; SUMSUB_AB(v2, v3, v0, v1); - v0 = vzip1q_s64(v2, v3); - v1 = vzip2q_s64(v2, v3); + transpose_2d_8h(v0, v1, v2, v3); SUMSUB_AB(v2, v3, v0, v1); - v0 = vtrn1q_s16(v2, v3); - v1 = vtrn2q_s16(v2, v3); + transpose_8h_8h(v0, v1, v2, v3); SUMSUB_AB(v2, v3, v0, v1); - v0 = vtrn1q_s32(v2, v3); - v1 = vtrn2q_s32(v2, v3); + transpose_4s_8h(v0, v1, v2, v3); - v0 = vabsq_s16(v0); - v1 = vabsq_s16(v1); - v0 = vmaxq_u16(v0, v1); + uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0)); + uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1)); + uint16x8_t max = vmaxq_u16(abs0, abs1); - return vaddlvq_s16(v0); + return vaddlvq_u16(max); } static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20, @@ -131,44 +132,47 @@ HADAMARD4_V(v20, v21, v22, v23, v0, v1, v2, 
v3); - transpose_8h(v0, v1, v16, v17); - transpose_8h(v2, v3, v18, v19); - transpose_8h(v4, v5, v20, v21); - transpose_8h(v6, v7, v22, v23); + transpose_8h_8h(v0, v1, v16, v17); + transpose_8h_8h(v2, v3, v18, v19); + transpose_8h_8h(v4, v5, v20, v21); + transpose_8h_8h(v6, v7, v22, v23); SUMSUB_AB(v16, v17, v0, v1); SUMSUB_AB(v18, v19, v2, v3); SUMSUB_AB(v20, v21, v4, v5); SUMSUB_AB(v22, v23, v6, v7); - transpose_4s(v0, v2, v16, v18); - transpose_4s(v1, v3, v17, v19); - transpose_4s(v4, v6, v20, v22); - transpose_4s(v5, v7, v21, v23); - - v0 = vabsq_s16(v0); - v1 = vabsq_s16(v1); - v2 = vabsq_s16(v2); - v3 = vabsq_s16(v3); - v4 = vabsq_s16(v4); - v5 = vabsq_s16(v5); - v6 = vabsq_s16(v6); - v7 = vabsq_s16(v7); - - v0 = vmaxq_u16(v0, v2); - v1 = vmaxq_u16(v1, v3); - v2 = vmaxq_u16(v4, v6); - v3 = vmaxq_u16(v5, v7); - + transpose_4s_8h(v0, v2, v16, v18); + transpose_4s_8h(v1, v3, v17, v19); + transpose_4s_8h(v4, v6, v20, v22); + transpose_4s_8h(v5, v7, v21, v23); + + uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0)); + uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1)); + uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2)); + uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3)); + uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v4)); + uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v5)); + uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v6)); + uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v7)); + + v0 = vreinterpretq_s16_u16(vmaxq_u16(abs0, abs2)); + v1 = vreinterpretq_s16_u16(vmaxq_u16(abs1, abs3)); + v2 = vreinterpretq_s16_u16(vmaxq_u16(abs4, abs6)); + v3 = vreinterpretq_s16_u16(vmaxq_u16(abs5, abs7)); } #if HIGH_BIT_DEPTH #if (X265_DEPTH > 10) -static inline void transpose_2d(int32x4_t &t1, int32x4_t &t2, const int32x4_t s1, const int32x4_t s2) +static inline void transpose_2d_4s(int32x4_t &t1, int32x4_t &t2, + const int32x4_t s1, const int32x4_t s2) { - t1 = vtrn1q_s64(s1, s2); - t2 = vtrn2q_s64(s1, s2); + int64x2_t tmp1 = vreinterpretq_s64_s32(s1); + int64x2_t tmp2 = vreinterpretq_s64_s32(s2); + + t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2)); + t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2)); } static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b) @@ -197,35 +201,35 @@ int16x8_t v16, v17;
View file
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve.S -> x265_4.0.tar.gz/source/common/aarch64/pixel-util-sve.S
Changed
@@ -190,27 +190,27 @@ ld1b {z7.h}, p0/z, x2, x11 add x0, x0, x1 add x2, x2, x3 - ld1b {z29.h}, p0/z, x0 - ld1b {z9.h}, p0/z, x0, x11 - ld1b {z10.h}, p0/z, x2 - ld1b {z11.h}, p0/z, x2, x11 - add x0, x0, x1 - add x2, x2, x3 - ld1b {z12.h}, p0/z, x0 - ld1b {z13.h}, p0/z, x0, x11 - ld1b {z14.h}, p0/z, x2 - ld1b {z15.h}, p0/z, x2, x11 - add x0, x0, x1 - add x2, x2, x3 - sub \v0\().h, z0.h, z2.h sub \v4\().h, z1.h, z3.h sub \v1\().h, z4.h, z6.h sub \v5\().h, z5.h, z7.h - sub \v2\().h, z29.h, z10.h - sub \v6\().h, z9.h, z11.h - sub \v3\().h, z12.h, z14.h - sub \v7\().h, z13.h, z15.h + + ld1b {z0.h}, p0/z, x0 + ld1b {z1.h}, p0/z, x0, x11 + ld1b {z2.h}, p0/z, x2 + ld1b {z3.h}, p0/z, x2, x11 + add x0, x0, x1 + add x2, x2, x3 + ld1b {z4.h}, p0/z, x0 + ld1b {z5.h}, p0/z, x0, x11 + ld1b {z6.h}, p0/z, x2 + ld1b {z7.h}, p0/z, x2, x11 + add x0, x0, x1 + add x2, x2, x3 + sub \v2\().h, z0.h, z2.h + sub \v6\().h, z1.h, z3.h + sub \v3\().h, z4.h, z6.h + sub \v7\().h, z5.h, z7.h .endm // one vertical hadamard pass and two horizontal @@ -314,60 +314,3 @@ mov x0, x7 ret x10 endfunc - -/********* ssim ***********/ -// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff) -// No need to fully use sve instructions for this function -function PFX(quant_sve) - mov w9, #1 - lsl w9, w9, w4 - mov z0.s, w9 - neg w9, w4 - mov z1.s, w9 - add w9, w9, #8 - mov z2.s, w9 - mov z3.s, w5 - - lsr w6, w6, #2 - eor z4.d, z4.d, z4.d - eor w10, w10, w10 - eor z17.d, z17.d, z17.d - -.loop_quant_sve: - ld1 {v18.4h}, x0, #8 - ld1 {v7.4s}, x1, #16 - sxtl v6.4s, v18.4h - - cmlt v5.4s, v6.4s, #0 - - abs v6.4s, v6.4s - - - mul v6.4s, v6.4s, v7.4s - - add v7.4s, v6.4s, v3.4s - sshl v7.4s, v7.4s, v1.4s - - mls v6.4s, v7.4s, v0.s0 - sshl v16.4s, v6.4s, v2.4s - st1 {v16.4s}, x2, #16 - - // numsig - cmeq v16.4s, v7.4s, v17.4s - add v4.4s, v4.4s, v16.4s - add w10, w10, #4 - - // level *= sign - eor z16.d, z7.d, z5.d - sub v16.4s, v16.4s, v5.4s - sqxtn v5.4h, v16.4s - st1 {v5.4h}, x3, #8 - - subs w6, w6, #1 - b.ne .loop_quant_sve - - addv s4, v4.4s - mov w9, v4.s0 - add w0, w10, w9 - ret -endfunc
View file
x265_3.6.tar.gz/source/common/aarch64/pixel-util-sve2.S -> x265_4.0.tar.gz/source/common/aarch64/pixel-util-sve2.S
Changed
@@ -64,11 +64,11 @@ bgt .vl_gt_16_pixel_var_16x16 pixel_var_start mov w12, #16 -.loop_var_16_sve2: +.Loop_var_16_sve2: sub w12, w12, #1 ld1 {v4.16b}, x0, x1 pixel_var_1 v4 - cbnz w12, .loop_var_16_sve2 + cbnz w12, .Loop_var_16_sve2 pixel_var_end ret .vl_gt_16_pixel_var_16x16: @@ -95,12 +95,12 @@ bgt .vl_gt_16_pixel_var_32x32 pixel_var_start mov w12, #32 -.loop_var_32_sve2: +.Loop_var_32_sve2: sub w12, w12, #1 ld1 {v4.16b-v5.16b}, x0, x1 pixel_var_1 v4 pixel_var_1 v5 - cbnz w12, .loop_var_32_sve2 + cbnz w12, .Loop_var_32_sve2 pixel_var_end ret .vl_gt_16_pixel_var_32x32: @@ -150,14 +150,14 @@ bgt .vl_gt_16_pixel_var_64x64 pixel_var_start mov w12, #64 -.loop_var_64_sve2: +.Loop_var_64_sve2: sub w12, w12, #1 ld1 {v4.16b-v7.16b}, x0, x1 pixel_var_1 v4 pixel_var_1 v5 pixel_var_1 v6 pixel_var_1 v7 - cbnz w12, .loop_var_64_sve2 + cbnz w12, .Loop_var_64_sve2 pixel_var_end ret .vl_gt_16_pixel_var_64x64: @@ -268,7 +268,7 @@ bgt .vl_gt_16_getResidual32 lsl x4, x3, #1 mov w12, #4 -.loop_residual_32: +.Loop_residual_32: sub w12, w12, #1 .rept 4 ld1 {v0.16b-v1.16b}, x0, x3 @@ -286,7 +286,7 @@ st1 {v16.8h-v19.8h}, x2, x4 st1 {v20.8h-v23.8h}, x2, x4 .endr - cbnz w12, .loop_residual_32 + cbnz w12, .Loop_residual_32 ret .vl_gt_16_getResidual32: cmp x9, #48 @@ -323,7 +323,7 @@ bgt .vl_gt_16_pixel_sub_ps_32x32 lsl x1, x1, #1 mov w12, #4 -.loop_sub_ps_32_sve2: +.Loop_sub_ps_32_sve2: sub w12, w12, #1 .rept 4 ld1 {v0.16b-v1.16b}, x2, x4 @@ -341,7 +341,7 @@ st1 {v16.8h-v19.8h}, x0, x1 st1 {v20.8h-v23.8h}, x0, x1 .endr - cbnz w12, .loop_sub_ps_32_sve2 + cbnz w12, .Loop_sub_ps_32_sve2 ret .vl_gt_16_pixel_sub_ps_32x32: cmp x9, #48 @@ -387,7 +387,7 @@ lsl x1, x1, #1 sub x1, x1, #64 mov w12, #16 -.loop_sub_ps_64_sve2: +.Loop_sub_ps_64_sve2: sub w12, w12, #1 .rept 4 ld1 {v0.16b-v3.16b}, x2, x4 @@ -403,7 +403,7 @@ st1 {v16.8h-v19.8h}, x0, #64 st1 {v20.8h-v23.8h}, x0, x1 .endr - cbnz w12, .loop_sub_ps_64_sve2 + cbnz w12, .Loop_sub_ps_64_sve2 ret .vl_gt_16_pixel_sub_ps_64x64: rdvl x9, #1 @@ -473,7 +473,7 @@ bgt .vl_gt_16_pixel_sub_ps_32x64 lsl x1, x1, #1 mov w12, #8 -.loop_sub_ps_32x64_sve2: +.Loop_sub_ps_32x64_sve2: sub w12, w12, #1 .rept 4 ld1 {v0.16b-v1.16b}, x2, x4 @@ -491,7 +491,7 @@ st1 {v16.8h-v19.8h}, x0, x1 st1 {v20.8h-v23.8h}, x0, x1 .endr - cbnz w12, .loop_sub_ps_32x64_sve2 + cbnz w12, .Loop_sub_ps_32x64_sve2 ret .vl_gt_16_pixel_sub_ps_32x64: cmp x9, #48 @@ -609,7 +609,7 @@ bgt .vl_gt_16_pixel_add_ps_32x\h lsl x5, x5, #1 mov w12, #\h / 4 -.loop_add_ps__sve2_32x\h\(): +.Loop_add_ps__sve2_32x\h\(): sub w12, w12, #1 .rept 4 ld1 {v0.16b-v1.16b}, x2, x4 @@ -628,7 +628,7 @@ sqxtun2 v5.16b, v27.8h st1 {v4.16b-v5.16b}, x0, x1 .endr - cbnz w12, .loop_add_ps__sve2_32x\h + cbnz w12, .Loop_add_ps__sve2_32x\h ret .vl_gt_16_pixel_add_ps_32x\h\(): cmp x9, #48 @@ -1157,7 +1157,7 @@ bgt .vl_gt_16_ssimDist16 ssimDist_start ptrue p0.s, vl4 -.loop_ssimDist16_sve2: +.Loop_ssimDist16_sve2: sub w12, w12, #1 ld1b {z4.s}, p0/z, x0 ld1b {z5.s}, p0/z, x0, #1, mul vl @@ -1171,7 +1171,7 @@ add x2, x2, x3 ssimDist_1_sve2 z4, z5, z8, z9 ssimDist_1_sve2 z6, z7, z10, z11 - cbnz w12, .loop_ssimDist16_sve2 + cbnz w12, .Loop_ssimDist16_sve2 ssimDist_end ret .vl_gt_16_ssimDist16: @@ -1217,7 +1217,7 @@ bgt .vl_gt_16_ssimDist32 ssimDist_start ptrue p0.s, vl4 -.loop_ssimDist32_sve2: +.Loop_ssimDist32_sve2: sub w12, w12, #1 ld1b {z2.s}, p0/z, x0 ld1b {z3.s}, p0/z, x0, #1, mul vl @@ -1241,7 +1241,7 @@ ssimDist_1_sve2 z4, z5, z12, z13 ssimDist_1_sve2 z6, z7, z14, z15 ssimDist_1_sve2 z8, z9, z30, z31 - cbnz w12, .loop_ssimDist32_sve2 + cbnz w12, 
.Loop_ssimDist32_sve2 ssimDist_end ret .vl_gt_16_ssimDist32: @@ -1309,7 +1309,7 @@ bgt .vl_gt_16_ssimDist64 ssimDist_start ptrue p0.s, vl4 -.loop_ssimDist64_sve2: +.Loop_ssimDist64_sve2: sub w12, w12, #1 ld1b {z2.s}, p0/z, x0 ld1b {z3.s}, p0/z, x0, #1, mul vl @@ -1357,7 +1357,7 @@ ssimDist_1_sve2 z8, z9, z29, z30 add x0, x0, x1 add x2, x2, x3 - cbnz w12, .loop_ssimDist64_sve2 + cbnz w12, .Loop_ssimDist64_sve2 ssimDist_end ret .vl_gt_16_ssimDist64: @@ -1482,7 +1482,7 @@ bgt .vl_gt_16_normFact16 normFact_start ptrue p0.s, vl4 -.loop_normFact16_sve2: +.Loop_normFact16_sve2: sub w12, w12, #1 ld1b {z4.s}, p0/z, x0 ld1b {z5.s}, p0/z, x0, #1, mul vl @@ -1491,7 +1491,7 @@
View file
x265_3.6.tar.gz/source/common/aarch64/pixel-util.S -> x265_4.0.tar.gz/source/common/aarch64/pixel-util.S
Changed
@@ -60,11 +60,11 @@ function PFX(pixel_var_16x16_neon) pixel_var_start mov w12, #16 -.loop_var_16: +.Loop_var_16: sub w12, w12, #1 ld1 {v4.16b}, x0, x1 pixel_var_1 v4 - cbnz w12, .loop_var_16 + cbnz w12, .Loop_var_16 pixel_var_end ret endfunc @@ -72,12 +72,12 @@ function PFX(pixel_var_32x32_neon) pixel_var_start mov w12, #32 -.loop_var_32: +.Loop_var_32: sub w12, w12, #1 ld1 {v4.16b-v5.16b}, x0, x1 pixel_var_1 v4 pixel_var_1 v5 - cbnz w12, .loop_var_32 + cbnz w12, .Loop_var_32 pixel_var_end ret endfunc @@ -85,14 +85,14 @@ function PFX(pixel_var_64x64_neon) pixel_var_start mov w12, #64 -.loop_var_64: +.Loop_var_64: sub w12, w12, #1 ld1 {v4.16b-v7.16b}, x0, x1 pixel_var_1 v4 pixel_var_1 v5 pixel_var_1 v6 pixel_var_1 v7 - cbnz w12, .loop_var_64 + cbnz w12, .Loop_var_64 pixel_var_end ret endfunc @@ -148,7 +148,7 @@ function PFX(getResidual32_neon) lsl x4, x3, #1 mov w12, #4 -.loop_residual_32: +.Loop_residual_32: sub w12, w12, #1 .rept 4 ld1 {v0.16b-v1.16b}, x0, x3 @@ -166,7 +166,7 @@ st1 {v16.8h-v19.8h}, x2, x4 st1 {v20.8h-v23.8h}, x2, x4 .endr - cbnz w12, .loop_residual_32 + cbnz w12, .Loop_residual_32 ret endfunc @@ -221,7 +221,7 @@ function PFX(pixel_sub_ps_32x32_neon) lsl x1, x1, #1 mov w12, #4 -.loop_sub_ps_32: +.Loop_sub_ps_32: sub w12, w12, #1 .rept 4 ld1 {v0.16b-v1.16b}, x2, x4 @@ -239,7 +239,7 @@ st1 {v16.8h-v19.8h}, x0, x1 st1 {v20.8h-v23.8h}, x0, x1 .endr - cbnz w12, .loop_sub_ps_32 + cbnz w12, .Loop_sub_ps_32 ret endfunc @@ -247,7 +247,7 @@ lsl x1, x1, #1 sub x1, x1, #64 mov w12, #16 -.loop_sub_ps_64: +.Loop_sub_ps_64: sub w12, w12, #1 .rept 4 ld1 {v0.16b-v3.16b}, x2, x4 @@ -263,7 +263,7 @@ st1 {v16.8h-v19.8h}, x0, #64 st1 {v20.8h-v23.8h}, x0, x1 .endr - cbnz w12, .loop_sub_ps_64 + cbnz w12, .Loop_sub_ps_64 ret endfunc @@ -318,7 +318,7 @@ function PFX(pixel_sub_ps_32x64_neon) lsl x1, x1, #1 mov w12, #8 -.loop_sub_ps_32x64: +.Loop_sub_ps_32x64: sub w12, w12, #1 .rept 4 ld1 {v0.16b-v1.16b}, x2, x4 @@ -336,7 +336,7 @@ st1 {v16.8h-v19.8h}, x0, x1 st1 {v20.8h-v23.8h}, x0, x1 .endr - cbnz w12, .loop_sub_ps_32x64 + cbnz w12, .Loop_sub_ps_32x64 ret endfunc @@ -383,7 +383,7 @@ function PFX(pixel_add_ps_16x\h\()_neon) lsl x5, x5, #1 mov w12, #\h / 8 -.loop_add_ps_16x\h\(): +.Loop_add_ps_16x\h\(): sub w12, w12, #1 .rept 4 ld1 {v0.16b}, x2, x4 @@ -405,7 +405,7 @@ st1 {v4.16b}, x0, x1 st1 {v5.16b}, x0, x1 .endr - cbnz w12, .loop_add_ps_16x\h + cbnz w12, .Loop_add_ps_16x\h ret endfunc .endm @@ -417,7 +417,7 @@ function PFX(pixel_add_ps_32x\h\()_neon) lsl x5, x5, #1 mov w12, #\h / 4 -.loop_add_ps_32x\h\(): +.Loop_add_ps_32x\h\(): sub w12, w12, #1 .rept 4 ld1 {v0.16b-v1.16b}, x2, x4 @@ -436,7 +436,7 @@ sqxtun2 v5.16b, v27.8h st1 {v4.16b-v5.16b}, x0, x1 .endr - cbnz w12, .loop_add_ps_32x\h + cbnz w12, .Loop_add_ps_32x\h ret endfunc .endm @@ -448,7 +448,7 @@ lsl x5, x5, #1 sub x5, x5, #64 mov w12, #32 -.loop_add_ps_64x64: +.Loop_add_ps_64x64: sub w12, w12, #1 .rept 2 ld1 {v0.16b-v3.16b}, x2, x4 @@ -480,7 +480,7 @@ sqxtun2 v3.16b, v7.8h st1 {v0.16b-v3.16b}, x0, x1 .endr - cbnz w12, .loop_add_ps_64x64 + cbnz w12, .Loop_add_ps_64x64 ret endfunc @@ -548,7 +548,7 @@ // void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride) function PFX(scale2D_64to32_neon) mov w12, #32 -.loop_scale2D: +.Loop_scale2D: ld1 {v0.16b-v3.16b}, x1, x2 sub w12, w12, #1 ld1 {v4.16b-v7.16b}, x1, x2 @@ -561,7 +561,7 @@ uqrshrn v1.8b, v2.8h, #2 uqrshrn2 v1.16b, v3.8h, #2 st1 {v0.16b-v1.16b}, x0, #32 - cbnz w12, .loop_scale2D + cbnz w12, .Loop_scale2D ret endfunc @@ -569,33 +569,33 @@ function 
PFX(pixel_planecopy_cp_neon) dup v2.16b, w6 sub x5, x5, #1 -.loop_h: +.Loop_h: mov x6, x0 mov x12, x2 mov x7, #0 -.loop_w:
View file
x265_3.6.tar.gz/source/common/aarch64/sad-a.S -> x265_4.0.tar.gz/source/common/aarch64/sad-a.S
Changed
@@ -1,8 +1,9 @@ /***************************************************************************** - * Copyright (C) 2020-2021 MulticoreWare, Inc + * Copyright (C) 2020-2024 MulticoreWare, Inc * * Authors: Hongbin Liu <liuhongbin1@huawei.com> * Sebastian Pop <spop@amazon.com> + Hari Limaye <hari.limaye@arm.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -23,7 +24,6 @@ *****************************************************************************/ #include "asm.S" -#include "sad-a-common.S" #ifdef __APPLE__ .section __RODATA,__rodata @@ -35,12 +35,234 @@ .text +.macro SAD_START_4 f + ldr s0, x0 + ldr s1, x2 + add x0, x0, x1 + add x2, x2, x3 + ld1 {v0.s}1, x0, x1 + ld1 {v1.s}1, x2, x3 + \f v16.8h, v0.8b, v1.8b +.endm + +.macro SAD_4 h +.rept \h / 2 - 1 + SAD_START_4 uabal +.endr +.endm + +.macro SAD_START_8 f + ld1 {v0.8b}, x0, x1 + ld1 {v1.8b}, x2, x3 + \f v16.8h, v0.8b, v1.8b +.endm + +.macro SAD_8 h +.rept \h - 3 + SAD_START_8 uabal +.endr + ldr d0, x0 + ldr d1, x2 + uabal v16.8h, v0.8b, v1.8b + ldr d0, x0, x1 + ldr d1, x2, x3 + uabal v16.8h, v0.8b, v1.8b +.endm + +.macro SAD_START_16 + movi v16.16b, #0 + movi v17.16b, #0 +.endm + +.macro SAD_16 + ld1 {v0.16b}, x0, x1 + ld1 {v1.16b}, x2, x3 + ld1 {v2.16b}, x0, x1 + ld1 {v3.16b}, x2, x3 + uabd v20.16b, v0.16b, v1.16b + uadalp v16.8h, v20.16b + uabd v21.16b, v2.16b, v3.16b + uadalp v17.8h, v21.16b +.endm + +.macro SAD_END_16 + add v16.8h, v16.8h, v17.8h + uaddlv s0, v16.8h + fmov x0, d0 + ret +.endm + +.macro SAD_START_32 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 +.endm + +.macro SAD_32 + ld1 {v0.16b-v1.16b}, x0, x1 + ld1 {v2.16b-v3.16b}, x2, x3 + ld1 {v4.16b-v5.16b}, x0, x1 + ld1 {v6.16b-v7.16b}, x2, x3 + uabd v20.16b, v0.16b, v2.16b + uadalp v16.8h, v20.16b + uabd v21.16b, v1.16b, v3.16b + uadalp v17.8h, v21.16b + uabd v22.16b, v4.16b, v6.16b + uadalp v18.8h, v22.16b + uabd v23.16b, v5.16b, v7.16b + uadalp v19.8h, v23.16b +.endm + +.macro SAD_END_32 + add v16.8h, v16.8h, v17.8h + add v17.8h, v18.8h, v19.8h + add v16.8h, v16.8h, v17.8h + uaddlv s0, v16.8h + fmov w0, s0 + ret +.endm + +.macro SAD_START_64 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 +.endm + +.macro SAD_64 + ld1 {v0.16b-v3.16b}, x0, x1 + ld1 {v4.16b-v7.16b}, x2, x3 + ld1 {v24.16b-v27.16b}, x0, x1 + ld1 {v28.16b-v31.16b}, x2, x3 + uabd v20.16b, v0.16b, v4.16b + uadalp v16.8h, v20.16b + uabd v21.16b, v1.16b, v5.16b + uadalp v17.8h, v21.16b + uabd v22.16b, v2.16b, v6.16b + uadalp v18.8h, v22.16b + uabd v23.16b, v3.16b, v7.16b + uadalp v19.8h, v23.16b + uabd v20.16b, v24.16b, v28.16b + uadalp v16.8h, v20.16b + uabd v21.16b, v25.16b, v29.16b + uadalp v17.8h, v21.16b + uabd v22.16b, v26.16b, v30.16b + uadalp v18.8h, v22.16b + uabd v23.16b, v27.16b, v31.16b + uadalp v19.8h, v23.16b +.endm + +.macro SAD_END_64 + uaddlp v16.4s, v16.8h + uadalp v16.4s, v17.8h + uadalp v16.4s, v18.8h + uadalp v16.4s, v19.8h + uaddlv d0, v16.4s + fmov x0, d0 + ret +.endm + +.macro SAD_START_12 + movrel x12, sad12_mask + ld1 {v31.16b}, x12 + movi v16.16b, #0 + movi v17.16b, #0 +.endm + +.macro SAD_12 + ld1 {v0.16b}, x0, x1 + and v0.16b, v0.16b, v31.16b + ld1 {v1.16b}, x2, x3 + and v1.16b, v1.16b, v31.16b + ld1 {v2.16b}, x0, x1 + and v2.16b, v2.16b, v31.16b + ld1 {v3.16b}, x2, x3 + and v3.16b, v3.16b, v31.16b + uabd v20.16b, v0.16b, v1.16b + uadalp v16.8h, v20.16b + uabd v21.16b, v2.16b, v3.16b + uadalp v17.8h, v21.16b +.endm + +.macro 
SAD_END_12 + add v16.8h, v16.8h, v17.8h + uaddlv s0, v16.8h + fmov w0, s0 + ret +.endm + +.macro SAD_START_24 + movi v16.16b, #0 + movi v17.16b, #0 + sub x1, x1, #16 + sub x3, x3, #16 +.endm + +.macro SAD_24 + ld1 {v0.16b}, x0, #16 + ld1 {v1.8b}, x0, x1 + ld1 {v2.16b}, x2, #16 + ld1 {v3.8b}, x2, x3 + ld1 {v4.16b}, x0, #16 + ld1 {v5.8b}, x0, x1 + ld1 {v6.16b}, x2, #16 + ld1 {v7.8b}, x2, x3 + uabd v20.16b, v0.16b, v2.16b + uadalp v16.8h, v20.16b + uabal v17.8h, v1.8b, v3.8b + uabd v20.16b, v4.16b, v6.16b
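Note: the SAD_* macros added above all implement the same reduction: accumulate |pix1 - pix2| per byte with UABD, widen pairwise into 16-bit lanes with UADALP (several accumulators are used to shorten dependency chains), and fold to a scalar at the end in the SAD_END_* macros. A minimal scalar sketch of what a WxH pixel_sad kernel computes (illustrative names, not the exact x265 prototypes):

    #include <cstdint>
    #include <cstdlib>

    // Scalar reference for an 8-bit WxH sum of absolute differences. The Neon
    // macros above split this sum across several 16-bit accumulators (v16-v19)
    // and only widen and reduce once per block.
    static uint32_t sad_ref(const uint8_t *pix1, intptr_t stride1,
                            const uint8_t *pix2, intptr_t stride2,
                            int width, int height)
    {
        uint32_t sum = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                sum += (uint32_t)abs((int)pix1[x] - (int)pix2[x]);
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }

The 12- and 24-wide variants follow the same pattern but mask or split the loads so that only the valid columns contribute to the sum.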
View file
x265_4.0.tar.gz/source/common/aarch64/sad-neon-dotprod.S
Added
@@ -0,0 +1,330 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "asm.S" + +.arch armv8.2-a+dotprod + +#ifdef __APPLE__ +.section __RODATA,__rodata +#else +.section .rodata +#endif + +.align 4 + +.text + +// Fully unrolled with single accumulator for smaller block heights. +.macro SAD_NEON_DOTPROD_16_S h +function PFX(pixel_sad_16x\h\()_neon_dotprod) + movi v0.16b, #0 + movi v1.16b, #1 +.rept \h - 2 + ldr q2, x0 + ldr q3, x2 + add x0, x0, x1 + add x2, x2, x3 + uabd v4.16b, v2.16b, v3.16b + udot v0.4s, v4.16b, v1.16b +.endr + ldr q2, x0 + ldr q3, x2 + uabd v4.16b, v2.16b, v3.16b + udot v0.4s, v4.16b, v1.16b + ldr q2, x0, x1 + ldr q3, x2, x3 + uabd v4.16b, v2.16b, v3.16b + udot v0.4s, v4.16b, v1.16b + + addv s0, v0.4s + fmov w0, s0 + ret +endfunc +.endm + +.macro SAD_NEON_DOTPROD_START + // v31: 1 across all lanes for use in UDOT instructions. + movi v31.16b, #1 + movi v16.16b, #0 + movi v17.16b, #0 +.endm + +.macro SAD_NEON_DOTPROD_END + add v16.4s, v16.4s, v17.4s + addv s0, v16.4s + fmov w0, s0 + ret +.endm + +// Fully unrolled. +.macro SAD_NEON_DOTPROD_16 h +function PFX(pixel_sad_16x\h\()_neon_dotprod) + SAD_NEON_DOTPROD_START +.rept \h / 2 + ld1 {v0.16b}, x0, x1 + ld1 {v1.16b}, x0, x1 + ld1 {v2.16b}, x2, x3 + ld1 {v3.16b}, x2, x3 + uabd v20.16b, v0.16b, v2.16b + udot v16.4s, v20.16b, v31.16b + uabd v21.16b, v1.16b, v3.16b + udot v17.4s, v21.16b, v31.16b +.endr + SAD_NEON_DOTPROD_END +endfunc +.endm + +// Process four rows of width 32. +.macro SAD_NEON_DOTPROD_32 +.rept 4 + ld1 {v0.16b-v1.16b}, x0, x1 + ld1 {v2.16b-v3.16b}, x2, x3 + uabd v20.16b, v0.16b, v2.16b + udot v16.4s, v20.16b, v31.16b + uabd v21.16b, v1.16b, v3.16b + udot v17.4s, v21.16b, v31.16b +.endr +.endm + +// Process four rows of width 48. +.macro SAD_NEON_DOTPROD_48 +.rept 4 + ld1 {v0.16b-v2.16b}, x0, x1 + ld1 {v4.16b-v6.16b}, x2, x3 + uabd v20.16b, v0.16b, v4.16b + udot v16.4s, v20.16b, v31.16b + uabd v21.16b, v1.16b, v5.16b + udot v17.4s, v21.16b, v31.16b + uabd v20.16b, v2.16b, v6.16b + udot v16.4s, v20.16b, v31.16b +.endr +.endm + +// Process four rows of width 64. 
+.macro SAD_NEON_DOTPROD_64 +.rept 4 + ld1 {v0.16b-v3.16b}, x0, x1 + ld1 {v4.16b-v7.16b}, x2, x3 + uabd v20.16b, v0.16b, v4.16b + udot v16.4s, v20.16b, v31.16b + uabd v21.16b, v1.16b, v5.16b + udot v17.4s, v21.16b, v31.16b + uabd v20.16b, v2.16b, v6.16b + udot v16.4s, v20.16b, v31.16b + uabd v21.16b, v3.16b, v7.16b + udot v17.4s, v21.16b, v31.16b +.endr +.endm + +// Loop unrolled to process 4 rows per iteration. +.macro SAD_NEON_DOTPROD_LOOP w, h +function PFX(pixel_sad_\w\()x\h\()_neon_dotprod) + SAD_NEON_DOTPROD_START + mov w9, #\h/4 +.Loop_\w\()x\h: + sub w9, w9, #1 + + SAD_NEON_DOTPROD_\w + + cbnz w9, .Loop_\w\()x\h + SAD_NEON_DOTPROD_END +endfunc +.endm + +SAD_NEON_DOTPROD_16_S 4 +SAD_NEON_DOTPROD_16_S 8 +SAD_NEON_DOTPROD_16_S 12 +SAD_NEON_DOTPROD_16_S 16 +SAD_NEON_DOTPROD_16 32 +SAD_NEON_DOTPROD_16 64 +SAD_NEON_DOTPROD_LOOP 32, 8 +SAD_NEON_DOTPROD_LOOP 32, 16 +SAD_NEON_DOTPROD_LOOP 32, 24 +SAD_NEON_DOTPROD_LOOP 32, 32 +SAD_NEON_DOTPROD_LOOP 32, 64 +SAD_NEON_DOTPROD_LOOP 48, 64 +SAD_NEON_DOTPROD_LOOP 64, 16 +SAD_NEON_DOTPROD_LOOP 64, 32 +SAD_NEON_DOTPROD_LOOP 64, 48 +SAD_NEON_DOTPROD_LOOP 64, 64 + +.macro PREP_ARGS_SAD_X_NEON_DOTPROD x + mov x9, #FENC_STRIDE + +// Make function arguments for x == 3 look like x == 4. +.if \x == 3 + mov x6, x5 + mov x5, x4 +.endif + + // v31: 1 across all lanes for use in UDOT instructions. + movi v31.16b, #1 +.endm + +.macro SAD_X_NEON_DOTPROD_START x + movi v16.4s, #0 + movi v17.4s, #0 + movi v18.4s, #0 +.if \x == 4 + movi v19.4s, #0 +.endif +.endm + +.macro SAD_X_NEON_DOTPROD_END x +.if \x == 3 + addv s0, v16.4s + addv s1, v17.4s + addv s2, v18.4s + stp s0, s1, x6
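Note: the DotProd kernels above lean on one identity: after UABD produces 16 per-byte absolute differences, a UDOT against a vector of all ones sums groups of four of them directly into 32-bit lanes, so no 16-bit accumulation stage is needed and overflow is not a concern for any block size. A rough intrinsics equivalent of one 16-wide row step (a sketch assuming a compiler target with the DotProd extension, e.g. -march=armv8.2-a+dotprod; names are illustrative):

    #include <arm_neon.h>

    // One row of a 16-wide SAD using the UABD + UDOT(ones) idiom.
    // acc carries four 32-bit partial sums; reduce with vaddvq_u32 at the end.
    static inline uint32x4_t sad16_row_dotprod(uint32x4_t acc,
                                               const uint8_t *a, const uint8_t *b)
    {
        uint8x16_t absdiff = vabdq_u8(vld1q_u8(a), vld1q_u8(b)); // |a - b| per byte
        uint8x16_t ones    = vdupq_n_u8(1);
        return vdotq_u32(acc, absdiff, ones);                    // acc[i] += sum of 4 bytes
    }

    // Final reduction after all rows: uint32_t sad = vaddvq_u32(acc);

The sad_x3/sad_x4 variants later in this file apply the same step to three or four reference pointers per row, keeping separate accumulators per reference.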
View file
x265_4.0.tar.gz/source/common/aarch64/sao-prim-sve.cpp
Added
@@ -0,0 +1,271 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "sao-prim.h" + +/* + * Compute Edge Offset statistics (count and stats). + * To save some instructions compute count and stats as negative values - since + * output of Neon comparison instructions for a matched condition is all 1s (-1). + */ +static inline void compute_eo_stats(const int8x16_t edge_type, + const int16_t *diff, int16x8_t *count, + int64x2_t *stats) +{ + // Create a mask for each edge type. + int8x16_t mask0 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-2))); + int8x16_t mask1 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-1))); + int8x16_t mask2 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(0))); + int8x16_t mask3 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(1))); + int8x16_t mask4 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(2))); + + // Compute negative counts for each edge type. + count0 = vpadalq_s8(count0, mask0); + count1 = vpadalq_s8(count1, mask1); + count2 = vpadalq_s8(count2, mask2); + count3 = vpadalq_s8(count3, mask3); + count4 = vpadalq_s8(count4, mask4); + + // Widen the masks to 16-bit. + int16x8_t mask0_lo = vreinterpretq_s16_s8(vzip1q_s8(mask0, mask0)); + int16x8_t mask0_hi = vreinterpretq_s16_s8(vzip2q_s8(mask0, mask0)); + int16x8_t mask1_lo = vreinterpretq_s16_s8(vzip1q_s8(mask1, mask1)); + int16x8_t mask1_hi = vreinterpretq_s16_s8(vzip2q_s8(mask1, mask1)); + int16x8_t mask2_lo = vreinterpretq_s16_s8(vzip1q_s8(mask2, mask2)); + int16x8_t mask2_hi = vreinterpretq_s16_s8(vzip2q_s8(mask2, mask2)); + int16x8_t mask3_lo = vreinterpretq_s16_s8(vzip1q_s8(mask3, mask3)); + int16x8_t mask3_hi = vreinterpretq_s16_s8(vzip2q_s8(mask3, mask3)); + int16x8_t mask4_lo = vreinterpretq_s16_s8(vzip1q_s8(mask4, mask4)); + int16x8_t mask4_hi = vreinterpretq_s16_s8(vzip2q_s8(mask4, mask4)); + + int16x8_t diff_lo = vld1q_s16(diff); + int16x8_t diff_hi = vld1q_s16(diff + 8); + + // Compute negative stats for each edge type. 
+ stats0 = x265_sdotq_s16(stats0, diff_lo, mask0_lo); + stats0 = x265_sdotq_s16(stats0, diff_hi, mask0_hi); + stats1 = x265_sdotq_s16(stats1, diff_lo, mask1_lo); + stats1 = x265_sdotq_s16(stats1, diff_hi, mask1_hi); + stats2 = x265_sdotq_s16(stats2, diff_lo, mask2_lo); + stats2 = x265_sdotq_s16(stats2, diff_hi, mask2_hi); + stats3 = x265_sdotq_s16(stats3, diff_lo, mask3_lo); + stats3 = x265_sdotq_s16(stats3, diff_hi, mask3_hi); + stats4 = x265_sdotq_s16(stats4, diff_lo, mask4_lo); + stats4 = x265_sdotq_s16(stats4, diff_hi, mask4_hi); +} + +/* + * Reduce and store Edge Offset statistics (count and stats). + */ +static inline void reduce_eo_stats(int64x2_t *vstats, int16x8_t *vcount, + int32_t *stats, int32_t *count) +{ + // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}. + int16x8_t c01 = vpaddq_s16(vcount2, vcount0); + int16x8_t c23 = vpaddq_s16(vcount1, vcount3); + int16x8_t c0123 = vpaddq_s16(c01, c23); + // Subtract from current count, as we calculate the negation. + vst1q_s32(count, vsubq_s32(vld1q_s32(count), vpaddlq_s16(c0123))); + count4 -= vaddvq_s16(vcount4); + + int32x4_t s01 = vcombine_s32(vmovn_s64(vstats2), vmovn_s64(vstats0)); + int32x4_t s23 = vcombine_s32(vmovn_s64(vstats1), vmovn_s64(vstats3)); + int32x4_t s0123 = vpaddq_s32(s01, s23); + // Subtract from current stats, as we calculate the negation. + vst1q_s32(stats, vsubq_s32(vld1q_s32(stats), s0123)); + stats4 -= vaddvq_s64(vstats4); +} + +namespace X265_NS { +void saoCuStatsE0_sve(const int16_t *diff, const pixel *rec, intptr_t stride, + int endX, int endY, int32_t *stats, int32_t *count) +{ + // Separate buffers for each edge type, so that we can vectorise. + int16x8_t tmp_count5 = { vdupq_n_s16(0), vdupq_n_s16(0), vdupq_n_s16(0), + vdupq_n_s16(0), vdupq_n_s16(0) }; + int64x2_t tmp_stats5 = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0), vdupq_n_s64(0) }; + + for (int y = 0; y < endY; y++) + { + // Calculate negated sign_left(x) directly, to save negation when + // reusing sign_right(x) as sign_left(x + 1). + int8x16_t neg_sign_left = vdupq_n_s8(x265_signOf(rec-1 - rec0)); + for (int x = 0; x < endX; x += 16) + { + int8x16_t sign_right = signOf_neon(rec + x, rec + x + 1); + + // neg_sign_left(x) = sign_right(x + 1), reusing one from previous + // iteration. + neg_sign_left = vextq_s8(neg_sign_left, sign_right, 15); + + // Subtract instead of add, as sign_left is negated. + int8x16_t edge_type = vsubq_s8(sign_right, neg_sign_left); + + // For reuse in the next iteration. + neg_sign_left = sign_right; + + edge_type = x265_sve_mask(x, endX, edge_type); + compute_eo_stats(edge_type, diff + x, tmp_count, tmp_stats); + } + + diff += MAX_CU_SIZE; + rec += stride; + } + + reduce_eo_stats(tmp_stats, tmp_count, stats, count); +} + +void saoCuStatsE1_sve(const int16_t *diff, const pixel *rec, intptr_t stride, + int8_t *upBuff1, int endX, int endY, int32_t *stats, + int32_t *count) +{ + // Separate buffers for each edge type, so that we can vectorise. + int16x8_t tmp_count5 = { vdupq_n_s16(0), vdupq_n_s16(0), vdupq_n_s16(0), + vdupq_n_s16(0), vdupq_n_s16(0) }; + int64x2_t tmp_stats5 = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0), vdupq_n_s64(0) }; + + // Negate upBuff1 (sign_up), so we can subtract and save repeated negations. 
+ for (int x = 0; x < endX; x += 16) + { + vst1q_s8(upBuff1 + x, vnegq_s8(vld1q_s8(upBuff1 + x))); + } + + for (int y = 0; y < endY; y++) + { + for (int x = 0; x < endX; x += 16) + { + int8x16_t sign_up = vld1q_s8(upBuff1 + x); + int8x16_t sign_down = signOf_neon(rec + x, rec + x + stride); + + // Subtract instead of add, as sign_up is negated. + int8x16_t edge_type = vsubq_s8(sign_down, sign_up); + + // For reuse in the next iteration. + vst1q_s8(upBuff1 + x, sign_down); + + edge_type = x265_sve_mask(x, endX, edge_type); + compute_eo_stats(edge_type, diff + x, tmp_count, tmp_stats); + } + + diff += MAX_CU_SIZE; + rec += stride; + } + + reduce_eo_stats(tmp_stats, tmp_count, stats, count); +} + +void saoCuStatsE2_sve(const int16_t *diff, const pixel *rec, intptr_t stride, + int8_t *upBuff1, int8_t *upBufft, int endX, int endY, + int32_t *stats, int32_t *count) +{ + // Separate buffers for each edge type, so that we can vectorise. + int16x8_t tmp_count5 = { vdupq_n_s16(0), vdupq_n_s16(0), vdupq_n_s16(0), + vdupq_n_s16(0), vdupq_n_s16(0) }; + int64x2_t tmp_stats5 = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0), vdupq_n_s64(0) }; + + // Negate upBuff1 (sign_up) so we can subtract and save repeated negations. + for (int x = 0; x < endX; x += 16) + { + vst1q_s8(upBuff1 + x, vnegq_s8(vld1q_s8(upBuff1 + x))); + } + + for (int y = 0; y < endY; y++) + { + upBufft0 = x265_signOf(rec-1 - recstride); + for (int x = 0; x < endX; x += 16) + {
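Note: the trick documented in the comments above is that Neon/SVE compare instructions return all ones (-1) for a match, so accumulating the comparison masks directly yields the negated per-class counts, and a signed dot product of the residuals against those masks yields the negated per-class sums; reduce_eo_stats subtracts the results to undo the negation. A scalar picture of the per-row work in saoCuStatsE0 (simplified; the real primitive also remaps the five classes through s_eoTable when storing, as the comments note):

    #include <cstdint>

    static inline int sign_of(int x) { return (x > 0) - (x < 0); }

    // Classify each pixel into one of five edge types from the signs of its
    // differences with the left and right neighbours, then accumulate the
    // residual and a counter for that type. sign_left(x + 1) is just
    // -sign_right(x), which is why the vector code keeps the negated value
    // around for the next iteration.
    static void eo_stats_row_ref(const int16_t *diff, const uint8_t *rec,
                                 int endX, int32_t stats[5], int32_t count[5])
    {
        int signLeft = sign_of((int)rec[0] - (int)rec[-1]);
        for (int x = 0; x < endX; x++)
        {
            int signRight = sign_of((int)rec[x] - (int)rec[x + 1]);
            int edgeType  = signRight + signLeft + 2;   // 0..4
            signLeft      = -signRight;
            stats[edgeType] += diff[x];
            count[edgeType]++;
        }
    }

saoCuStatsE1 and saoCuStatsE2 are the vertical and diagonal versions of the same classification, with the previous row's signs kept in upBuff1 (stored negated for the same reason).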
View file
x265_4.0.tar.gz/source/common/aarch64/sao-prim-sve2.cpp
Added
@@ -0,0 +1,317 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "sao-prim.h" + +static inline uint8x16_t sve_count(int8x16_t in) +{ + // We do not care about initialising the values in the rest of the vector, + // for VL > 128, as HISTSEG counts matching elements in 128-bit segments. + svint8_t edge_type = svset_neonq_s8(svundef_s8(), in); + + // Use an arbitrary value outside of range -2, 2 for lanes we don't + // need to use the result from. + const int DC = -3; + // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}. + // We use (edge_class - 2) resulting in {0, -2, -1, 1, 2} + int8x16_t idx = { 0, -2, -1, 1, 2, DC, DC, DC, DC, DC, DC, DC, DC, DC, DC, + DC }; + svint8_t svidx = svset_neonq_s8(svundef_s8(), idx); + + svuint8_t count = svhistseg_s8(svidx, edge_type); + return svget_neonq_u8(count); +} + +/* + * Compute Edge Offset statistics (stats array). + * To save some instructions compute stats as negative values - since output of + * Neon comparison instructions for a matched condition is all 1s (-1). + */ +static inline void compute_eo_stats(const int8x16_t edge_type, + const int16_t *diff, int64x2_t *stats) +{ + // Create a mask for each edge type. + int8x16_t mask0 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-2))); + int8x16_t mask1 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-1))); + int8x16_t mask2 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(0))); + int8x16_t mask3 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(1))); + int8x16_t mask4 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(2))); + + // Widen the masks to 16-bit. + int16x8_t mask0_lo = vreinterpretq_s16_s8(vzip1q_s8(mask0, mask0)); + int16x8_t mask0_hi = vreinterpretq_s16_s8(vzip2q_s8(mask0, mask0)); + int16x8_t mask1_lo = vreinterpretq_s16_s8(vzip1q_s8(mask1, mask1)); + int16x8_t mask1_hi = vreinterpretq_s16_s8(vzip2q_s8(mask1, mask1)); + int16x8_t mask2_lo = vreinterpretq_s16_s8(vzip1q_s8(mask2, mask2)); + int16x8_t mask2_hi = vreinterpretq_s16_s8(vzip2q_s8(mask2, mask2)); + int16x8_t mask3_lo = vreinterpretq_s16_s8(vzip1q_s8(mask3, mask3)); + int16x8_t mask3_hi = vreinterpretq_s16_s8(vzip2q_s8(mask3, mask3)); + int16x8_t mask4_lo = vreinterpretq_s16_s8(vzip1q_s8(mask4, mask4)); + int16x8_t mask4_hi = vreinterpretq_s16_s8(vzip2q_s8(mask4, mask4)); + + int16x8_t diff_lo = vld1q_s16(diff); + int16x8_t diff_hi = vld1q_s16(diff + 8); + + // Compute negative stats for each edge type. 
+ stats0 = x265_sdotq_s16(stats0, diff_lo, mask0_lo); + stats0 = x265_sdotq_s16(stats0, diff_hi, mask0_hi); + stats1 = x265_sdotq_s16(stats1, diff_lo, mask1_lo); + stats1 = x265_sdotq_s16(stats1, diff_hi, mask1_hi); + stats2 = x265_sdotq_s16(stats2, diff_lo, mask2_lo); + stats2 = x265_sdotq_s16(stats2, diff_hi, mask2_hi); + stats3 = x265_sdotq_s16(stats3, diff_lo, mask3_lo); + stats3 = x265_sdotq_s16(stats3, diff_hi, mask3_hi); + stats4 = x265_sdotq_s16(stats4, diff_lo, mask4_lo); + stats4 = x265_sdotq_s16(stats4, diff_hi, mask4_hi); +} + +/* + * Reduce and store Edge Offset statistics (count and stats). + */ +static inline void reduce_eo_stats(int64x2_t *vstats, uint16x8_t vcount, + int32_t *stats, int32_t *count) +{ + // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}. + // We already have the count values in the correct order for the store, + // so widen to 32-bit and accumulate to the destination. + int32x4_t c0123 = vmovl_s16(vget_low_s16(vreinterpretq_s16_u16(vcount))); + vst1q_s32(count, vaddq_s32(vld1q_s32(count), c0123)); + count4 += vcount4; + + int32x4_t s01 = vcombine_s32(vmovn_s64(vstats2), vmovn_s64(vstats0)); + int32x4_t s23 = vcombine_s32(vmovn_s64(vstats1), vmovn_s64(vstats3)); + int32x4_t s0123 = vpaddq_s32(s01, s23); + // Subtract from current stats, as we calculate the negation. + vst1q_s32(stats, vsubq_s32(vld1q_s32(stats), s0123)); + stats4 -= vaddvq_s64(vstats4); +} + +namespace X265_NS { +void saoCuStatsE0_sve2(const int16_t *diff, const pixel *rec, intptr_t stride, + int endX, int endY, int32_t *stats, int32_t *count) +{ + // Separate buffers for each edge type, so that we can vectorise. + int64x2_t tmp_stats5 = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0), vdupq_n_s64(0) }; + uint16x8_t count_acc_u16 = vdupq_n_u16(0); + + for (int y = 0; y < endY; y++) + { + uint8x16_t count_acc_u8 = vdupq_n_u8(0); + + // Calculate negated sign_left(x) directly, to save negation when + // reusing sign_right(x) as sign_left(x + 1). + int8x16_t neg_sign_left = vdupq_n_s8(x265_signOf(rec-1 - rec0)); + for (int x = 0; x < endX; x += 16) + { + int8x16_t sign_right = signOf_neon(rec + x, rec + x + 1); + + // neg_sign_left(x) = sign_right(x + 1), reusing one from previous + // iteration. + neg_sign_left = vextq_s8(neg_sign_left, sign_right, 15); + + // Subtract instead of add, as sign_left is negated. + int8x16_t edge_type = vsubq_s8(sign_right, neg_sign_left); + + // For reuse in the next iteration. + neg_sign_left = sign_right; + + edge_type = x265_sve_mask(x, endX, edge_type); + count_acc_u8 = vaddq_u8(count_acc_u8, sve_count(edge_type)); + compute_eo_stats(edge_type, diff + x, tmp_stats); + } + + // The width (endX) can be a maximum of 64, so we can safely + // widen from 8-bit count accumulators after one inner loop iteration. + // Technically the largest an accumulator could reach after one inner + // loop iteration is 64, if every input value had the same edge type, so + // we could complete two iterations (2 * 64 = 128) before widening. + count_acc_u16 = vaddw_u8(count_acc_u16, vget_low_u8(count_acc_u8)); + + diff += MAX_CU_SIZE; + rec += stride; + } + + reduce_eo_stats(tmp_stats, count_acc_u16, stats, count); +} + +void saoCuStatsE1_sve2(const int16_t *diff, const pixel *rec, intptr_t stride, + int8_t *upBuff1, int endX, int endY, int32_t *stats, + int32_t *count) +{ + // Separate buffers for each edge type, so that we can vectorise. 
+ int64x2_t tmp_stats5 = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0), vdupq_n_s64(0) }; + uint16x8_t count_acc_u16 = vdupq_n_u16(0); + + // Negate upBuff1 (sign_up), so we can subtract and save repeated negations. + for (int x = 0; x < endX; x += 16) + { + vst1q_s8(upBuff1 + x, vnegq_s8(vld1q_s8(upBuff1 + x))); + } + + for (int y = 0; y < endY; y++) + { + uint8x16_t count_acc_u8 = vdupq_n_u8(0); + + for (int x = 0; x < endX; x += 16) + { + int8x16_t sign_up = vld1q_s8(upBuff1 + x); + int8x16_t sign_down = signOf_neon(rec + x, rec + x + stride); + + // Subtract instead of add, as sign_up is negated. + int8x16_t edge_type = vsubq_s8(sign_down, sign_up); + + // For reuse in the next iteration. + vst1q_s8(upBuff1 + x, sign_down); + + edge_type = x265_sve_mask(x, endX, edge_type); + count_acc_u8 = vaddq_u8(count_acc_u8, sve_count(edge_type)); + compute_eo_stats(edge_type, diff + x, tmp_stats); + } + + // The width (endX) can be a maximum of 64, so we can safely + // widen from 8-bit count accumulators after one inner loop iteration. + // Technically the largest an accumulator could reach after one inner + // loop iteration is 64, if every input value had the same edge type, so + // we could complete two iterations (2 * 64 = 128) before widening. + count_acc_u16 = vaddw_u8(count_acc_u16, vget_low_u8(count_acc_u8)); +
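Note: relative to the SVE version, the SVE2 path replaces the five per-class mask accumulations on the count side with a single HISTSEG per 16 classified bytes: sve_count above asks, for each wanted class value in the index vector {0, -2, -1, 1, 2}, how many bytes of edge_type match it within the 128-bit segment. A scalar model of that behaviour for one segment (illustrative; see the Arm SVE2 documentation for the full semantics):

    #include <cstdint>

    // For each byte of idx, count how many of the 16 classified bytes in
    // edge_type are equal to it. One HISTSEG replaces five compare-and-
    // accumulate chains on the count side of the statistics.
    static void histseg_model(const int8_t edge_type[16], const int8_t idx[16],
                              uint8_t out[16])
    {
        for (int i = 0; i < 16; i++)
        {
            uint8_t c = 0;
            for (int j = 0; j < 16; j++)
                c += (uint8_t)(edge_type[j] == idx[i]);
            out[i] = c;
        }
    }

Because the per-row counts fit easily in a byte (endX is at most 64), the 8-bit counters are only widened to 16-bit once per row, as the comment in the loop explains.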
View file
x265_4.0.tar.gz/source/common/aarch64/sao-prim.cpp
Added
@@ -0,0 +1,380 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "sao-prim.h" +#include "sao.h" +#include <arm_neon.h> + +// Predicate mask indices. +static const int8_t quad_reg_byte_indices16 = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + +static inline int8x16_t mask_inactive_elems(const int rem, int8x16_t edge_type) +{ + // Compute a predicate mask where the bits of an element are 0 if the index + // is less than the remainder (active), and 1 otherwise. + const int8x16_t indices = vld1q_s8(quad_reg_byte_indices); + int8x16_t pred = vreinterpretq_s8_u8(vcgeq_s8(indices, vdupq_n_s8(rem))); + + // Use predicate mask to shift "unused lanes" outside of range -2, 2 + pred = vshlq_n_s8(pred, 3); + return veorq_s8(edge_type, pred); +} + +/* + * Compute Edge Offset statistics (count and stats). + * To save some instructions compute count and stats as negative values - since + * output of Neon comparison instructions for a matched condition is all 1s (-1). + */ +static inline void compute_eo_stats(const int8x16_t edge_type, + const int16_t *diff, int16x8_t *count, + int32x4_t *stats) +{ + // Create a mask for each edge type. + int8x16_t mask0 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-2))); + int8x16_t mask1 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(-1))); + int8x16_t mask2 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(0))); + int8x16_t mask3 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(1))); + int8x16_t mask4 = vreinterpretq_s8_u8(vceqq_s8(edge_type, vdupq_n_s8(2))); + + // Compute negative counts for each edge type. + count0 = vpadalq_s8(count0, mask0); + count1 = vpadalq_s8(count1, mask1); + count2 = vpadalq_s8(count2, mask2); + count3 = vpadalq_s8(count3, mask3); + count4 = vpadalq_s8(count4, mask4); + + // Widen the masks to 16-bit. 
+ int16x8_t mask0_lo = vreinterpretq_s16_s8(vzip1q_s8(mask0, mask0)); + int16x8_t mask0_hi = vreinterpretq_s16_s8(vzip2q_s8(mask0, mask0)); + int16x8_t mask1_lo = vreinterpretq_s16_s8(vzip1q_s8(mask1, mask1)); + int16x8_t mask1_hi = vreinterpretq_s16_s8(vzip2q_s8(mask1, mask1)); + int16x8_t mask2_lo = vreinterpretq_s16_s8(vzip1q_s8(mask2, mask2)); + int16x8_t mask2_hi = vreinterpretq_s16_s8(vzip2q_s8(mask2, mask2)); + int16x8_t mask3_lo = vreinterpretq_s16_s8(vzip1q_s8(mask3, mask3)); + int16x8_t mask3_hi = vreinterpretq_s16_s8(vzip2q_s8(mask3, mask3)); + int16x8_t mask4_lo = vreinterpretq_s16_s8(vzip1q_s8(mask4, mask4)); + int16x8_t mask4_hi = vreinterpretq_s16_s8(vzip2q_s8(mask4, mask4)); + + int16x8_t diff_lo = vld1q_s16(diff); + int16x8_t diff_hi = vld1q_s16(diff + 8); + + // Compute negative stats for each edge type. + int16x8_t stats0 = vmulq_s16(diff_lo, mask0_lo); + int16x8_t stats1 = vmulq_s16(diff_lo, mask1_lo); + int16x8_t stats2 = vmulq_s16(diff_lo, mask2_lo); + int16x8_t stats3 = vmulq_s16(diff_lo, mask3_lo); + int16x8_t stats4 = vmulq_s16(diff_lo, mask4_lo); + stats0 = vmlaq_s16(stats0, diff_hi, mask0_hi); + stats1 = vmlaq_s16(stats1, diff_hi, mask1_hi); + stats2 = vmlaq_s16(stats2, diff_hi, mask2_hi); + stats3 = vmlaq_s16(stats3, diff_hi, mask3_hi); + stats4 = vmlaq_s16(stats4, diff_hi, mask4_hi); + + stats0 = vpadalq_s16(stats0, stats0); + stats1 = vpadalq_s16(stats1, stats1); + stats2 = vpadalq_s16(stats2, stats2); + stats3 = vpadalq_s16(stats3, stats3); + stats4 = vpadalq_s16(stats4, stats4); +} + +/* + * Reduce and store Edge Offset statistics (count and stats). + */ +static inline void reduce_eo_stats(int32x4_t *vstats, int16x8_t *vcount, + int32_t *stats, int32_t *count) +{ + // s_eoTable maps edge types to memory in order: {2, 0, 1, 3, 4}. + int16x8_t c01 = vpaddq_s16(vcount2, vcount0); + int16x8_t c23 = vpaddq_s16(vcount1, vcount3); + int16x8_t c0123 = vpaddq_s16(c01, c23); + + // Subtract from current count, as we calculate the negation. + vst1q_s32(count, vsubq_s32(vld1q_s32(count), vpaddlq_s16(c0123))); + count4 -= vaddvq_s16(vcount4); + + int32x4_t s01 = vpaddq_s32(vstats2, vstats0); + int32x4_t s23 = vpaddq_s32(vstats1, vstats3); + int32x4_t s0123 = vpaddq_s32(s01, s23); + + // Subtract from current stats, as we calculate the negation. + vst1q_s32(stats, vsubq_s32(vld1q_s32(stats), s0123)); + stats4 -= vaddvq_s32(vstats4); +} + +namespace X265_NS { +void saoCuStatsBO_neon(const int16_t *diff, const pixel *rec, intptr_t stride, + int endX, int endY, int32_t *stats, int32_t *count) +{ +#if HIGH_BIT_DEPTH + const int n_elem = 4; + const int elem_width = 16; +#else + const int n_elem = 8; + const int elem_width = 8; +#endif + + // Additional temporary buffer for accumulation. + int32_t stats_tmp32 = { 0 }; + int32_t count_tmp32 = { 0 }; + + // Byte-addressable pointers to buffers, to optimise address calculation. + uint8_t *stats_b2 = { + reinterpret_cast<uint8_t *>(stats), + reinterpret_cast<uint8_t *>(stats_tmp), + }; + uint8_t *count_b2 = { + reinterpret_cast<uint8_t *>(count), + reinterpret_cast<uint8_t *>(count_tmp), + }; + + // Combine shift for index calculation with shift for address calculation. + const int right_shift = X265_DEPTH - X265_NS::SAO::SAO_BO_BITS; + const int left_shift = 2; + const int shift = right_shift - left_shift; + // Mask out bits 7, 1 & 0 to account for combination of shifts. + const int mask = 0x7c; + + // Compute statistics into temporary buffers. 
+ for (int y = 0; y < endY; y++) + { + int x = 0; + for (; x + n_elem < endX; x += n_elem) + { + uint64_t class_idx_64 = + *reinterpret_cast<const uint64_t *>(rec + x) >> shift; + + for (int i = 0; i < n_elem; ++i) + { + const int idx = i & 1; + const int off = (class_idx_64 >> (i * elem_width)) & mask; + *reinterpret_cast<uint32_t*>(stats_bidx + off) += diffx + i; + *reinterpret_cast<uint32_t*>(count_bidx + off) += 1; + } + } + + if (x < endX) + { + uint64_t class_idx_64 = + *reinterpret_cast<const uint64_t *>(rec + x) >> shift; + + for (int i = 0; (i + x) < endX; ++i) + { + const int idx = i & 1; + const int off = (class_idx_64 >> (i * elem_width)) & mask; + *reinterpret_cast<uint32_t*>(stats_bidx + off) += diffx + i; + *reinterpret_cast<uint32_t*>(count_bidx + off) += 1; + } + } + + diff += MAX_CU_SIZE; + rec += stride; + } + + // Reduce temporary buffers to destination using Neon. + for (int i = 0; i < 32; i += 4) + { + int32x4_t s0 = vld1q_s32(stats_tmp + i); + int32x4_t s1 = vld1q_s32(stats + i);
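Note: saoCuStatsBO_neon gathers band-offset statistics: each reconstructed pixel selects one of 32 bands by its top SAO_BO_BITS bits, and the residual plus a counter are accumulated for that band. The scalar core is roughly the following (a sketch assuming 8-bit pixels; MAX_CU_SIZE, the diff-row stride, is 64 in x265):

    #include <cstdint>

    static void bo_stats_ref(const int16_t *diff, const uint8_t *rec,
                             intptr_t stride, int endX, int endY,
                             int32_t stats[32], int32_t count[32])
    {
        const int shift = 8 - 5;            // X265_DEPTH - SAO_BO_BITS for 8-bit input
        for (int y = 0; y < endY; y++)
        {
            for (int x = 0; x < endX; x++)
            {
                int band = rec[x] >> shift; // 0..31
                stats[band] += diff[x];
                count[band]++;
            }
            diff += 64;                     // MAX_CU_SIZE
            rec  += stride;
        }
    }

The optimised version above folds the "shift right, then multiply by 4 for the int32 index" address math into a single shift plus mask on byte-addressed pointers, alternates between the destination arrays and a temporary pair to break store-to-load dependency chains, and merges the temporaries back with a short Neon reduction at the end.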
View file
x265_4.0.tar.gz/source/common/aarch64/sao-prim.h
Added
@@ -0,0 +1,70 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_COMMON_AARCH64_SAO_PRIM_H +#define X265_COMMON_AARCH64_SAO_PRIM_H + +#include "neon-sve-bridge.h" +#include "primitives.h" +#include <arm_neon.h> + +static inline int8x16_t signOf_neon(const pixel *a, const pixel *b) +{ +#if HIGH_BIT_DEPTH + uint16x8_t s0_lo = vld1q_u16(a); + uint16x8_t s0_hi = vld1q_u16(a + 8); + uint16x8_t s1_lo = vld1q_u16(b); + uint16x8_t s1_hi = vld1q_u16(b + 8); + + // signOf(a - b) = -(a > b ? -1 : 0) | (a < b ? -1 : 0) + int16x8_t cmp0_lo = vreinterpretq_s16_u16(vcgtq_u16(s0_lo, s1_lo)); + int16x8_t cmp0_hi = vreinterpretq_s16_u16(vcgtq_u16(s0_hi, s1_hi)); + int16x8_t cmp1_lo = vreinterpretq_s16_u16(vcgtq_u16(s1_lo, s0_lo)); + int16x8_t cmp1_hi = vreinterpretq_s16_u16(vcgtq_u16(s1_hi, s0_hi)); + + int8x16_t cmp0 = vcombine_s8(vmovn_s16(cmp0_lo), vmovn_s16(cmp0_hi)); + int8x16_t cmp1 = vcombine_s8(vmovn_s16(cmp1_lo), vmovn_s16(cmp1_hi)); +#else // HIGH_BIT_DEPTH + uint8x16_t s0 = vld1q_u8(a); + uint8x16_t s1 = vld1q_u8(b); + + // signOf(a - b) = -(a > b ? -1 : 0) | (a < b ? -1 : 0) + int8x16_t cmp0 = vreinterpretq_s8_u8(vcgtq_u8(s0, s1)); + int8x16_t cmp1 = vreinterpretq_s8_u8(vcgtq_u8(s1, s0)); +#endif // HIGH_BIT_DEPTH + return vorrq_s8(vnegq_s8(cmp0), cmp1); +} + +namespace X265_NS { +void setupSaoPrimitives_neon(EncoderPrimitives &p); + +#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE +void setupSaoPrimitives_sve(EncoderPrimitives &p); +#endif + +#if defined(HAVE_SVE2) && HAVE_SVE_BRIDGE +void setupSaoPrimitives_sve2(EncoderPrimitives &p); +#endif +} + +#endif // X265_COMMON_AARCH64_SAO_PRIM_H
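Note: signOf_neon avoids a widening subtraction by using the identity spelled out in its comments: with cmp0 = (a > b ? -1 : 0) and cmp1 = (a < b ? -1 : 0), sign(a - b) == (-cmp0) | cmp1. A scalar check of that identity (illustrative only):

    // Returns +1 if a > b, -1 if a < b, 0 if equal, using only the two
    // comparison masks - exactly what the vector version builds with the
    // unsigned compare intrinsics.
    static inline int sign_of_via_masks(unsigned a, unsigned b)
    {
        int cmp0 = (a > b) ? -1 : 0;
        int cmp1 = (a < b) ? -1 : 0;
        return (-cmp0) | cmp1;
    }

For 10-bit input the comparisons are done on 16-bit lanes and narrowed, which is why the HIGH_BIT_DEPTH branch is longer but ends in the same OR of a negated mask with the other mask.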
View file
x265_3.6.tar.gz/source/common/aarch64/ssd-a-common.S -> x265_4.0.tar.gz/source/common/aarch64/ssd-a-common.S
Changed
@@ -29,9 +29,7 @@
 .arch armv8-a

 .macro ret_v0_w0
-    trn2            v1.2d, v0.2d, v0.2d
-    add             v0.2s, v0.2s, v1.2s
-    addp            v0.2s, v0.2s, v0.2s
+    addv            s0, v0.4s
     fmov            w0, s0
     ret
 .endm
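Note: both forms of ret_v0_w0 produce the horizontal sum of the four 32-bit lanes of v0; ADDV simply does the whole reduction in one instruction. In intrinsics terms the macro now amounts to (illustrative):

    #include <arm_neon.h>

    static inline uint32_t reduce_u32x4(uint32x4_t v)
    {
        return vaddvq_u32(v);   // replaces the old trn2 + add + addp sequence
    }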
View file
x265_3.6.tar.gz/source/common/aarch64/ssd-a-sve2.S -> x265_4.0.tar.gz/source/common/aarch64/ssd-a-sve2.S
Changed
@@ -36,267 +36,6 @@ .text -function PFX(pixel_sse_pp_32x32_sve2) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_pixel_sse_pp_32x32 - mov w12, #8 - movi v0.16b, #0 - movi v1.16b, #0 -.loop_sse_pp_32_sve2: - sub w12, w12, #1 -.rept 4 - ld1 {v16.16b,v17.16b}, x0, x1 - ld1 {v18.16b,v19.16b}, x2, x3 - usubl v2.8h, v16.8b, v18.8b - usubl2 v3.8h, v16.16b, v18.16b - usubl v4.8h, v17.8b, v19.8b - usubl2 v5.8h, v17.16b, v19.16b - smlal v0.4s, v2.4h, v2.4h - smlal2 v1.4s, v2.8h, v2.8h - smlal v0.4s, v3.4h, v3.4h - smlal2 v1.4s, v3.8h, v3.8h - smlal v0.4s, v4.4h, v4.4h - smlal2 v1.4s, v4.8h, v4.8h - smlal v0.4s, v5.4h, v5.4h - smlal2 v1.4s, v5.8h, v5.8h -.endr - cbnz w12, .loop_sse_pp_32_sve2 - add v0.4s, v0.4s, v1.4s - ret_v0_w0 -.vl_gt_16_pixel_sse_pp_32x32: - ptrue p0.b, vl32 - ld1b {z16.b}, p0/z, x0 - ld1b {z18.b}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - usublb z1.h, z16.b, z18.b - usublt z2.h, z16.b, z18.b - smullb z0.s, z1.h, z1.h - smlalt z0.s, z1.h, z1.h - smlalb z0.s, z2.h, z2.h - smlalt z0.s, z2.h, z2.h -.rept 31 - ld1b {z16.b}, p0/z, x0 - ld1b {z18.b}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - usublb z1.h, z16.b, z18.b - usublt z2.h, z16.b, z18.b - smullb z0.s, z1.h, z1.h - smlalt z0.s, z1.h, z1.h - smlalb z0.s, z2.h, z2.h - smlalt z0.s, z2.h, z2.h -.endr - uaddv d3, p0, z0.s - fmov w0, s3 - ret -endfunc - -function PFX(pixel_sse_pp_32x64_sve2) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_pixel_sse_pp_32x64 - ptrue p0.b, vl16 - ld1b {z16.b}, p0/z, x0 - ld1b {z17.b}, p0/z, x0, #1, mul vl - ld1b {z18.b}, p0/z, x2 - ld1b {z19.b}, p0/z, x2, #1, mul vl - add x0, x0, x1 - add x2, x2, x3 - usublb z1.h, z16.b, z18.b - usublt z2.h, z16.b, z18.b - usublb z3.h, z17.b, z19.b - usublt z4.h, z17.b, z19.b - smullb z20.s, z1.h, z1.h - smullt z21.s, z1.h, z1.h - smlalb z20.s, z2.h, z2.h - smlalt z21.s, z2.h, z2.h - smlalb z20.s, z3.h, z3.h - smlalt z21.s, z3.h, z3.h - smlalb z20.s, z4.h, z4.h - smlalt z21.s, z4.h, z4.h -.rept 63 - ld1b {z16.b}, p0/z, x0 - ld1b {z17.b}, p0/z, x0, #1, mul vl - ld1b {z18.b}, p0/z, x2 - ld1b {z19.b}, p0/z, x2, #1, mul vl - add x0, x0, x1 - add x2, x2, x3 - usublb z1.h, z16.b, z18.b - usublt z2.h, z16.b, z18.b - usublb z3.h, z17.b, z19.b - usublt z4.h, z17.b, z19.b - smlalb z20.s, z1.h, z1.h - smlalt z21.s, z1.h, z1.h - smlalb z20.s, z2.h, z2.h - smlalt z21.s, z2.h, z2.h - smlalb z20.s, z3.h, z3.h - smlalt z21.s, z3.h, z3.h - smlalb z20.s, z4.h, z4.h - smlalt z21.s, z4.h, z4.h -.endr - uaddv d3, p0, z20.s - fmov w0, s3 - uaddv d4, p0, z21.s - fmov w1, s4 - add w0, w0, w1 - ret -.vl_gt_16_pixel_sse_pp_32x64: - ptrue p0.b, vl32 - ld1b {z16.b}, p0/z, x0 - ld1b {z18.b}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - usublb z1.h, z16.b, z18.b - usublt z2.h, z16.b, z18.b - smullb z20.s, z1.h, z1.h - smullt z21.s, z1.h, z1.h - smlalb z20.s, z2.h, z2.h - smlalt z21.s, z2.h, z2.h -.rept 63 - ld1b {z16.b}, p0/z, x0 - ld1b {z18.b}, p0/z, x2 - add x0, x0, x1 - add x2, x2, x3 - usublb z1.h, z16.b, z18.b - usublt z2.h, z16.b, z18.b - smlalb z20.s, z1.h, z1.h - smlalt z21.s, z1.h, z1.h - smlalb z20.s, z2.h, z2.h - smlalt z21.s, z2.h, z2.h -.endr - uaddv d3, p0, z20.s - fmov w0, s3 - uaddv d4, p0, z21.s - fmov w1, s4 - add w0, w0, w1 - ret -endfunc - -function PFX(pixel_sse_pp_64x64_sve2) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_pixel_sse_pp_64x64 - mov w12, #16 - movi v0.16b, #0 - movi v1.16b, #0 - -.loop_sse_pp_64_sve2: - sub w12, w12, #1 -.rept 4 - ld1 {v16.16b-v19.16b}, x0, x1 - ld1 {v20.16b-v23.16b}, x2, x3 - - usubl v2.8h, v16.8b, v20.8b - usubl2 v3.8h, v16.16b, v20.16b - usubl 
v4.8h, v17.8b, v21.8b - usubl2 v5.8h, v17.16b, v21.16b - smlal v0.4s, v2.4h, v2.4h - smlal2 v1.4s, v2.8h, v2.8h - smlal v0.4s, v3.4h, v3.4h - smlal2 v1.4s, v3.8h, v3.8h - smlal v0.4s, v4.4h, v4.4h - smlal2 v1.4s, v4.8h, v4.8h - smlal v0.4s, v5.4h, v5.4h - smlal2 v1.4s, v5.8h, v5.8h - - usubl v2.8h, v18.8b, v22.8b - usubl2 v3.8h, v18.16b, v22.16b - usubl v4.8h, v19.8b, v23.8b - usubl2 v5.8h, v19.16b, v23.16b - smlal v0.4s, v2.4h, v2.4h - smlal2 v1.4s, v2.8h, v2.8h - smlal v0.4s, v3.4h, v3.4h - smlal2 v1.4s, v3.8h, v3.8h - smlal v0.4s, v4.4h, v4.4h - smlal2 v1.4s, v4.8h, v4.8h - smlal v0.4s, v5.4h, v5.4h - smlal2 v1.4s, v5.8h, v5.8h -.endr - cbnz w12, .loop_sse_pp_64_sve2 - add v0.4s, v0.4s, v1.4s - ret_v0_w0 -.vl_gt_16_pixel_sse_pp_64x64: - cmp x9, #48 - bgt .vl_gt_48_pixel_sse_pp_64x64 - ptrue p0.b, vl32 - ld1b {z16.b}, p0/z, x0 - ld1b {z17.b}, p0/z, x0, #1, mul vl - ld1b {z20.b}, p0/z, x2 - ld1b {z21.b}, p0/z, x2, #1, mul vl - add x0, x0, x1 - add x2, x2, x3 - usublb z1.h, z16.b, z20.b - usublt z2.h, z16.b, z20.b - usublb z3.h, z17.b, z21.b - usublt z4.h, z17.b, z21.b - smullb z24.s, z1.h, z1.h
View file
x265_3.6.tar.gz/source/common/aarch64/ssd-a.S -> x265_4.0.tar.gz/source/common/aarch64/ssd-a.S
Changed
@@ -2,6 +2,7 @@ * Copyright (C) 2021 MulticoreWare, Inc * * Authors: Sebastian Pop <spop@amazon.com> + * Hari Limaye <hari.limaye@arm.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -34,217 +35,145 @@ .text -function PFX(pixel_sse_pp_4x4_neon) - ld1 {v16.s}0, x0, x1 - ld1 {v17.s}0, x2, x3 - ld1 {v18.s}0, x0, x1 - ld1 {v19.s}0, x2, x3 - ld1 {v20.s}0, x0, x1 - ld1 {v21.s}0, x2, x3 - ld1 {v22.s}0, x0, x1 - ld1 {v23.s}0, x2, x3 - - usubl v1.8h, v16.8b, v17.8b - usubl v2.8h, v18.8b, v19.8b - usubl v3.8h, v20.8b, v21.8b - usubl v4.8h, v22.8b, v23.8b - - smull v0.4s, v1.4h, v1.4h - smlal v0.4s, v2.4h, v2.4h - smlal v0.4s, v3.4h, v3.4h - smlal v0.4s, v4.4h, v4.4h - ret_v0_w0 -endfunc +// Fully unrolled. +.macro SSE_PP_4xN h +function PFX(pixel_sse_pp_4x\h\()_neon) + movi v0.4s, #0 +.rept \h / 2 + ldr s16, x0 + ldr s17, x2 + add x0, x0, x1 + add x2, x2, x3 + ld1 {v16.s}1, x0, x1 + ld1 {v17.s}1, x2, x3 -function PFX(pixel_sse_pp_4x8_neon) - ld1 {v16.s}0, x0, x1 - ld1 {v17.s}0, x2, x3 - usubl v1.8h, v16.8b, v17.8b - ld1 {v16.s}0, x0, x1 - ld1 {v17.s}0, x2, x3 - smull v0.4s, v1.4h, v1.4h -.rept 6 - usubl v1.8h, v16.8b, v17.8b - ld1 {v16.s}0, x0, x1 - smlal v0.4s, v1.4h, v1.4h - ld1 {v17.s}0, x2, x3 + uabd v1.8b, v16.8b, v17.8b + umull v20.8h, v1.8b, v1.8b + uadalp v0.4s, v20.8h .endr - usubl v1.8h, v16.8b, v17.8b - smlal v0.4s, v1.4h, v1.4h ret_v0_w0 endfunc +.endm -function PFX(pixel_sse_pp_8x8_neon) - ld1 {v16.8b}, x0, x1 - ld1 {v17.8b}, x2, x3 - usubl v1.8h, v16.8b, v17.8b - ld1 {v16.8b}, x0, x1 - smull v0.4s, v1.4h, v1.4h - smlal2 v0.4s, v1.8h, v1.8h - ld1 {v17.8b}, x2, x3 - -.rept 6 - usubl v1.8h, v16.8b, v17.8b - ld1 {v16.8b}, x0, x1 - smlal v0.4s, v1.4h, v1.4h - smlal2 v0.4s, v1.8h, v1.8h - ld1 {v17.8b}, x2, x3 -.endr - usubl v1.8h, v16.8b, v17.8b - smlal v0.4s, v1.4h, v1.4h - smlal2 v0.4s, v1.8h, v1.8h - ret_v0_w0 -endfunc +SSE_PP_4xN 4 +SSE_PP_4xN 8 -function PFX(pixel_sse_pp_8x16_neon) - ld1 {v16.8b}, x0, x1 - ld1 {v17.8b}, x2, x3 - usubl v1.8h, v16.8b, v17.8b +// Fully unrolled. +.macro SSE_PP_8xN h +function PFX(pixel_sse_pp_8x\h\()_neon) + movi v0.4s, #0 +.rept \h ld1 {v16.8b}, x0, x1 - smull v0.4s, v1.4h, v1.4h - smlal2 v0.4s, v1.8h, v1.8h ld1 {v17.8b}, x2, x3 -.rept 14 - usubl v1.8h, v16.8b, v17.8b - ld1 {v16.8b}, x0, x1 - smlal v0.4s, v1.4h, v1.4h - smlal2 v0.4s, v1.8h, v1.8h - ld1 {v17.8b}, x2, x3 + uabd v1.8b, v16.8b, v17.8b + umull v20.8h, v1.8b, v1.8b + uadalp v0.4s, v20.8h .endr - usubl v1.8h, v16.8b, v17.8b - smlal v0.4s, v1.4h, v1.4h - smlal2 v0.4s, v1.8h, v1.8h ret_v0_w0 endfunc +.endm + +SSE_PP_8xN 8 +SSE_PP_8xN 16 -.macro sse_pp_16xN h +// Fully unrolled. 
+.macro SSE_PP_16xN h function PFX(pixel_sse_pp_16x\h\()_neon) + movi v0.4s, #0 + movi v1.4s, #0 +.rept \h ld1 {v16.16b}, x0, x1 ld1 {v17.16b}, x2, x3 - usubl v1.8h, v16.8b, v17.8b - usubl2 v2.8h, v16.16b, v17.16b - ld1 {v16.16b}, x0, x1 - ld1 {v17.16b}, x2, x3 - smull v0.4s, v1.4h, v1.4h - smlal2 v0.4s, v1.8h, v1.8h - smlal v0.4s, v2.4h, v2.4h - smlal2 v0.4s, v2.8h, v2.8h -.rept \h - 2 - usubl v1.8h, v16.8b, v17.8b - usubl2 v2.8h, v16.16b, v17.16b - ld1 {v16.16b}, x0, x1 - smlal v0.4s, v1.4h, v1.4h - smlal2 v0.4s, v1.8h, v1.8h - ld1 {v17.16b}, x2, x3 - smlal v0.4s, v2.4h, v2.4h - smlal2 v0.4s, v2.8h, v2.8h + + uabd v2.16b, v16.16b, v17.16b + umull v20.8h, v2.8b, v2.8b + uadalp v0.4s, v20.8h + umull2 v21.8h, v2.16b, v2.16b + uadalp v1.4s, v21.8h .endr - usubl v1.8h, v16.8b, v17.8b - usubl2 v2.8h, v16.16b, v17.16b - smlal v0.4s, v1.4h, v1.4h - smlal2 v0.4s, v1.8h, v1.8h - smlal v0.4s, v2.4h, v2.4h - smlal2 v0.4s, v2.8h, v2.8h + add v0.4s, v0.4s, v1.4s ret_v0_w0 endfunc .endm -sse_pp_16xN 16 -sse_pp_16xN 32 +SSE_PP_16xN 16 +SSE_PP_16xN 32 -function PFX(pixel_sse_pp_32x32_neon) - mov w12, #8 - movi v0.16b, #0 - movi v1.16b, #0 -.loop_sse_pp_32: - sub w12, w12, #1 +// Loop unrolled to process 4 rows per iteration. +function PFX(pixel_sse_pp_32xh_neon), export=0 + movi v0.4s, #0 + movi v1.4s, #0 +.Loop_sse_pp_32xh: + sub w4, w4, #1 .rept 4 ld1 {v16.16b,v17.16b}, x0, x1 ld1 {v18.16b,v19.16b}, x2, x3 - usubl v2.8h, v16.8b, v18.8b - usubl2 v3.8h, v16.16b, v18.16b - usubl v4.8h, v17.8b, v19.8b - usubl2 v5.8h, v17.16b, v19.16b - smlal v0.4s, v2.4h, v2.4h - smlal2 v1.4s, v2.8h, v2.8h - smlal v0.4s, v3.4h, v3.4h - smlal2 v1.4s, v3.8h, v3.8h - smlal v0.4s, v4.4h, v4.4h - smlal2 v1.4s, v4.8h, v4.8h - smlal v0.4s, v5.4h, v5.4h
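Note: the rewritten pixel_sse_pp kernels rely on (a - b)^2 == |a - b|^2, so one UABD plus an unsigned UMULL/UMULL2 and a pairwise UADALP replace the old widening subtract and chain of SMLALs. The quantity being computed is the plain sum of squared errors (scalar sketch, illustrative names):

    #include <cstdint>

    static uint32_t sse_ref(const uint8_t *pix1, intptr_t stride1,
                            const uint8_t *pix2, intptr_t stride2,
                            int width, int height)
    {
        uint32_t sum = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int d = (int)pix1[x] - (int)pix2[x];
                sum += (uint32_t)(d * d);
            }
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }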
View file
x265_4.0.tar.gz/source/common/aarch64/ssd-neon-dotprod.S
Added
@@ -0,0 +1,169 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.limaye@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "asm.S" + +.arch armv8.2-a+dotprod + +#ifdef __APPLE__ +.section __RODATA,__rodata +#else +.section .rodata +#endif + +.align 4 + +.text + +// Fully unrolled. +.macro SSE_PP_4xN h +function PFX(pixel_sse_pp_4x\h\()_neon_dotprod) + movi v0.4s, #0 +.rept \h / 4 + ldr s16, x0 + ldr s17, x2 + add x0, x0, x1 + add x2, x2, x3 + ld1 {v16.s}1, x0, x1 + ld1 {v16.s}2, x0, x1 + ld1 {v16.s}3, x0, x1 + ld1 {v17.s}1, x2, x3 + ld1 {v17.s}2, x2, x3 + ld1 {v17.s}3, x2, x3 + + uabd v1.16b, v16.16b, v17.16b + udot v0.4s, v1.16b, v1.16b +.endr + addv s0, v0.4s + fmov w0, s0 + ret +endfunc +.endm + +SSE_PP_4xN 4 +SSE_PP_4xN 8 + +// Fully unrolled. +.macro SSE_PP_8xN h +function PFX(pixel_sse_pp_8x\h\()_neon_dotprod) + movi v0.4s, #0 +.rept \h + ld1 {v16.8b}, x0, x1 + ld1 {v17.8b}, x2, x3 + + uabd v1.8b, v16.8b, v17.8b + udot v0.2s, v1.8b, v1.8b +.endr + addv s0, v0.4s + fmov w0, s0 + ret +endfunc +.endm + +SSE_PP_8xN 8 +SSE_PP_8xN 16 + +// Fully unrolled. +.macro SSE_PP_16xN h +function PFX(pixel_sse_pp_16x\h\()_neon_dotprod) + movi v0.4s, #0 + movi v1.4s, #0 +.rept \h / 2 + ld1 {v16.16b}, x0, x1 + ld1 {v17.16b}, x2, x3 + ld1 {v18.16b}, x0, x1 + ld1 {v19.16b}, x2, x3 + + uabd v2.16b, v16.16b, v17.16b + udot v0.4s, v2.16b, v2.16b + uabd v3.16b, v18.16b, v19.16b + udot v1.4s, v3.16b, v3.16b +.endr + add v0.4s, v0.4s, v1.4s + addv s0, v0.4s + fmov w0, s0 + ret +endfunc +.endm + +SSE_PP_16xN 16 +SSE_PP_16xN 32 + +// Loop unrolled to process 4 rows per iteration. +function PFX(pixel_sse_pp_32xh_neon_dotprod), export=0 + movi v0.4s, #0 + movi v1.4s, #0 +.Loop_sse_pp_32xh: + sub w4, w4, #1 +.rept 4 + ld1 {v16.16b,v17.16b}, x0, x1 + ld1 {v18.16b,v19.16b}, x2, x3 + + uabd v2.16b, v16.16b, v18.16b + udot v0.4s, v2.16b, v2.16b + uabd v3.16b, v17.16b, v19.16b + udot v1.4s, v3.16b, v3.16b +.endr + cbnz w4, .Loop_sse_pp_32xh + add v0.4s, v0.4s, v1.4s + addv s0, v0.4s + fmov w0, s0 + ret +endfunc + +.macro SSE_PP_32xN h +function PFX(pixel_sse_pp_32x\h\()_neon_dotprod) + mov w4, \h / 4 + b PFX(pixel_sse_pp_32xh_neon_dotprod) +endfunc +.endm + +SSE_PP_32xN 32 +SSE_PP_32xN 64 + +// Loop unrolled to process 4 rows per iteration. 
+function PFX(pixel_sse_pp_64x64_neon_dotprod) + mov w12, #16 + movi v0.4s, #0 + movi v1.4s, #0 +.Loop_sse_pp_64: + sub w12, w12, #1 +.rept 4 + ld1 {v16.16b-v19.16b}, x0, x1 + ld1 {v20.16b-v23.16b}, x2, x3 + + uabd v2.16b, v16.16b, v20.16b + udot v0.4s, v2.16b, v2.16b + uabd v3.16b, v17.16b, v21.16b + udot v1.4s, v3.16b, v3.16b + uabd v4.16b, v18.16b, v22.16b + udot v0.4s, v4.16b, v4.16b + uabd v5.16b, v19.16b, v23.16b + udot v1.4s, v5.16b, v5.16b +.endr + cbnz w12, .Loop_sse_pp_64 + add v0.4s, v0.4s, v1.4s + addv s0, v0.4s + fmov w0, s0 + ret +endfunc
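Note: the DotProd SSE kernels go one step further: dotting the absolute-difference vector with itself accumulates |a - b|^2 for 16 pixels straight into four 32-bit lanes. One 16-byte step in intrinsics form (a sketch assuming the DotProd extension is available; names are illustrative):

    #include <arm_neon.h>

    static inline uint32x4_t sse16_step_dotprod(uint32x4_t acc,
                                                const uint8_t *a, const uint8_t *b)
    {
        uint8x16_t d = vabdq_u8(vld1q_u8(a), vld1q_u8(b)); // |a - b| per byte
        return vdotq_u32(acc, d, d);                       // acc[i] += sum of d^2
    }

    // Reduce with vaddvq_u32(acc) once all rows are processed.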
View file
x265_3.6.tar.gz/source/common/arm/blockcopy8.S -> x265_4.0.tar.gz/source/common/arm/blockcopy8.S
Changed
@@ -795,7 +795,7 @@ vmov q2, q12 vmov q3, q14 -.loop: +.Loop: vldm r0!, {q8-q15} subs r1, #1 @@ -817,7 +817,7 @@ vadd.s8 q1, q10 vadd.s8 q2, q12 vadd.s8 q3, q14 - bgt .loop + bgt .Loop // sum vadd.s8 q0, q1
View file
x265_3.6.tar.gz/source/common/arm/dct-a.S -> x265_4.0.tar.gz/source/common/arm/dct-a.S
Changed
@@ -422,7 +422,7 @@ mov lr, #4*16*2 // DCT-1D -.loop1: +.Loop1: // Row0-3 vld1.16 {q8-q9}, r0, :64, r2 // q8 = 07 06 05 04 03 02 01 00, q9 = 0F 0E 0D 0C 0B 0A 09 08 vld1.16 {q10-q11}, r0, :64, r2 // q10 = 17 16 15 14 13 12 11 10, q11 = 1F 1E 1D 1C 1B 1A 19 18 @@ -628,7 +628,7 @@ // loop into next process group sub r3, #3*4*16*2 subs r12, #1 - bgt .loop1 + bgt .Loop1 // DCT-2D @@ -637,7 +637,7 @@ mov r3, #16*2*2 mov r12, #16/4 // Process 4 rows every loop -.loop2: +.Loop2: vldm r2, {q8-q15} // d16 = 30 20 10 00 @@ -887,7 +887,7 @@ sub r1, #(17*16-4)*2 subs r12, #1 - bgt .loop2 + bgt .Loop2 add sp, #16*16*2 vpop {q4-q7}
View file
x265_3.6.tar.gz/source/common/arm/ipfilter8.S -> x265_4.0.tar.gz/source/common/arm/ipfilter8.S
Changed
@@ -372,7 +372,7 @@ vmov.u16 q1, #8192 vneg.s16 q1, q1 mov r12, #8 -.loop_filterP2S_32x16: +.Loop_filterP2S_32x16: subs r12, #1 .rept 2 vld1.u8 {q9-q10}, r0, r1 @@ -391,7 +391,7 @@ vmla.s16 q3, q10, q0 vst1.16 {q2-q3}, r2, r3 .endr - bgt .loop_filterP2S_32x16 + bgt .Loop_filterP2S_32x16 bx lr endfunc @@ -402,7 +402,7 @@ vmov.u16 q1, #8192 vneg.s16 q1, q1 mov r12, #12 -.loop_filterP2S_32x24: +.Loop_filterP2S_32x24: subs r12, #1 .rept 2 vld1.u8 {q9-q10}, r0, r1 @@ -421,7 +421,7 @@ vmla.s16 q3, q10, q0 vst1.16 {q2-q3}, r2, r3 .endr - bgt .loop_filterP2S_32x24 + bgt .Loop_filterP2S_32x24 bx lr endfunc @@ -432,7 +432,7 @@ vmov.u16 q1, #8192 vneg.s16 q1, q1 mov r12, #16 -.loop_filterP2S_32x32: +.Loop_filterP2S_32x32: subs r12, #1 .rept 2 vld1.u8 {q9-q10}, r0, r1 @@ -451,7 +451,7 @@ vmla.s16 q3, q10, q0 vst1.16 {q2-q3}, r2, r3 .endr - bgt .loop_filterP2S_32x32 + bgt .Loop_filterP2S_32x32 bx lr endfunc @@ -462,7 +462,7 @@ vmov.u16 q1, #8192 vneg.s16 q1, q1 mov r12, #32 -.loop_filterP2S_32x64: +.Loop_filterP2S_32x64: subs r12, #1 .rept 2 vld1.u8 {q9-q10}, r0, r1 @@ -481,7 +481,7 @@ vmla.s16 q3, q10, q0 vst1.16 {q2-q3}, r2, r3 .endr - bgt .loop_filterP2S_32x64 + bgt .Loop_filterP2S_32x64 bx lr endfunc @@ -493,7 +493,7 @@ vmov.u16 q1, #8192 vneg.s16 q1, q1 mov r12, #8 -.loop_filterP2S_64x16: +.Loop_filterP2S_64x16: subs r12, #1 .rept 2 vld1.u8 {q9-q10}, r0! @@ -528,7 +528,7 @@ vmla.s16 q3, q10, q0 vst1.16 {q2-q3}, r2, r3 .endr - bgt .loop_filterP2S_64x16 + bgt .Loop_filterP2S_64x16 bx lr endfunc @@ -540,7 +540,7 @@ vmov.u16 q1, #8192 vneg.s16 q1, q1 mov r12, #16 -.loop_filterP2S_64x32: +.Loop_filterP2S_64x32: subs r12, #1 .rept 2 vld1.u8 {q9-q10}, r0! @@ -575,7 +575,7 @@ vmla.s16 q3, q10, q0 vst1.16 {q2-q3}, r2, r3 .endr - bgt .loop_filterP2S_64x32 + bgt .Loop_filterP2S_64x32 bx lr endfunc @@ -587,7 +587,7 @@ vmov.u16 q1, #8192 vneg.s16 q1, q1 mov r12, #24 -.loop_filterP2S_64x48: +.Loop_filterP2S_64x48: subs r12, #1 .rept 2 vld1.u8 {q9-q10}, r0! @@ -622,7 +622,7 @@ vmla.s16 q3, q10, q0 vst1.16 {q2-q3}, r2, r3 .endr - bgt .loop_filterP2S_64x48 + bgt .Loop_filterP2S_64x48 bx lr endfunc @@ -634,7 +634,7 @@ vmov.u16 q1, #8192 vneg.s16 q1, q1 mov r12, #32 -.loop_filterP2S_64x64: +.Loop_filterP2S_64x64: subs r12, #1 .rept 2 vld1.u8 {q9-q10}, r0! @@ -669,7 +669,7 @@ vmla.s16 q3, q10, q0 vst1.16 {q2-q3}, r2, r3 .endr - bgt .loop_filterP2S_64x64 + bgt .Loop_filterP2S_64x64 bx lr endfunc @@ -681,7 +681,7 @@ vmov.u16 q1, #8192 vneg.s16 q1, q1 mov r12, #32 -.loop_filterP2S_48x64: +.Loop_filterP2S_48x64: subs r12, #1 .rept 2 vld1.u8 {q9-q10}, r0! @@ -709,7 +709,7 @@ vmla.s16 q3, q9, q0 vst1.16 {q2-q3}, r2, r3 .endr - bgt .loop_filterP2S_48x64 + bgt .Loop_filterP2S_48x64 bx lr endfunc @@ -756,7 +756,7 @@ vmovl.u8 q2, d4 vmovl.u8 q3, d6 -.loop_4x\h: +.Loop_4x\h: // TODO: read extra 1 row for speed optimize, may made crash on OS X platform! vld1.u32 {d160}, r0, r1 vld1.u32 {d161}, r0, r1 @@ -795,7 +795,7 @@ vst1.u32 {d181}, r2, r3 subs r12, #2 - bne .loop_4x4 + bne .Loop_4x4 pop {pc} .ltorg @@ -945,13 +945,13 @@ .macro FILTER_VPP a b filterv -.loop_\filterv\()_\a\()x\b: +.Loop_\filterv\()_\a\()x\b: mov r7, r2 mov r6, r0 eor r8, r8 -.loop_w8_\filterv\()_\a\()x\b: +.Loop_w8_\filterv\()_\a\()x\b: add r6, r0, r8 @@ -988,12 +988,12 @@ add r8, #8 cmp r8, #\a
View file
x265_3.6.tar.gz/source/common/arm/mc-a.S -> x265_4.0.tar.gz/source/common/arm/mc-a.S
Changed
@@ -554,7 +554,7 @@ vsri.s16 q1, #1 vneg.s16 q0, q0 mov r3, #4 -.loop_cpy2Dto1D_shr_16: +.Loop_cpy2Dto1D_shr_16: subs r3, #1 .rept 4 vld1.s16 {q2-q3}, r1, r2 @@ -564,7 +564,7 @@ vshl.s16 q3, q0 vst1.16 {q2-q3}, r0! .endr - bgt .loop_cpy2Dto1D_shr_16 + bgt .Loop_cpy2Dto1D_shr_16 bx lr endfunc @@ -577,7 +577,7 @@ vsri.s16 q1, #1 vneg.s16 q0, q0 mov r3, 16 -.loop_cpy2Dto1D_shr_32: +.Loop_cpy2Dto1D_shr_32: subs r3, #1 .rept 2 vld1.s16 {q2-q3}, r1! @@ -593,7 +593,7 @@ vst1.16 {q2-q3}, r0! vst1.16 {q8-q9}, r0! .endr - bgt .loop_cpy2Dto1D_shr_32 + bgt .Loop_cpy2Dto1D_shr_32 bx lr endfunc
View file
x265_3.6.tar.gz/source/common/arm/pixel-util.S -> x265_4.0.tar.gz/source/common/arm/pixel-util.S
Changed
@@ -848,36 +848,36 @@ vdup.8 q2, r12 sub r5, #1 -.loop_h: +.Loop_h: mov r6, r0 mov r12, r2 eor r7, r7 -.loop_w: +.Loop_w: vld1.u8 {q0}, r6! vshl.u8 q0, q0, q2 vst1.u8 {q0}, r12! add r7, #16 cmp r7, r4 - blt .loop_w + blt .Loop_w add r0, r1 add r2, r3 subs r5, #1 - bgt .loop_h + bgt .Loop_h // handle last row mov r5, r4 lsr r5, #3 -.loopW8: +.LoopW8: vld1.u8 d0, r0! vshl.u8 d0, d0, d4 vst1.u8 d0, r2! subs r4, r4, #8 subs r5, #1 - bgt .loopW8 + bgt .LoopW8 mov r5,#8 sub r5, r4 @@ -1970,7 +1970,7 @@ eor r5, r5 veor.s32 q12, q12 -.loop_quant: +.Loop_quant: vld1.s16 d16, r0! vmovl.s16 q9, d16 // q9= coefblockpos @@ -1999,7 +1999,7 @@ vst1.s16 d16, r3! subs r4, #1 - bne .loop_quant + bne .Loop_quant vadd.u32 d8, d9 vpadd.u32 d8, d8 @@ -2023,7 +2023,7 @@ eor r4, r4 veor.s32 q12, q12 -.loop_nquant: +.Loop_nquant: vld1.s16 d16, r0! vmovl.s16 q9, d16 // q9= coefblockpos @@ -2049,7 +2049,7 @@ vst1.s16 d17, r2! subs r3, #1 - bne .loop_nquant + bne .Loop_nquant vadd.u32 d8, d9 vpadd.u32 d8, d8 @@ -2148,7 +2148,7 @@ mov r10, #4 eor r9, r9 -.loop_32: +.Loop_32: sa8d_16x16 r4 @@ -2166,7 +2166,7 @@ sub r2, r2, #24 subs r10, #1 - bgt .loop_32 + bgt .Loop_32 mov r0, r9 vpop {d8-d11} @@ -2183,7 +2183,7 @@ mov r10, #4 eor r9, r9 -.loop_1: +.Loop_1: sa8d_16x16 r4 @@ -2217,7 +2217,7 @@ sub r2, r2, #56 subs r10, #1 - bgt .loop_1 + bgt .Loop_1 mov r0, r9 vpop {d8-d11}
View file
x265_3.6.tar.gz/source/common/arm/sad-a.S -> x265_4.0.tar.gz/source/common/arm/sad-a.S
Changed
@@ -103,7 +103,7 @@ vabal.u8 q9, d5, d7 mov r12, #(\h-2)/2 -.loop_16x\h: +.Loop_16x\h: subs r12, #1 vld1.8 {q0}, r0, r1 @@ -115,7 +115,7 @@ vabal.u8 q9, d1, d3 vabal.u8 q8, d4, d6 vabal.u8 q9, d5, d7 - bne .loop_16x\h + bne .Loop_16x\h vadd.u16 q8, q8, q9 .if \h == 64 @@ -147,7 +147,7 @@ veor.u8 q11, q11 mov r12, #\h/8 -.loop_32x\h: +.Loop_32x\h: subs r12, #1 .rept 4 @@ -166,7 +166,7 @@ vabal.u8 q10, d26, d30 vabal.u8 q11, d27, d31 .endr - bne .loop_32x\h + bne .Loop_32x\h vadd.u16 q8, q8, q9 vadd.u16 q10, q10, q11 @@ -213,7 +213,7 @@ sub r3, r12 mov r12, #\h/8 -.loop_64x\h: +.Loop_64x\h: subs r12, #1 .rept 4 @@ -246,7 +246,7 @@ vabal.u8 q10, d26, d30 vabal.u8 q11, d27, d31 .endr - bne .loop_64x\h + bne .Loop_64x\h vadd.u16 q8, q8, q9 vadd.u16 q10, q10, q11 @@ -283,7 +283,7 @@ sub r3, #16 mov r12, #8 -.loop_24x32: +.Loop_24x32: subs r12, #1 .rept 4 @@ -296,7 +296,7 @@ vld1.8 {d1}, r2, r3 vabal.u8 q10, d0, d1 .endr - bne .loop_24x32 + bne .Loop_24x32 vadd.u16 q8, q8, q9 vadd.u16 d16, d16, d17 @@ -322,7 +322,7 @@ sub r3, #32 mov r12, #16 -.loop_48x64: +.Loop_48x64: subs r12, #1 .rept 4 @@ -337,7 +337,7 @@ vabal.u8 q14, d4, d20 vabal.u8 q15, d5, d21 .endr - bne .loop_48x64 + bne .Loop_48x64 vadd.u16 q3, q3, q11 vadd.u16 d6, d6, d7 @@ -635,12 +635,12 @@ veor.u8 q15, q15 .endif -.loop_sad_x\x\()_16x\h: +.Loop_sad_x\x\()_16x\h: .rept 8 SAD_X_16 \x .endr subs r6, #1 - bne .loop_sad_x\x\()_16x\h + bne .Loop_sad_x\x\()_16x\h vadd.u16 q8, q8, q9 vadd.u16 q10, q10, q11 @@ -929,12 +929,12 @@ veor.u8 q14, q14 veor.u8 q15, q15 .endif -.loop_sad_x\x\()_64x\h: +.Loop_sad_x\x\()_64x\h: .rept 8 SAD_X_64 \x .endr subs r6, #1 - bne .loop_sad_x\x\()_64x\h + bne .Loop_sad_x\x\()_64x\h .if \h <= 16 vadd.u16 q8, q8, q9 @@ -1071,12 +1071,12 @@ veor.u8 q15, q15 .endif -.loop_sad_x\x\()_48x64: +.Loop_sad_x\x\()_48x64: .rept 8 SAD_X_48 \x .endr subs r6, #1 - bne .loop_sad_x\x\()_48x64 + bne .Loop_sad_x\x\()_48x64 vpaddl.u16 q8, q8 vpaddl.u16 q9, q9 @@ -1179,12 +1179,12 @@ veor.u8 q15, q15 .endif -.loop_sad_x\x\()_24x32: +.Loop_sad_x\x\()_24x32: .rept 8 SAD_X_24 \x .endr subs r6, #1 - bne .loop_sad_x\x\()_24x32 + bne .Loop_sad_x\x\()_24x32 vadd.u16 q8, q8, q9 vadd.u16 q10, q10, q11
View file
x265_3.6.tar.gz/source/common/arm/ssd-a.S -> x265_4.0.tar.gz/source/common/arm/ssd-a.S
Changed
@@ -121,7 +121,7 @@ veor.u8 q0, q0 veor.u8 q1, q1 -.loop_sse_pp_32: +.Loop_sse_pp_32: subs r12, #1 .rept 4 vld1.64 {q8-q9}, r0, r1 @@ -139,7 +139,7 @@ vmlal.s16 q0, d26, d26 vmlal.s16 q1, d27, d27 .endr - bne .loop_sse_pp_32 + bne .Loop_sse_pp_32 vadd.s32 q0, q1 vadd.s32 d0, d0, d1 vpadd.s32 d0, d0, d0 @@ -154,7 +154,7 @@ veor.u8 q0, q0 veor.u8 q1, q1 -.loop_sse_pp_64: +.Loop_sse_pp_64: subs r12, #1 .rept 4 vld1.64 {q8-q9}, r0! @@ -187,7 +187,7 @@ vmlal.s16 q0, d26, d26 vmlal.s16 q1, d27, d27 .endr - bne .loop_sse_pp_64 + bne .Loop_sse_pp_64 vadd.s32 q0, q1 vadd.s32 d0, d0, d1 vpadd.s32 d0, d0, d0 @@ -257,7 +257,7 @@ veor.u8 q0, q0 veor.u8 q1, q1 -.loop_sse_ss_16: +.Loop_sse_ss_16: subs r12, #1 .rept 4 vld1.s16 {q8-q9}, r0, r1 @@ -269,7 +269,7 @@ vmlal.s16 q0, d18, d18 vmlal.s16 q1, d19, d19 .endr - bne .loop_sse_ss_16 + bne .Loop_sse_ss_16 vadd.s32 q0, q1 vadd.s32 d0, d0, d1 vpadd.s32 d0, d0, d0 @@ -286,7 +286,7 @@ veor.u8 q0, q0 veor.u8 q1, q1 -.loop_sse_ss_32: +.Loop_sse_ss_32: subs r12, #1 .rept 4 vld1.s16 {q8-q9}, r0! @@ -307,7 +307,7 @@ vmlal.s16 q0, d18, d18 vmlal.s16 q1, d19, d19 .endr - bne .loop_sse_ss_32 + bne .Loop_sse_ss_32 vadd.s32 q0, q1 vadd.s32 d0, d0, d1 vpadd.s32 d0, d0, d0 @@ -324,7 +324,7 @@ veor.u8 q0, q0 veor.u8 q1, q1 -.loop_sse_ss_64: +.Loop_sse_ss_64: subs r12, #1 .rept 2 vld1.s16 {q8-q9}, r0! @@ -363,7 +363,7 @@ vmlal.s16 q0, d18, d18 vmlal.s16 q1, d19, d19 .endr - bne .loop_sse_ss_64 + bne .Loop_sse_ss_64 vadd.s32 q0, q1 vadd.s32 d0, d0, d1 vpadd.s32 d0, d0, d0 @@ -417,7 +417,7 @@ veor.u8 q0, q0 veor.u8 q1, q1 -.loop_ssd_s_16: +.Loop_ssd_s_16: subs r12, #1 .rept 2 vld1.s16 {q8-q9}, r0, r1 @@ -431,7 +431,7 @@ vmlal.s16 q0, d22, d22 vmlal.s16 q1, d23, d23 .endr - bne .loop_ssd_s_16 + bne .Loop_ssd_s_16 vadd.s32 q0, q1 vadd.s32 d0, d0, d1 vpadd.s32 d0, d0, d0 @@ -446,7 +446,7 @@ veor.u8 q0, q0 veor.u8 q1, q1 -.loop_ssd_s_32: +.Loop_ssd_s_32: subs r12, #1 .rept 4 vld1.s16 {q8-q9}, r0! @@ -460,7 +460,7 @@ vmlal.s16 q0, d22, d22 vmlal.s16 q1, d23, d23 .endr - bne .loop_ssd_s_32 + bne .Loop_ssd_s_32 vadd.s32 q0, q1 vadd.s32 d0, d0, d1 vpadd.s32 d0, d0, d0
View file
x265_3.6.tar.gz/source/common/common.h -> x265_4.0.tar.gz/source/common/common.h
Changed
@@ -176,6 +176,12 @@
 template<typename T> /* clip to pixel range, 0..255 or 0..1023 */
 inline pixel x265_clip(T x) { return (pixel)x265_min<T>(T((1 << X265_DEPTH) - 1), x265_max<T>(T(0), x)); }

+/* get the sign of input variable */
+static inline int8_t x265_signOf(int32_t x)
+{
+    return (x >> 31) | ((int32_t)((((uint32_t) - x)) >> 31));
+}
+
 typedef int16_t coeff_t; // transform coefficient

 #define X265_MIN(a, b) ((a) < (b) ? (a) : (b))
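Note: x265_signOf is a branchless sign function. For x < 0 the arithmetic right shift yields all ones (-1), which dominates the OR; for x > 0, -x is negative, so its unsigned top bit contributes 1; for x == 0 both terms are 0. A small check of the formula (relies on arithmetic right shift of negative values, as the source does):

    #include <cstdint>
    #include <cassert>

    static int8_t signOf_ref(int32_t x) { return (int8_t)((x > 0) - (x < 0)); }

    int main()
    {
        const int32_t vals[] = { -5, -1, 0, 1, 7 };
        for (int32_t x : vals)
            assert(((x >> 31) | (int32_t)(((uint32_t)-x) >> 31)) == signOf_ref(x));
        return 0;
    }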
View file
x265_3.6.tar.gz/source/common/cpu.cpp -> x265_4.0.tar.gz/source/common/cpu.cpp
Changed
@@ -115,6 +115,12 @@ #if defined(HAVE_SVE2) { "SVE2", X265_CPU_SVE2 }, #endif +#if defined(HAVE_NEON_DOTPROD) + { "Neon_DotProd", X265_CPU_NEON_DOTPROD }, +#endif +#if defined(HAVE_NEON_I8MM) + { "Neon_I8MM", X265_CPU_NEON_I8MM }, +#endif #elif X265_ARCH_POWER8 { "Altivec", X265_CPU_ALTIVEC }, @@ -389,17 +395,22 @@ { int flags = 0; - #if defined(HAVE_SVE2) - flags |= X265_CPU_SVE2; - flags |= X265_CPU_SVE; + #if HAVE_NEON flags |= X265_CPU_NEON; - #elif defined(HAVE_SVE) + #endif + #if HAVE_NEON_DOTPROD + flags |= X265_CPU_NEON_DOTPROD; + #endif + #if HAVE_NEON_I8MM + flags |= X265_CPU_NEON_I8MM; + #endif + #if HAVE_SVE flags |= X265_CPU_SVE; - flags |= X265_CPU_NEON; - #elif HAVE_NEON - flags |= X265_CPU_NEON; #endif - + #if HAVE_SVE2 + flags |= X265_CPU_SVE2; + #endif + return flags; }
View file
x265_3.6.tar.gz/source/common/cudata.cpp -> x265_4.0.tar.gz/source/common/cudata.cpp
Changed
@@ -290,6 +290,10 @@ m_bFirstRowInSlice = (uint8_t)firstRowInSlice; m_bLastRowInSlice = (uint8_t)lastRowInSlice; m_bLastCuInSlice = (uint8_t)lastCuInSlice; +#if ENABLE_SCC_EXT + m_lastIntraBCMv0.set(0, 0); + m_lastIntraBCMv1.set(0, 0); +#endif /* sequential memsets */ m_partSet((uint8_t*)m_qp, (uint8_t)qp); @@ -323,7 +327,7 @@ } // initialize Sub partition -void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp) +void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp, MV lastIntraBCMv2) { m_absIdxInCTU = cuGeom.absPartIdx; m_encData = ctu.m_encData; @@ -360,6 +364,14 @@ /* initialize the remaining CU data in one memset */ memset(m_predMode, 0, (ctu.m_chromaFormat == X265_CSP_I400 ? BytesPerPartition - 13 : BytesPerPartition - 9) * m_numPartitions); memset(m_distortion, 0, m_numPartitions * sizeof(sse_t)); + +#if ENABLE_SCC_EXT + if (lastIntraBCMv) + { + for (int i = 0; i < 2; i++) + m_lastIntraBCMvi = lastIntraBCMvi; + } +#endif } /* Copy the results of a sub-part (split) CU to the parent CU */ @@ -415,6 +427,10 @@ memcpy(m_trCoeff1 + tmpC2, subCU.m_trCoeff1, sizeof(coeff_t) * tmpC); memcpy(m_trCoeff2 + tmpC2, subCU.m_trCoeff2, sizeof(coeff_t) * tmpC); } +#if ENABLE_SCC_EXT + for (int i = 0; i < 2; i++) + m_lastIntraBCMvi = subCU.m_lastIntraBCMvi; +#endif } /* If a sub-CU part is not present (off the edge of the picture) its depth and @@ -1591,7 +1607,11 @@ return maxNumMergeCand; } } +#if ENABLE_SCC_EXT + if (m_slice->m_bTemporalMvp) +#else if (m_slice->m_sps->bTemporalMVPEnabled) +#endif { uint32_t partIdxRB = deriveRightBottomIdx(puIdx); MV colmv; @@ -1681,10 +1701,15 @@ } } } - int numRefIdx = (isInterB) ? X265_MIN(m_slice->m_numRefIdx0, m_slice->m_numRefIdx1) : m_slice->m_numRefIdx0; + int numRefIdx0 = m_slice->m_numRefIdx0; +#if ENABLE_SCC_EXT + if (m_slice->m_param->bEnableSCC) + numRefIdx0--; +#endif + int numRefIdx = (isInterB) ? X265_MIN(numRefIdx0, m_slice->m_numRefIdx1) : numRefIdx0; int r = 0; int refcnt = 0; - while (count < maxNumMergeCand) + while (numRefIdx && (count < maxNumMergeCand)) { candDircount = 1; candMvFieldcount0.mv.word = 0; @@ -1712,28 +1737,61 @@ } // Create the PMV list. Called for each reference index. -int CUData::getPMV(InterNeighbourMV *neighbours, uint32_t picList, uint32_t refIdx, MV* amvpCand, MV* pmv) const +int CUData::getPMV(InterNeighbourMV* neighbours, uint32_t picList, uint32_t refIdx, MV* amvpCand, MV* pmv, uint32_t puIdx, uint32_t absPartIdx) const { MV directMVMD_ABOVE_LEFT + 1; MV indirectMVMD_ABOVE_LEFT + 1; bool validDirectMD_ABOVE_LEFT + 1; bool validIndirectMD_ABOVE_LEFT + 1; - // Left candidate. - validDirectMD_BELOW_LEFT = getDirectPMV(directMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx); - validDirectMD_LEFT = getDirectPMV(directMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx); - // Top candidate. - validDirectMD_ABOVE_RIGHT = getDirectPMV(directMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx); - validDirectMD_ABOVE = getDirectPMV(directMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx); - validDirectMD_ABOVE_LEFT = getDirectPMV(directMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx); - - // Left candidate. - validIndirectMD_BELOW_LEFT = getIndirectPMV(indirectMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx); - validIndirectMD_LEFT = getIndirectPMV(indirectMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx); - // Top candidate. 
- validIndirectMD_ABOVE_RIGHT = getIndirectPMV(indirectMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx); - validIndirectMD_ABOVE = getIndirectPMV(indirectMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx); - validIndirectMD_ABOVE_LEFT = getIndirectPMV(indirectMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx); +#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT) + if (m_slice->m_param->numViews > 1 || m_slice->m_param->bEnableSCC) + { + // Left candidate. + if ((neighbours + MD_BELOW_LEFT)->isAvailable || (neighbours + MD_LEFT)->isAvailable) + { + validIndirectMD_ABOVE_RIGHT = validIndirectMD_ABOVE = validIndirectMD_ABOVE_LEFT = false; + + validDirectMD_BELOW_LEFT = getDirectPMV(directMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx); + validDirectMD_LEFT = getDirectPMV(directMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx); + + validIndirectMD_BELOW_LEFT = getIndirectPMV(indirectMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx); + validIndirectMD_LEFT = getIndirectPMV(indirectMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx); + } + + // Top candidate. + validDirectMD_ABOVE_RIGHT = getDirectPMV(directMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx); + validDirectMD_ABOVE = getDirectPMV(directMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx); + validDirectMD_ABOVE_LEFT = getDirectPMV(directMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx); + + // Top candidate. + if (!((neighbours + MD_BELOW_LEFT)->isAvailable || (neighbours + MD_LEFT)->isAvailable)) + { + validDirectMD_BELOW_LEFT = validDirectMD_LEFT = validIndirectMD_BELOW_LEFT = validIndirectMD_LEFT = false; + validIndirectMD_ABOVE_RIGHT = getIndirectPMV(indirectMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx); + validIndirectMD_ABOVE = getIndirectPMV(indirectMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx); + validIndirectMD_ABOVE_LEFT = getIndirectPMV(indirectMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx); + } + } + else +#endif + { + // Left candidate. + validDirectMD_BELOW_LEFT = getDirectPMV(directMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx); + validDirectMD_LEFT = getDirectPMV(directMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx); + // Top candidate. + validDirectMD_ABOVE_RIGHT = getDirectPMV(directMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx); + validDirectMD_ABOVE = getDirectPMV(directMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx); + validDirectMD_ABOVE_LEFT = getDirectPMV(directMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx); + + // Left candidate. + validIndirectMD_BELOW_LEFT = getIndirectPMV(indirectMVMD_BELOW_LEFT, neighbours + MD_BELOW_LEFT, picList, refIdx); + validIndirectMD_LEFT = getIndirectPMV(indirectMVMD_LEFT, neighbours + MD_LEFT, picList, refIdx); + // Top candidate. + validIndirectMD_ABOVE_RIGHT = getIndirectPMV(indirectMVMD_ABOVE_RIGHT, neighbours + MD_ABOVE_RIGHT, picList, refIdx); + validIndirectMD_ABOVE = getIndirectPMV(indirectMVMD_ABOVE, neighbours + MD_ABOVE, picList, refIdx); + validIndirectMD_ABOVE_LEFT = getIndirectPMV(indirectMVMD_ABOVE_LEFT, neighbours + MD_ABOVE_LEFT, picList, refIdx); + } int num = 0; // Left predictor search @@ -1781,27 +1839,80 @@ // Get the collocated candidate. At this step, either the first candidate // was found or its value is 0. 
- if (m_slice->m_sps->bTemporalMVPEnabled && num < 2) +#if ENABLE_MULTIVIEW || ENABLE_SCC_EXT + if (m_slice->m_param->numViews > 1 || m_slice->m_param->bEnableSCC) { - int tempRefIdx = neighboursMD_COLLOCATED.refIdxpicList; - if (tempRefIdx != -1) + if (m_slice->m_bTemporalMvp && num < 2) { - uint32_t cuAddr = neighboursMD_COLLOCATED.cuAddrpicList; - const Frame* colPic = m_slice->m_refFrameListm_slice->isInterB() && !m_slice->m_colFromL0Flagm_slice->m_colRefIdx; - const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr); + int refId = refIdx; + uint32_t absPartAddr = m_absIdxInCTU + absPartIdx; + uint32_t partIdxRB = deriveRightBottomIdx(puIdx); + bool isValid; + + // co-located RightBottom temporal predictor (H) + int ctuIdx = -1; - // Scale the vector - int colRefPOC = colCU->m_slice->m_refPOCListtempRefIdx >> 4tempRefIdx & 0xf; - int colPOC = colCU->m_slice->m_poc; + // image boundary check + if (m_encData->getPicCTU(m_cuAddr)->m_cuPelX + g_zscanToPelXpartIdxRB + UNIT_SIZE < m_slice->m_sps->picWidthInLumaSamples && + m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelYpartIdxRB + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples) + { + uint32_t absPartIdxRB = g_zscanToRasterpartIdxRB; + uint32_t numUnits = s_numPartInCUSize; + bool bNotLastCol = lessThanCol(absPartIdxRB, numUnits - 1); // is not at the last column of CTU + bool bNotLastRow = lessThanRow(absPartIdxRB, numUnits - 1); // is not at the last row of CTU - int curRefPOC = m_slice->m_refPOCListpicListrefIdx; - int curPOC = m_slice->m_poc; - pmvnumMvc++ = amvpCandnum++ = scaleMvByPOCDist(neighboursMD_COLLOCATED.mvpicList, curPOC, curRefPOC, colPOC, colRefPOC); + if (bNotLastCol && bNotLastRow) + { + absPartAddr = g_rasterToZscanabsPartIdxRB + RASTER_SIZE + 1; + ctuIdx = m_cuAddr; + } + else if (bNotLastCol) + absPartAddr = g_rasterToZscan(absPartIdxRB + 1) & (numUnits - 1); + else if (bNotLastRow)
View file
x265_3.6.tar.gz/source/common/cudata.h -> x265_4.0.tar.gz/source/common/cudata.h
Changed
@@ -37,6 +37,9 @@ class Slice; struct TUEntropyCodingParameters; struct CUDataMemPool; +#if ENABLE_SCC_EXT +struct IBC; +#endif enum PartSize { @@ -107,6 +110,8 @@ // Collocated right bottom CU addr. uint32_t cuAddr2; + bool isAvailable; + // For spatial prediction, this field contains the reference index // in each list (-1 if not available). // @@ -118,6 +123,14 @@ union { int16_t refIdx2; int32_t unifiedRef; }; }; +struct IBC +{ + int m_numBVs; + int m_numBV16s; + MV m_BVs64; + MV m_lastIntraBCMv2; +}; + typedef void(*cucopy_t)(uint8_t* dst, uint8_t* src); // dst and src are aligned to MIN(size, 32) typedef void(*cubcast_t)(uint8_t* dst, uint8_t val); // dst is aligned to MIN(size, 32) @@ -230,13 +243,17 @@ uint32_t* m_collectCUVariance; uint32_t* m_collectCUCount; +#if ENABLE_SCC_EXT + MV m_lastIntraBCMv2; +#endif + CUData(); void initialize(const CUDataMemPool& dataPool, uint32_t depth, const x265_param& param, int instance); static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArrayCUGeom::MAX_GEOMS); void initCTU(const Frame& frame, uint32_t cuAddr, int qp, uint32_t firstRowInSlice, uint32_t lastRowInSlice, uint32_t lastCUInSlice); - void initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp); + void initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp, MV lastIntraBCMv2 = 0); void initLosslessCU(const CUData& cu, const CUGeom& cuGeom); void copyPartFrom(const CUData& cu, const CUGeom& childGeom, uint32_t subPartIdx); @@ -272,7 +289,7 @@ int8_t getRefQP(uint32_t currAbsIdxInCTU) const; uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)2, uint8_t* candDir) const; void clipMv(MV& outMV) const; - int getPMV(InterNeighbourMV *neighbours, uint32_t reference_list, uint32_t refIdx, MV* amvpCand, MV* pmv) const; + int getPMV(InterNeighbourMV* neighbours, uint32_t reference_list, uint32_t refIdx, MV* amvpCand, MV* pmv, uint32_t puIdx = 0, uint32_t absPartIdx = 0) const; void getNeighbourMV(uint32_t puIdx, uint32_t absPartIdx, InterNeighbourMV* neighbours) const; void getIntraTUQtDepthRange(uint32_t tuDepthRange2, uint32_t absPartIdx) const; void getInterTUQtDepthRange(uint32_t tuDepthRange2, uint32_t absPartIdx) const; @@ -309,6 +326,15 @@ const CUData* getPUAboveRightAdi(uint32_t& arPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const; const CUData* getPUBelowLeftAdi(uint32_t& blPartUnitIdx, uint32_t curPartUnitIdx, uint32_t partUnitOffset) const; +#if ENABLE_SCC_EXT + void getIntraBCMVPsEncOnly(uint32_t absPartIdx, MV* MvPred, int& nbPred, int puIdx); + bool getDerivedBV(uint32_t absPartIdx, const MV& currentMv, MV& derivedMv, uint32_t width, uint32_t height); + bool isIntraBC(const CUData* cu, uint32_t absPartIdx) const; + bool getColMVPIBC(int ctuRsAddr, int partUnitIdx, MV& rcMv); + void roundMergeCandidates(MVField(*candMvField)2, int iCount) const; + bool is8x8BipredRestriction(MV mvL0, MV mvL1, int iRefIdxL0, int iRefIdxL1) const; +#endif + protected: template<typename T>
View file
x265_3.6.tar.gz/source/common/dct.cpp -> x265_4.0.tar.gz/source/common/dct.cpp
Changed
@@ -439,7 +439,8 @@ } } -static void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride) +namespace X265_NS { +void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride) { const int shift_1st = 1 + X265_DEPTH - 8; const int shift_2nd = 8; @@ -456,7 +457,7 @@ fastForwardDst(coef, dst, shift_2nd); } -static void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride) +void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride) { const int shift_1st = 1 + X265_DEPTH - 8; const int shift_2nd = 8; @@ -473,7 +474,7 @@ partialButterfly4(coef, dst, shift_2nd, 4); } -static void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride) +void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride) { const int shift_1st = 2 + X265_DEPTH - 8; const int shift_2nd = 9; @@ -490,7 +491,7 @@ partialButterfly8(coef, dst, shift_2nd, 8); } -static void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride) +void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride) { const int shift_1st = 3 + X265_DEPTH - 8; const int shift_2nd = 10; @@ -507,7 +508,7 @@ partialButterfly16(coef, dst, shift_2nd, 16); } -static void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride) +void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride) { const int shift_1st = 4 + X265_DEPTH - 8; const int shift_2nd = 11; @@ -524,7 +525,7 @@ partialButterfly32(coef, dst, shift_2nd, 32); } -static void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride) +void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride) { const int shift_1st = 7; const int shift_2nd = 12 - (X265_DEPTH - 8); @@ -541,7 +542,7 @@ } } -static void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride) +void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride) { const int shift_1st = 7; const int shift_2nd = 12 - (X265_DEPTH - 8); @@ -558,7 +559,7 @@ } } -static void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride) +void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride) { const int shift_1st = 7; const int shift_2nd = 12 - (X265_DEPTH - 8); @@ -575,7 +576,7 @@ } } -static void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride) +void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride) { const int shift_1st = 7; const int shift_2nd = 12 - (X265_DEPTH - 8); @@ -592,7 +593,7 @@ } } -static void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride) +void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride) { const int shift_1st = 7; const int shift_2nd = 12 - (X265_DEPTH - 8); @@ -608,6 +609,7 @@ memcpy(&dsti * dstStride, &blocki * 32, 32 * sizeof(int16_t)); } } +} // namespace X265_NS static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift) {
View file
x265_3.6.tar.gz/source/common/deblock.cpp -> x265_4.0.tar.gz/source/common/deblock.cpp
Changed
@@ -316,7 +316,7 @@
 
 void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[])
 {
-    PicYuv* reconPic = cuQ->m_encData->m_reconPic;
+    PicYuv* reconPic = cuQ->m_encData->m_reconPic[0];
     pixel* src = reconPic->getLumaAddr(cuQ->m_cuAddr, absPartIdx);
     intptr_t stride = reconPic->m_stride;
     const PPS* pps = cuQ->m_slice->m_pps;
@@ -429,7 +429,7 @@
                : ((g_zscanToPelY[absPartIdx] + edge * UNIT_SIZE) >> cuQ->m_vChromaShift)) % DEBLOCK_SMALLEST_BLOCK == 0,
                "invalid edge\n");
 
-    PicYuv* reconPic = cuQ->m_encData->m_reconPic;
+    PicYuv* reconPic = cuQ->m_encData->m_reconPic[0];
     intptr_t stride = reconPic->m_strideC;
     intptr_t srcOffset = reconPic->getChromaAddrOffset(cuQ->m_cuAddr, absPartIdx);
     bool bCheckNoFilter = pps->bTransquantBypassEnabled;
View file
x265_3.6.tar.gz/source/common/frame.cpp -> x265_4.0.tar.gz/source/common/frame.cpp
Changed
@@ -37,7 +37,8 @@ m_reconColCount = NULL; m_countRefEncoders = 0; m_encData = NULL; - m_reconPic = NULL; + for (int i = 0; i < NUM_RECON_VERSION; i++) + m_reconPici = NULL; m_quantOffsets = NULL; m_next = NULL; m_prev = NULL; @@ -75,6 +76,11 @@ m_tempLayer = 0; m_sameLayerRefPic = false; + + m_viewId = 0; + m_valid = 0; + m_nextSubDPB = NULL; + m_prevSubDPB = NULL; } bool Frame::create(x265_param *param, float* quantOffsets) @@ -85,6 +91,7 @@ if (m_param->bEnableTemporalFilter) { m_mcstf = new TemporalFilter; + m_mcstf->m_range = param->mcstfFrameRange; m_mcstf->init(param); m_fencPicSubsampled2 = new PicYuv; @@ -198,29 +205,35 @@ bool Frame::allocEncodeData(x265_param *param, const SPS& sps) { m_encData = new FrameData; - m_reconPic = new PicYuv; m_param = param; - m_encData->m_reconPic = m_reconPic; - bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic->create(param); + for (int i = 0; i < !!m_param->bEnableSCC + 1; i++) + { + m_reconPici = new PicYuv; + m_encData->m_reconPici = m_reconPici; + } + bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic0->create(param) && (param->bEnableSCC ? (param->bEnableSCC && m_reconPic1->create(param)) : 1); if (ok) { - /* initialize right border of m_reconpicYuv as SAO may read beyond the + /* initialize right border of m_reconPicYuv as SAO may read beyond the * end of the picture accessing uninitialized pixels */ int maxHeight = sps.numCuInHeight * param->maxCUSize; - memset(m_reconPic->m_picOrg0, 0, sizeof(pixel)* m_reconPic->m_stride * maxHeight); + memset(m_reconPic0->m_picOrg0, 0, sizeof(pixel)* m_reconPic0->m_stride * maxHeight); - /* use pre-calculated cu/pu offsets cached in the SPS structure */ - m_reconPic->m_cuOffsetY = sps.cuOffsetY; - m_reconPic->m_buOffsetY = sps.buOffsetY; - - if (param->internalCsp != X265_CSP_I400) + for (int i = 0; i < !!m_param->bEnableSCC + 1; i++) { - memset(m_reconPic->m_picOrg1, 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift)); - memset(m_reconPic->m_picOrg2, 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift)); - /* use pre-calculated cu/pu offsets cached in the SPS structure */ - m_reconPic->m_cuOffsetC = sps.cuOffsetC; - m_reconPic->m_buOffsetC = sps.buOffsetC; + m_reconPici->m_cuOffsetY = sps.cuOffsetY; + m_reconPici->m_buOffsetY = sps.buOffsetY; + + if (param->internalCsp != X265_CSP_I400) + { + memset(m_reconPici->m_picOrg1, 0, sizeof(pixel) * m_reconPici->m_strideC * (maxHeight >> m_reconPici->m_vChromaShift)); + memset(m_reconPici->m_picOrg2, 0, sizeof(pixel) * m_reconPici->m_strideC * (maxHeight >> m_reconPici->m_vChromaShift)); + + /* use pre-calculated cu/pu offsets cached in the SPS structure */ + m_reconPici->m_cuOffsetC = sps.cuOffsetC; + m_reconPici->m_buOffsetC = sps.buOffsetC; + } } } return ok; @@ -230,7 +243,8 @@ void Frame::reinit(const SPS& sps) { m_bChromaExtended = false; - m_reconPic = m_encData->m_reconPic; + for (int i = 0; i < !!m_param->bEnableSCC + 1; i++) + m_reconPici = m_encData->m_reconPici; m_encData->reinit(sps); } @@ -243,6 +257,35 @@ m_encData = NULL; } +#if ENABLE_MULTIVIEW + //Destroy interlayer References + if (refPicSetInterLayer0.size()) + { + Frame* iterFrame = refPicSetInterLayer0.first(); + + while (iterFrame) + { + Frame* curFrame = iterFrame; + iterFrame = iterFrame->m_nextSubDPB; + refPicSetInterLayer0.removeSubDPB(*curFrame); + iterFrame = refPicSetInterLayer0.first(); + } + } + + if (refPicSetInterLayer1.size()) + { + Frame* iterFrame = 
refPicSetInterLayer1.first(); + + while (iterFrame) + { + Frame* curFrame = iterFrame; + iterFrame = iterFrame->m_nextSubDPB; + refPicSetInterLayer1.removeSubDPB(*curFrame); + iterFrame = refPicSetInterLayer1.first(); + } + } +#endif + if (m_fencPic) { if (m_param->bCopyPicToFrame) @@ -271,11 +314,14 @@ X265_FREE(m_isSubSampled); } - if (m_reconPic) + for (int i = 0; i < !!m_param->bEnableSCC + 1; i++) { - m_reconPic->destroy(); - delete m_reconPic; - m_reconPic = NULL; + if (m_reconPici) + { + m_reconPici->destroy(); + delete m_reconPici; + m_reconPici = NULL; + } } if (m_reconRowFlag)
View file
x265_3.6.tar.gz/source/common/frame.h -> x265_4.0.tar.gz/source/common/frame.h
Changed
@@ -81,13 +81,16 @@
     /* These two items will be NULL until the Frame begins to be encoded, at which point
      * it will be assigned a FrameData instance, which comes with a reconstructed image PicYuv */
     FrameData* m_encData;
-    PicYuv* m_reconPic;
+    PicYuv* m_reconPic[NUM_RECON_VERSION];
 
     /* Data associated with x265_picture */
     PicYuv* m_fencPic;
     PicYuv* m_fencPicSubsampled2;
     PicYuv* m_fencPicSubsampled4;
 
+    PicList refPicSetInterLayer0;
+    PicList refPicSetInterLayer1;
+
     int m_poc;
     int m_encodeOrder;
     int m_gopOffset;
@@ -161,6 +164,13 @@
     int8_t m_gopId;
     bool m_sameLayerRefPic;
 
+    int m_sLayerId;
+    bool m_valid;
+
+    int m_viewId;
+    Frame* m_nextSubDPB; // PicList doubly linked list pointers
+    Frame* m_prevSubDPB;
+
     Frame();
 
     bool create(x265_param *param, float* quantOffsets);
View file
x265_3.6.tar.gz/source/common/framedata.h -> x265_4.0.tar.gz/source/common/framedata.h
Changed
@@ -115,7 +115,7 @@
     const x265_param* m_param;
     FrameData* m_freeListNext;
-    PicYuv* m_reconPic;
+    PicYuv* m_reconPic[NUM_RECON_VERSION];
     bool m_bHasReferences; /* used during DPB/RPS updates */
     int m_frameEncoderID; /* the ID of the FrameEncoder encoding this frame */
     JobProvider* m_jobProvider;
View file
x265_3.6.tar.gz/source/common/ipfilter.cpp -> x265_4.0.tar.gz/source/common/ipfilter.cpp
Changed
@@ -34,8 +34,8 @@
 #pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
 #endif
 
-namespace {
-// file local namespace
+namespace X265_NS {
+// x265 private namespace
 
 template<int width, int height>
 void filterPixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
@@ -367,10 +367,6 @@
     interp_horiz_ps_c<N, width, height>(src, srcStride, immed, width, idxX, 1);
     filterVertical_sp_c<N>(immed + (N / 2 - 1) * width, width, dst, dstStride, width, height, idxY);
 }
-}
-
-namespace X265_NS {
-// x265 private namespace
 
 #define CHROMA_420(W, H) \
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
View file
x265_3.6.tar.gz/source/common/loopfilter.cpp -> x265_4.0.tar.gz/source/common/loopfilter.cpp
Changed
@@ -30,16 +30,10 @@
 
 namespace {
 
-/* get the sign of input variable (TODO: this is a dup, make common) */
-inline int8_t signOf(int x)
-{
-    return (x >> 31) | ((int)(((uint32_t)-x) >> 31));
-}
-
 static void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
 {
     for (int x = 0; x < endX; x++)
-        dst[x] = signOf(src1[x] - src2[x]);
+        dst[x] = x265_signOf(src1[x] - src2[x]);
 }
 
 static void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride)
@@ -70,7 +64,7 @@
 
     for (x = 0; x < width; x++)
     {
-        signDown = signOf(rec[x] - rec[x + stride]);
+        signDown = x265_signOf(rec[x] - rec[x + stride]);
         edgeType = signDown + upBuff1[x] + 2;
         upBuff1[x] = -signDown;
         rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
@@ -87,7 +81,7 @@
     {
         for (x = 0; x < width; x++)
         {
-            signDown = signOf(rec[x] - rec[x + stride]);
+            signDown = x265_signOf(rec[x] - rec[x + stride]);
             edgeType = signDown + upBuff1[x] + 2;
             upBuff1[x] = -signDown;
             rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
@@ -101,7 +95,7 @@
     int x;
     for (x = 0; x < width; x++)
     {
-        int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
+        int8_t signDown = x265_signOf(rec[x] - rec[x + stride + 1]);
         int edgeType = signDown + buff1[x] + 2;
         bufft[x + 1] = -signDown;
         rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);;
@@ -115,7 +109,7 @@
 
     for (int x = startX + 1; x < endX; x++)
     {
-        signDown = signOf(rec[x] - rec[x + stride]);
+        signDown = x265_signOf(rec[x] - rec[x + stride]);
         edgeType = signDown + upBuff1[x] + 2;
         upBuff1[x - 1] = -signDown;
         rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
View file
x265_3.6.tar.gz/source/common/lowpassdct.cpp -> x265_4.0.tar.gz/source/common/lowpassdct.cpp
Changed
@@ -58,7 +58,7 @@
     }
 
     // replace first coef with total block average
-    dst[0] = totalSum << 1;
+    dst[0] = (X265_DEPTH == 8) ? (totalSum << 1) : (totalSum >> ((X265_DEPTH - 9)));
 }
 
 static void lowPassDct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
@@ -83,7 +83,7 @@
     {
         memcpy(&dst[i * 16], &coef[i * 8], 8 * sizeof(int16_t));
     }
-    dst[0] = static_cast<int16_t>(totalSum >> 1);
+    dst[0] = static_cast<int16_t>(totalSum >> (1 + (X265_DEPTH - 8)));
 }
 
 static void lowPassDct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
@@ -108,7 +108,7 @@
     {
         memcpy(&dst[i * 32], &coef[i * 16], 16 * sizeof(int16_t));
     }
-    dst[0] = static_cast<int16_t>(totalSum >> 3);
+    dst[0] = static_cast<int16_t>(totalSum >> (3 + (X265_DEPTH - 8)));
 }
 
 namespace X265_NS {
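These hunks fix the substituted DC coefficient for high bit depth builds: the block average kept in totalSum scales with X265_DEPTH, so the final shift now widens by (X265_DEPTH - 8). A small illustration with a hypothetical helper, not part of x265, showing how the lowPassDct16_c and lowPassDct32_c shifts grow; lowPassDct8_c uses the slightly different expression visible in the first hunk:

#include <cstdio>

// Hypothetical helper mirroring the change above: the DC right-shift used by
// lowPassDct16_c (base 1) and lowPassDct32_c (base 3) grows by the number of
// bits the internal depth exceeds 8.
static int lowpassDcShift(int baseShift, int bitDepth)
{
    return baseShift + (bitDepth - 8);
}

int main()
{
    const int depths[] = { 8, 10, 12 };
    for (int depth : depths)
        printf("X265_DEPTH=%d -> dct16 shift %d, dct32 shift %d\n",
               depth, lowpassDcShift(1, depth), lowpassDcShift(3, depth));
    return 0;
}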
View file
x265_3.6.tar.gz/source/common/param.cpp -> x265_4.0.tar.gz/source/common/param.cpp
Changed
@@ -183,6 +183,7 @@ param->bEnableSceneCutAwareQp = 0; param->fwdMaxScenecutWindow = 1200; param->bwdMaxScenecutWindow = 600; + param->mcstfFrameRange = 2; for (int i = 0; i < 6; i++) { int deltas6 = { 5, 4, 3, 2, 1, 0 }; @@ -391,6 +392,10 @@ param->bEnableTemporalFilter = 0; param->temporalFilterStrength = 0.95; + /*Alpha Channel Encoding*/ + param->bEnableAlpha = 0; + param->numScalableLayers = 1; + #ifdef SVT_HEVC param->svtHevcParam = svtParam; svt_param_default(param); @@ -398,6 +403,15 @@ /* Film grain characteristics model filename */ param->filmGrain = NULL; param->bEnableSBRC = 0; + + /* Multi-View Encoding*/ + param->numViews = 1; + param->format = 0; + + param->numLayers = 1; + + /* SCC */ + param->bEnableSCC = 0; } int x265_param_default_preset(x265_param* param, const char* preset, const char* tune) @@ -417,6 +431,7 @@ if (!strcmp(preset, "ultrafast")) { + param->mcstfFrameRange = 1; param->maxNumMergeCand = 2; param->bIntraInBFrames = 0; param->lookaheadDepth = 5; @@ -441,6 +456,7 @@ } else if (!strcmp(preset, "superfast")) { + param->mcstfFrameRange = 1; param->maxNumMergeCand = 2; param->bIntraInBFrames = 0; param->lookaheadDepth = 10; @@ -461,6 +477,7 @@ } else if (!strcmp(preset, "veryfast")) { + param->mcstfFrameRange = 1; param->maxNumMergeCand = 2; param->limitReferences = 3; param->bIntraInBFrames = 0; @@ -474,6 +491,7 @@ } else if (!strcmp(preset, "faster")) { + param->mcstfFrameRange = 1; param->maxNumMergeCand = 2; param->limitReferences = 3; param->bIntraInBFrames = 0; @@ -485,6 +503,7 @@ } else if (!strcmp(preset, "fast")) { + param->mcstfFrameRange = 1; param->maxNumMergeCand = 2; param->limitReferences = 3; param->bEnableEarlySkip = 0; @@ -497,6 +516,7 @@ } else if (!strcmp(preset, "medium")) { + param->mcstfFrameRange = 1; /* defaults */ } else if (!strcmp(preset, "slow")) @@ -1437,6 +1457,33 @@ OPT("film-grain") p->filmGrain = (char* )value; OPT("mcstf") p->bEnableTemporalFilter = atobool(value); OPT("sbrc") p->bEnableSBRC = atobool(value); +#if ENABLE_ALPHA + OPT("alpha") + { + if (atobool(value)) + { + p->bEnableAlpha = 1; + p->numScalableLayers = 2; + p->numLayers = 2; + } + } +#endif +#if ENABLE_MULTIVIEW + OPT("format") + p->format = atoi(value); + OPT("num-views") + { + p->numViews = atoi(value); + } +#endif +#if ENABLE_SCC_EXT + OPT("scc") + { + p->bEnableSCC = atoi(value); + if (p->bEnableSCC) + p->bEnableWeightedPred = false; + } +#endif else return X265_PARAM_BAD_NAME; } @@ -1674,7 +1721,7 @@ CHECK(param->edgeVarThreshold < 0.0f || param->edgeVarThreshold > 1.0f, "Minimum edge density percentage for a CU should be an integer between 0 to 100"); } - CHECK(param->bframes && param->bframes >= param->lookaheadDepth && !param->rc.bStatRead, + CHECK(param->bframes && (param->bEnableTemporalFilter ? (param->bframes > param->lookaheadDepth) : (param->bframes >= param->lookaheadDepth)) && !param->rc.bStatRead, "Lookahead depth must be greater than the max consecutive bframe count"); CHECK(param->bframes < 0, "bframe count should be greater than zero"); @@ -1908,6 +1955,21 @@ } } CHECK(param->rc.dataShareMode != X265_SHARE_MODE_FILE && param->rc.dataShareMode != X265_SHARE_MODE_SHAREDMEM, "Invalid data share mode. 
It must be one of the X265_DATA_SHARE_MODES enum values\n" ); +#if ENABLE_ALPHA + if (param->bEnableAlpha) + { + CHECK((param->internalCsp != X265_CSP_I420), "Alpha encode supported only with i420a colorspace"); + CHECK((param->analysisMultiPassDistortion || param->analysisMultiPassRefine), "Alpha encode doesnot support multipass feature"); + } +#endif +#if ENABLE_MULTIVIEW + CHECK((param->numViews > 2), "Multi-View Encoding currently support only 2 views"); + CHECK((param->numViews > 1) && (param->internalBitDepth != 8), "BitDepthConstraint must be 8 for Multiview main profile"); + CHECK((param->numViews > 1) && (param->analysisMultiPassDistortion || param->analysisMultiPassRefine), "Multiview encode doesnot support multipass feature"); +#endif +#if ENABLE_SCC_EXT + CHECK(!!param->bEnableSCC&& param->rdLevel != 6, "Enabling scc extension in x265 requires rdlevel of 6 "); +#endif return check_failed; } @@ -2072,6 +2134,12 @@ TOOLOPT(param->rc.bStatWrite, "stats-write"); TOOLOPT(param->rc.bStatRead, "stats-read"); TOOLOPT(param->bSingleSeiNal, "single-sei"); +#if ENABLE_ALPHA + TOOLOPT(param->numScalableLayers > 1, "alpha"); +#endif +#if ENABLE_MULTIVIEW + TOOLOPT(param->numViews > 1, "multi-view"); +#endif #if ENABLE_HDR10_PLUS TOOLOPT(param->toneMapFile != NULL, "dhdr10-info"); #endif @@ -2336,6 +2404,16 @@ if (p->filmGrain) s += sprintf(s, " film-grain=%s", p->filmGrain); // Film grain characteristics model filename BOOL(p->bEnableTemporalFilter, "mcstf"); +#if ENABLE_ALPHA + BOOL(p->bEnableAlpha, "alpha"); +#endif +#if ENABLE_MULTIVIEW + s += sprintf(s, " num-views=%d", p->numViews); + s += sprintf(s, " format=%d", p->format); +#endif +#if ENABLE_SCC_EXT + s += sprintf(s, "scc=%d", p->bEnableSCC); +#endif BOOL(p->bEnableSBRC, "sbrc"); #undef BOOL return buf; @@ -2558,6 +2636,7 @@ void x265_copy_params(x265_param* dst, x265_param* src) { + dst->mcstfFrameRange = src->mcstfFrameRange; dst->cpuid = src->cpuid; dst->frameNumThreads = src->frameNumThreads; if (src->numaPools) dst->numaPools = strdup(src->numaPools); @@ -2856,6 +2935,18 @@ dst->confWinRightOffset = src->confWinRightOffset; dst->confWinBottomOffset = src->confWinBottomOffset; dst->bliveVBV2pass = src->bliveVBV2pass; +#if ENABLE_ALPHA + dst->bEnableAlpha = src->bEnableAlpha; + dst->numScalableLayers = src->numScalableLayers; +#endif +#if ENABLE_MULTIVIEW + dst->numViews = src->numViews; + dst->format = src->format; +#endif + dst->numLayers = src->numLayers; +#if ENABLE_SCC_EXT
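The new alpha, num-views/format and scc switches shown in this file are ordinary x265_param_parse() names, so they can also be set through the library API. A rough usage sketch, not from the repository; because the OPT() handlers above are compiled only under ENABLE_ALPHA, ENABLE_MULTIVIEW and ENABLE_SCC_EXT, a build without the feature simply rejects the name:

#include <x265.h>
#include <cstdio>

int main()
{
    x265_param* p = x265_param_alloc();
    x265_param_default_preset(p, "medium", NULL);

    // Option names match the OPT() handlers added in param.cpp; on builds
    // without the matching ENABLE_* flag x265_param_parse() returns
    // X265_PARAM_BAD_NAME for them.
    int err = 0;
    err |= x265_param_parse(p, "alpha", "1");
    err |= x265_param_parse(p, "num-views", "2");
    err |= x265_param_parse(p, "scc", "1");
    if (err)
        fprintf(stderr, "this libx265 build rejects one of the new options\n");

    x265_param_free(p);
    return 0;
}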
View file
x265_3.6.tar.gz/source/common/piclist.cpp -> x265_4.0.tar.gz/source/common/piclist.cpp
Changed
@@ -82,6 +82,82 @@ m_count++; } +#if ENABLE_MULTIVIEW +Frame* PicList::popFrontSubDPB() +{ + if (m_start) + { + Frame* temp = m_start; + m_count--; + + if (m_count) + { + m_start = m_start->m_nextSubDPB; + m_start->m_prevSubDPB = NULL; + } + else + { + m_start = m_end = NULL; + } + temp->m_next = temp->m_prev = NULL; + return temp; + } + else + return NULL; +} + +void PicList::pushBackSubDPB(Frame& curFrame) +{ + X265_CHECK(!curFrame.m_nextSubDPB && !curFrame.m_prevSubDPB, "piclist: picture already in Sub DPB list\n"); // ensure frame is not in a list + curFrame.m_nextSubDPB = NULL; + curFrame.m_prevSubDPB = m_end; + + if (m_count) + { + m_end->m_nextSubDPB = &curFrame; + m_end = &curFrame; + } + else + { + m_start = m_end = &curFrame; + } + m_count++; +} + +void PicList::removeSubDPB(Frame& curFrame) +{ +#if _DEBUG + Frame* tmp = m_start; + while (tmp && tmp != &curFrame) + { + tmp = tmp->m_nextSubDPB; + } + + X265_CHECK(tmp == &curFrame, "piclist: pic being removed was not in list\n"); // verify pic is in this list +#endif + + m_count--; + if (m_count) + { + if (m_start == &curFrame) + m_start = curFrame.m_nextSubDPB; + if (m_end == &curFrame) + m_end = curFrame.m_prevSubDPB; + + if (curFrame.m_nextSubDPB) + curFrame.m_nextSubDPB->m_prevSubDPB = curFrame.m_prevSubDPB; + if (curFrame.m_prevSubDPB) + curFrame.m_prevSubDPB->m_nextSubDPB = curFrame.m_nextSubDPB; + } + else + { + m_start = m_end = NULL; + } + + curFrame.m_nextSubDPB = curFrame.m_prevSubDPB = NULL; +} +#endif + void PicList::pushBackMCSTF(Frame& curFrame) { X265_CHECK(!curFrame.m_nextMCSTF && !curFrame.m_prevMCSTF, "piclist: picture already in OPB list\n"); // ensure frame is not in a list @@ -123,11 +199,16 @@ return NULL; } -Frame* PicList::getPOC(int poc) +Frame* PicList::getPOC(int poc, int sLayerId) { Frame *curFrame = m_start; - while (curFrame && curFrame->m_poc != poc) + int layer = curFrame->m_param->numViews > 1 ? curFrame->m_viewId : (curFrame->m_param->numScalableLayers > 1) ? curFrame->m_sLayerId : 0; + while (curFrame && (curFrame->m_poc != poc || layer != sLayerId)) + { curFrame = curFrame->m_next; + if(curFrame) + layer = curFrame->m_param->numViews > 1 ? curFrame->m_viewId : (curFrame->m_param->numScalableLayers > 1) ? curFrame->m_sLayerId : 0; + } return curFrame; } @@ -185,10 +266,11 @@ return NULL; } -Frame* PicList::getCurFrame(void) +Frame* PicList::getCurFrame(int sLayer) { Frame *curFrame = m_start; - if (curFrame != NULL) + int layer = curFrame->m_param->numViews > 1 ? curFrame->m_viewId : (curFrame->m_param->numScalableLayers > 1) ? curFrame->m_sLayerId : 0; + if (layer == sLayer && curFrame != NULL) return curFrame; else return NULL; @@ -227,6 +309,42 @@ curFrame.m_next = curFrame.m_prev = NULL; } + +Frame* PicList::removeFrame(Frame& curFrame) +{ + Frame* tmp = &curFrame; +#if _DEBUG + tmp = m_start; + while (tmp && tmp != &curFrame) + { + tmp = tmp->m_next; + } + + X265_CHECK(tmp == &curFrame, "piclist: pic being removed was not in list\n"); // verify pic is in this list +#endif + + m_count--; + if (m_count) + { + if (m_start == &curFrame) + m_start = curFrame.m_next; + if (m_end == &curFrame) + m_end = curFrame.m_prev; + + if (curFrame.m_next) + curFrame.m_next->m_prev = curFrame.m_prev; + if (curFrame.m_prev) + curFrame.m_prev->m_next = curFrame.m_next; + } + else + { + m_start = m_end = NULL; + } + + curFrame.m_next = curFrame.m_prev = NULL; + return tmp; +} + void PicList::removeMCSTF(Frame& curFrame) { #if _DEBUG
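The new removeFrame() and the sub-DPB push/remove helpers above follow the same doubly linked list bookkeeping already used by PicList::remove(). A stripped-down sketch of that unlink pattern with a hypothetical Node type, not an x265 class:

#include <cassert>

struct Node
{
    Node* prev = nullptr;
    Node* next = nullptr;
};

// Unlink n from the list bounded by head/tail and return it, mirroring the
// shape of PicList::removeFrame(): fix the ends first, then the neighbours.
static Node* unlink(Node*& head, Node*& tail, Node* n)
{
    if (head == n) head = n->next;
    if (tail == n) tail = n->prev;
    if (n->next) n->next->prev = n->prev;
    if (n->prev) n->prev->next = n->next;
    n->next = n->prev = nullptr;
    return n;
}

int main()
{
    Node a, b, c;
    a.next = &b; b.prev = &a;
    b.next = &c; c.prev = &b;
    Node* head = &a;
    Node* tail = &c;

    unlink(head, tail, &b);
    assert(head == &a && tail == &c);
    assert(a.next == &c && c.prev == &a);
    return 0;
}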
View file
x265_3.6.tar.gz/source/common/piclist.h -> x265_4.0.tar.gz/source/common/piclist.h
Changed
@@ -50,10 +50,16 @@ /** Push picture to end of the list */ void pushBack(Frame& pic); void pushBackMCSTF(Frame& pic); +#if ENABLE_MULTIVIEW + void pushBackSubDPB(Frame& pic); +#endif /** Push picture to beginning of the list */ void pushFront(Frame& pic); void pushFrontMCSTF(Frame& pic); +#if ENABLE_MULTIVIEW + Frame* popFrontSubDPB(); +#endif /** Pop picture from end of the list */ Frame* popBack(); @@ -63,17 +69,24 @@ Frame* popFront(); /** Find frame with specified POC */ - Frame* getPOC(int poc); + Frame* getPOC(int poc, int sLayerId = 0); /* Find next MCSTF frame with specified POC */ Frame* getPOCMCSTF(int poc); /** Get the current Frame from the list **/ - Frame* getCurFrame(void); + Frame* getCurFrame(int sLayer); /** Remove picture from list */ void remove(Frame& pic); + + /** Remove picture from list */ + Frame* removeFrame(Frame& pic); /* Remove MCSTF picture from list */ void removeMCSTF(Frame& pic); +#if ENABLE_MULTIVIEW + /** Remove picture from Sub list */ + void removeSubDPB(Frame& pic); +#endif Frame* first() { return m_start; }
View file
x265_3.6.tar.gz/source/common/picyuv.cpp -> x265_4.0.tar.gz/source/common/picyuv.cpp
Changed
@@ -258,7 +258,7 @@ /* Copy pixels from an x265_picture into internal PicYuv instance. * Shift pixels as necessary, mask off bits above X265_DEPTH for safety. */ -void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady) +void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady, bool isBase) { /* m_picWidth is the width that is being encoded, padx indicates how many * of those pixels are padding to reach multiple of MinCU(4) size. @@ -321,78 +321,157 @@ #else /* Case for (X265_DEPTH == 8) */ // TODO: Does we need this path? may merge into above in future { - pixel *yPixel = m_picOrg0; - uint8_t *yChar = (uint8_t*)pic.planes0; - - for (int r = 0; r < height; r++) + if (isBase || param.numViews > 1) { - memcpy(yPixel, yChar, width * sizeof(pixel)); + int offsetX, offsetY; + offsetX = (!isBase && pic.format == 1 ? width : 0); + offsetY = (!isBase && pic.format == 2 ? pic.stride0 * height : 0); + pixel *yPixel = m_picOrg0; + uint8_t* yChar = (uint8_t*)pic.planes0 + offsetX + offsetY; - yPixel += m_stride; - yChar += pic.stride0 / sizeof(*yChar); - } + for (int r = 0; r < height; r++) + { + memcpy(yPixel, yChar, width * sizeof(pixel)); - if (param.internalCsp != X265_CSP_I400) + yPixel += m_stride; + yChar += pic.stride0 / sizeof(*yChar); + } + + if (param.internalCsp != X265_CSP_I400) + { + offsetX = offsetX >> m_hChromaShift; + int offsetYU = (!isBase && pic.format == 2 ? pic.stride1 * (height >> m_vChromaShift) : 0); + int offsetYV = (!isBase && pic.format == 2 ? pic.stride2 * (height >> m_vChromaShift) : 0); + + pixel *uPixel = m_picOrg1; + pixel *vPixel = m_picOrg2; + + uint8_t* uChar = (uint8_t*)pic.planes1 + offsetX + offsetYU; + uint8_t* vChar = (uint8_t*)pic.planes2 + offsetX + offsetYV; + + for (int r = 0; r < height >> m_vChromaShift; r++) + { + memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel)); + memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel)); + + uPixel += m_strideC; + vPixel += m_strideC; + uChar += pic.stride1 / sizeof(*uChar); + vChar += pic.stride2 / sizeof(*vChar); + } + } + } +#if ENABLE_ALPHA + if (!isBase && param.bEnableAlpha) { - pixel *uPixel = m_picOrg1; - pixel *vPixel = m_picOrg2; + pixel* aPixel = m_picOrg0; + uint8_t* aChar = (uint8_t*)pic.planes3; - uint8_t *uChar = (uint8_t*)pic.planes1; - uint8_t *vChar = (uint8_t*)pic.planes2; + for (int r = 0; r < height; r++) + { + memcpy(aPixel, aChar, width * sizeof(pixel)); + + aPixel += m_stride; + aChar += pic.stride0 / sizeof(*aChar); + } + + pixel* uPixel = m_picOrg1; + pixel* vPixel = m_picOrg2; for (int r = 0; r < height >> m_vChromaShift; r++) { - memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel)); - memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel)); + memset(uPixel, 128, (width >> m_hChromaShift) * sizeof(pixel)); + memset(vPixel, 128, (width >> m_hChromaShift) * sizeof(pixel)); uPixel += m_strideC; vPixel += m_strideC; - uChar += pic.stride1 / sizeof(*uChar); - vChar += pic.stride2 / sizeof(*vChar); } } +#endif } #endif /* (X265_DEPTH > 8) */ } else /* pic.bitDepth > 8 */ { /* defensive programming, mask off bits that are supposed to be zero */ - uint16_t mask = (1 << X265_DEPTH) - 1; - int shift = abs(pic.bitDepth - X265_DEPTH); - pixel *yPixel = m_picOrg0; + if (isBase) + { + uint16_t mask = (1 << X265_DEPTH) - 1; + int shift = abs(pic.bitDepth - X265_DEPTH); + pixel* yPixel = m_picOrg0; - uint16_t *yShort = (uint16_t*)pic.planes0; + uint16_t* yShort = 
(uint16_t*)pic.planes0; - if (pic.bitDepth > X265_DEPTH) - { - /* shift right and mask pixels to final size */ - primitives.planecopy_sp(yShort, pic.stride0 / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask); - } - else /* Case for (pic.bitDepth <= X265_DEPTH) */ - { - /* shift left and mask pixels to final size */ - primitives.planecopy_sp_shl(yShort, pic.stride0 / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask); - } + if (pic.bitDepth > X265_DEPTH) + { + /* shift right and mask pixels to final size */ + primitives.planecopy_sp(yShort, pic.stride0 / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask); + } + else /* Case for (pic.bitDepth <= X265_DEPTH) */ + { + /* shift left and mask pixels to final size */ + primitives.planecopy_sp_shl(yShort, pic.stride0 / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask); + } - if (param.internalCsp != X265_CSP_I400) + if (param.internalCsp != X265_CSP_I400) + { + pixel* uPixel = m_picOrg1; + pixel* vPixel = m_picOrg2; + + uint16_t* uShort = (uint16_t*)pic.planes1; + uint16_t* vShort = (uint16_t*)pic.planes2; + + if (pic.bitDepth > X265_DEPTH) + { + primitives.planecopy_sp(uShort, pic.stride1 / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask); + primitives.planecopy_sp(vShort, pic.stride2 / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask); + } + else /* Case for (pic.bitDepth <= X265_DEPTH) */ + { + primitives.planecopy_sp_shl(uShort, pic.stride1 / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask); + primitives.planecopy_sp_shl(vShort, pic.stride2 / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask); + } + } + } +#if ENABLE_ALPHA + if (!isBase && param.bEnableAlpha) { - pixel *uPixel = m_picOrg1; - pixel *vPixel = m_picOrg2; + /* defensive programming, mask off bits that are supposed to be zero */ + uint16_t mask = (1 << X265_DEPTH) - 1; + int shift = abs(pic.bitDepth - X265_DEPTH); + pixel* yPixel = m_picOrg0; - uint16_t *uShort = (uint16_t*)pic.planes1; - uint16_t *vShort = (uint16_t*)pic.planes2; + uint16_t* yShort = (uint16_t*)pic.planes3; if (pic.bitDepth > X265_DEPTH) { - primitives.planecopy_sp(uShort, pic.stride1 / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask); - primitives.planecopy_sp(vShort, pic.stride2 / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask); + /* shift right and mask pixels to final size */ + primitives.planecopy_sp(yShort, pic.stride0 / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask); } else /* Case for (pic.bitDepth <= X265_DEPTH) */ { - primitives.planecopy_sp_shl(uShort, pic.stride1 / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask); - primitives.planecopy_sp_shl(vShort, pic.stride2 / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask); + /* shift left and mask pixels to final size */ + primitives.planecopy_sp_shl(yShort, pic.stride0 / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask); + } + + if (param.internalCsp != X265_CSP_I400) + { + pixel* uPixel = m_picOrg1; + pixel* vPixel = m_picOrg2; + + for (int r = 0; r < height >> m_vChromaShift; r++) + { + for (int c = 0; c < (width >> m_hChromaShift); c++) + { + uPixelc = ((1 << X265_DEPTH) >> 1); + 
vPixelc = ((1 << X265_DEPTH) >> 1); + } + uPixel += m_strideC; + vPixel += m_strideC;
View file
x265_3.6.tar.gz/source/common/picyuv.h -> x265_4.0.tar.gz/source/common/picyuv.h
Changed
@@ -83,7 +83,7 @@
     void destroy();
     int getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp);
 
-    void copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);
+    void copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady, bool isBase = true);
     void copyFromFrame(PicYuv* source);
 
     intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
View file
x265_3.6.tar.gz/source/common/pixel.cpp -> x265_4.0.tar.gz/source/common/pixel.cpp
Changed
@@ -266,10 +266,6 @@ { int satd = 0; -#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH - pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon; -#endif - for (int row = 0; row < h; row += 4) for (int col = 0; col < w; col += 4) satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1, @@ -284,10 +280,6 @@ { int satd = 0; -#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH - pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon; -#endif - for (int row = 0; row < h; row += 4) for (int col = 0; col < w; col += 8) satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
View file
x265_3.6.tar.gz/source/common/predict.cpp -> x265_4.0.tar.gz/source/common/predict.cpp
Changed
@@ -112,10 +112,22 @@ } else { - if (bLuma) - predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList0refIdx0, mv0); - if (bChroma) - predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList0refIdx0, mv0); +#if ENABLE_SCC_EXT + if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx0 - 1)) + { + if (bLuma) + predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refFrameList0refIdx0->m_reconPic1, mv0); + if (bChroma) + predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refFrameList0refIdx0->m_reconPic1, mv0); + } + else +#endif + { + if (bLuma) + predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList0refIdx0, mv0); + if (bChroma) + predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList0refIdx0, mv0); + } } } else @@ -174,12 +186,22 @@ if (bLuma) { - predInterLumaShort(pu, m_predShortYuv0, *cu.m_slice->m_refReconPicList0refIdx0, mv0); +#if ENABLE_SCC_EXT + if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx0 - 1)) + predInterLumaShort(pu, m_predShortYuv0, *cu.m_slice->m_refFrameList0refIdx0->m_reconPic1, mv0); + else +#endif + predInterLumaShort(pu, m_predShortYuv0, *cu.m_slice->m_refReconPicList0refIdx0, mv0); predInterLumaShort(pu, m_predShortYuv1, *cu.m_slice->m_refReconPicList1refIdx1, mv1); } if (bChroma) { - predInterChromaShort(pu, m_predShortYuv0, *cu.m_slice->m_refReconPicList0refIdx0, mv0); +#if ENABLE_SCC_EXT + if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx0 - 1)) + predInterChromaShort(pu, m_predShortYuv0, *cu.m_slice->m_refFrameList0refIdx0->m_reconPic1, mv0); + else +#endif + predInterChromaShort(pu, m_predShortYuv0, *cu.m_slice->m_refReconPicList0refIdx0, mv0); predInterChromaShort(pu, m_predShortYuv1, *cu.m_slice->m_refReconPicList1refIdx1, mv1); } @@ -206,10 +228,22 @@ } else { - if (bLuma) - predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList0refIdx0, mv0); - if (bChroma) - predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList0refIdx0, mv0); +#if ENABLE_SCC_EXT + if (cu.m_slice->m_param->bEnableSCC && refIdx0 == (cu.m_slice->m_numRefIdx0 - 1)) + { + if (bLuma) + predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refFrameList0refIdx0->m_reconPic1, mv0); + if (bChroma) + predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refFrameList0refIdx0->m_reconPic1, mv0); + } + else +#endif + { + if (bLuma) + predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList0refIdx0, mv0); + if (bChroma) + predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList0refIdx0, mv0); + } } } else @@ -602,7 +636,7 @@ int tuSize = 1 << intraNeighbors.log2TrSize; int tuSize2 = tuSize << 1; - PicYuv* reconPic = cu.m_encData->m_reconPic; + PicYuv* reconPic = cu.m_encData->m_reconPic0; pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx); intptr_t picStride = reconPic->m_stride; @@ -651,7 +685,7 @@ void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId) { - PicYuv* reconPic = cu.m_encData->m_reconPic; + PicYuv* reconPic = cu.m_encData->m_reconPic0; const pixel* adiOrigin = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx); intptr_t picStride = reconPic->m_strideC;
View file
x265_3.6.tar.gz/source/common/primitives.cpp -> x265_4.0.tar.gz/source/common/primitives.cpp
Changed
@@ -258,8 +258,8 @@ primitives.cui.intra_pred_allangs = NULL; #if ENABLE_ASSEMBLY -#if X265_ARCH_X86 - setupInstrinsicPrimitives(primitives, param->cpuid); +#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64) + setupIntrinsicPrimitives(primitives, param->cpuid); #endif setupAssemblyPrimitives(primitives, param->cpuid); #endif
View file
x265_3.6.tar.gz/source/common/primitives.h -> x265_4.0.tar.gz/source/common/primitives.h
Changed
@@ -470,12 +470,9 @@ } void setupCPrimitives(EncoderPrimitives &p); -void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask); +void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask); void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask); void setupAliasPrimitives(EncoderPrimitives &p); -#if X265_ARCH_ARM64 -void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask); -#endif #if HAVE_ALTIVEC void setupPixelPrimitives_altivec(EncoderPrimitives &p); void setupDCTPrimitives_altivec(EncoderPrimitives &p);
View file
x265_3.6.tar.gz/source/common/slice.cpp -> x265_4.0.tar.gz/source/common/slice.cpp
Changed
@@ -29,17 +29,83 @@ using namespace X265_NS; -void Slice::setRefPicList(PicList& picList) +#if ENABLE_MULTIVIEW +void Slice::createInterLayerReferencePictureSet(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1) { + + for (int i = 0; i < 1; i++) + { + int layerIdRef = 0;// getRefPicLayerId(i); + Frame* refPic = picList.getPOC(m_poc, 0); + int viewIdCur = 0; + int viewIdZero = 1; + int viewIdRef = 1; + + if ((viewIdCur <= viewIdZero && viewIdCur <= viewIdRef) || (viewIdCur >= viewIdZero && viewIdCur >= viewIdRef)) + { + refPicSetInterLayer0.pushBackSubDPB(*refPic); + } + else + { + refPicSetInterLayer1.pushBackSubDPB(*refPic); + } + } +} +#endif + +void Slice::setRefPicList(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1, int sLayerId) +{ + bool checkNumPocTotalCurr = m_param->bEnableSCC ? false : true; if (m_sliceType == I_SLICE) { memset(m_refFrameList, 0, sizeof(m_refFrameList)); memset(m_refReconPicList, 0, sizeof(m_refReconPicList)); memset(m_refPOCList, 0, sizeof(m_refPOCList)); m_numRefIdx1 = m_numRefIdx0 = 0; + +#if ENABLE_SCC_EXT + if (!checkNumPocTotalCurr) + { + if (m_rps.numberOfPictures == 0) + { + Frame* prevPic = picList.getPOC(X265_MAX(0, m_poc - 1)); + if (prevPic->m_poc != X265_MAX(0, m_poc - 1)) + { + prevPic = picList.getPOC(m_poc); + } + m_lastEncPic = prevPic; + } + return; + } +#endif + return; } +#if ENABLE_SCC_EXT || ENABLE_MULTIVIEW || ENABLE_ALPHA + /*Reset the number of references for I-slice marked as P-slice*/ + if ((m_param->bEnableSCC || sLayerId) && m_sliceType != m_origSliceType) + { + memset(m_refFrameList, 0, sizeof(m_refFrameList)); + memset(m_refReconPicList, 0, sizeof(m_refReconPicList)); + memset(m_refPOCList, 0, sizeof(m_refPOCList)); + m_numRefIdx0 = 1; + } +#endif + +#if ENABLE_SCC_EXT + if (!checkNumPocTotalCurr && m_rps.numberOfPictures == 0) + { + Frame* prevPic = picList.getPOC(X265_MAX(0, m_poc - 1)); + if (prevPic->m_poc != X265_MAX(0, m_poc - 1)) + { + prevPic = picList.getPOC(m_poc); + + } + m_lastEncPic = prevPic; + } +#endif + Frame* refPic = NULL; Frame* refPicSetStCurr0MAX_NUM_REF; Frame* refPicSetStCurr1MAX_NUM_REF; @@ -51,9 +117,9 @@ for (i = 0; i < m_rps.numberOfNegativePictures; i++) { - if (m_rps.bUsedi) + if (m_rps.bUsedi && m_origSliceType != I_SLICE) { - refPic = picList.getPOC(m_poc + m_rps.deltaPOCi); + refPic = picList.getPOC(m_poc + m_rps.deltaPOCi, m_rps.deltaPOCi ? sLayerId : 0); refPicSetStCurr0numPocStCurr0 = refPic; numPocStCurr0++; } @@ -61,9 +127,9 @@ for (; i < m_rps.numberOfNegativePictures + m_rps.numberOfPositivePictures; i++) { - if (m_rps.bUsedi) + if (m_rps.bUsedi && m_origSliceType != I_SLICE) { - refPic = picList.getPOC(m_poc + m_rps.deltaPOCi); + refPic = picList.getPOC(m_poc + m_rps.deltaPOCi, m_rps.deltaPOCi ? 
sLayerId : 0); refPicSetStCurr1numPocStCurr1 = refPic; numPocStCurr1++; } @@ -75,18 +141,44 @@ // ref_pic_list_init Frame* rpsCurrList0MAX_NUM_REF + 1; Frame* rpsCurrList1MAX_NUM_REF + 1; +#if ENABLE_MULTIVIEW + int numPocTotalCurr = numPocStCurr0 + numPocStCurr1 + numPocLtCurr + refPicSetInterLayer0.size() + refPicSetInterLayer1.size(); +#else int numPocTotalCurr = numPocStCurr0 + numPocStCurr1 + numPocLtCurr; +#endif + +#if ENABLE_SCC_EXT + if (m_param->bEnableSCC) + numPocTotalCurr++; +#endif int cIdx = 0; for (i = 0; i < numPocStCurr0; i++, cIdx++) rpsCurrList0cIdx = refPicSetStCurr0i; +#if ENABLE_MULTIVIEW + if (m_param->numViews > 1) + for (i = 0; i < refPicSetInterLayer0.size(); i++, cIdx++) + rpsCurrList0cIdx = refPicSetInterLayer0.getPOC(m_poc, 0); +#endif + for (i = 0; i < numPocStCurr1; i++, cIdx++) rpsCurrList0cIdx = refPicSetStCurr1i; for (i = 0; i < numPocLtCurr; i++, cIdx++) rpsCurrList0cIdx = refPicSetLtCurri; +#if ENABLE_MULTIVIEW + if (m_param->numViews > 1) + for (i = 0; i < refPicSetInterLayer1.size(); i++, cIdx++) + rpsCurrList0cIdx = refPicSetInterLayer1.getPOC(m_poc, 0); +#endif + +#if ENABLE_SCC_EXT + if (m_param->bEnableSCC) + rpsCurrList0cIdx++ = picList.getPOC(m_poc); +#endif + X265_CHECK(cIdx == numPocTotalCurr, "RPS index check fail\n"); if (m_sliceType == B_SLICE) @@ -95,12 +187,29 @@ for (i = 0; i < numPocStCurr1; i++, cIdx++) rpsCurrList1cIdx = refPicSetStCurr1i; +#if ENABLE_MULTIVIEW + if (m_param->numViews > 1) + for (i = 0; i < refPicSetInterLayer1.size(); i++, cIdx++) + rpsCurrList1cIdx = refPicSetInterLayer1.getPOC(m_poc, 0); +#endif + for (i = 0; i < numPocStCurr0; i++, cIdx++) rpsCurrList1cIdx = refPicSetStCurr0i; for (i = 0; i < numPocLtCurr; i++, cIdx++) rpsCurrList1cIdx = refPicSetLtCurri; +#if ENABLE_MULTIVIEW + if (m_param->numViews > 1) + for (i = 0; i < refPicSetInterLayer0.size(); i++, cIdx++) + rpsCurrList1cIdx = refPicSetInterLayer0.getPOC(m_poc, 0); +#endif + +#if ENABLE_SCC_EXT + if (m_param->bEnableSCC) + rpsCurrList1cIdx++ = picList.getPOC(m_poc); +#endif + X265_CHECK(cIdx == numPocTotalCurr, "RPS index check fail\n"); } @@ -109,8 +218,18 @@ cIdx = rIdx % numPocTotalCurr; X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n"); m_refFrameList0rIdx = rpsCurrList0cIdx; +#if ENABLE_MULTIVIEW + m_refFrameList0rIdx = rpsCurrList0cIdx; +#endif } +#if ENABLE_SCC_EXT + if (m_param->bEnableSCC && numPocTotalCurr > m_numRefIdx0) + { + m_refFrameList0m_numRefIdx0 - 1 = picList.getPOC(m_poc); + } +#endif +
View file
x265_3.6.tar.gz/source/common/slice.h -> x265_4.0.tar.gz/source/common/slice.h
Changed
@@ -73,7 +73,11 @@ MAIN10 = 2, MAINSTILLPICTURE = 3, MAINREXT = 4, - HIGHTHROUGHPUTREXT = 5 + HIGHTHROUGHPUTREXT = 5, + MULTIVIEWMAIN = 6, + SCALABLEMAIN = 7, + SCALABLEMAIN10 = 8, + MAINSCC = 9 }; } @@ -106,7 +110,7 @@ struct ProfileTierLevel { - int profileIdc; + int profileIdcMAX_LAYERS; int levelIdc; uint32_t minCrForLevel; uint32_t maxLumaSrForLevel; @@ -159,6 +163,27 @@ uint32_t numReorderPicsMAX_T_LAYERS; uint32_t maxDecPicBufferingMAX_T_LAYERS; uint32_t maxLatencyIncreaseMAX_T_LAYERS; + int m_numLayers; + int m_numViews; + bool vps_extension_flag; + +#if (ENABLE_ALPHA || ENABLE_MULTIVIEW) + bool splitting_flag; + int m_scalabilityMaskMAX_VPS_NUM_SCALABILITY_TYPES; + int scalabilityTypes; + uint8_t m_dimensionIdLenMAX_VPS_NUM_SCALABILITY_TYPES; + uint8_t m_dimensionIdMAX_VPS_LAYER_ID_PLUS1MAX_VPS_NUM_SCALABILITY_TYPES; + bool m_nuhLayerIdPresentFlag; + uint8_t m_layerIdInNuhMAX_VPS_LAYER_ID_PLUS1; + uint8_t m_layerIdInVpsMAX_VPS_LAYER_ID_PLUS1; + int m_viewIdLen; + int m_vpsNumLayerSetsMinus1; + int m_numLayersInIdList1023; +#endif + +#if ENABLE_MULTIVIEW + int m_layerIdIncludedFlag; +#endif }; struct Window @@ -252,6 +277,13 @@ Window conformanceWindow; VUI vuiParameters; + bool sps_extension_flag; + +#if ENABLE_MULTIVIEW + int setSpsExtOrMaxSubLayersMinus1; + int maxViews; + bool vui_parameters_present_flag; +#endif SPS() { @@ -290,6 +322,11 @@ int numRefIdxDefault2; bool pps_slice_chroma_qp_offsets_present_flag; + + bool pps_extension_flag; + int maxViews; + + int profileIdc; }; struct WeightParam @@ -339,6 +376,7 @@ NalUnitType m_nalUnitType; SliceType m_sliceType; + SliceType m_origSliceType; int m_sliceQp; int m_chromaQpOffset2; int m_poc; @@ -365,6 +403,13 @@ int m_fieldNum; Frame* m_mcstfRefFrameList2MAX_MCSTF_TEMPORAL_WINDOW_LENGTH; +#if ENABLE_SCC_EXT + Frame* m_lastEncPic; + bool m_bLMvdL1Zero; + bool m_useIntegerMv; +#endif + bool m_bTemporalMvp; + Slice() { m_lastIDR = 0; @@ -380,11 +425,23 @@ m_rpsIdx = -1; m_chromaQpOffset0 = m_chromaQpOffset1 = 0; m_fieldNum = 0; +#if ENABLE_SCC_EXT + m_lastEncPic = NULL; + m_useIntegerMv = false; +#endif + m_bTemporalMvp = false; } void disableWeights(); - void setRefPicList(PicList& picList); + void setRefPicList(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1, int viewId); +#if ENABLE_MULTIVIEW + void createInterLayerReferencePictureSet(PicList& picList, PicList& refPicSetInterLayer0, PicList& refPicSetInterLayer1); +#endif + +#if ENABLE_SCC_EXT + bool isOnlyCurrentPictureAsReference() const; +#endif bool getRapPicFlag() const {
View file
x265_3.6.tar.gz/source/common/threadpool.cpp -> x265_4.0.tar.gz/source/common/threadpool.cpp
Changed
@@ -669,7 +669,11 @@ else if (cpuCount >= 16) p->frameNumThreads = 4; else if (cpuCount >= 8) +#if _WIN32 && X265_ARCH_ARM64 + p->frameNumThreads = cpuCount; +#else p->frameNumThreads = 3; +#endif else if (cpuCount >= 4) p->frameNumThreads = 2; else
View file
x265_3.6.tar.gz/source/common/vec/vec-primitives.cpp -> x265_4.0.tar.gz/source/common/vec/vec-primitives.cpp
Changed
@@ -59,7 +59,7 @@ void setupIntrinsicDCT_sse41(EncoderPrimitives&); /* Use primitives for the best available vector architecture */ -void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask) +void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask) { #ifdef HAVE_SSE3 if (cpuMask & X265_CPU_SSE3)
View file
x265_3.6.tar.gz/source/common/wavefront.cpp -> x265_4.0.tar.gz/source/common/wavefront.cpp
Changed
@@ -58,6 +58,11 @@ x265_free((void*)m_externalDependencyBitmap); } +void WaveFront::setLayerId(int layer) +{ + m_sLayerId = layer; +} + void WaveFront::clearEnabledRowMask() { memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords); @@ -103,7 +108,7 @@ if (ATOMIC_AND(&m_internalDependencyBitmap[w], ~bit) & bit) { /* we cleared the bit, we get to process the row */ - processRow(w * 32 + id, threadId); + processRow(w * 32 + id, threadId, m_sLayerId); m_helpWanted = true; return; /* check for a higher priority task */ }
View file
x265_3.6.tar.gz/source/common/wavefront.h -> x265_4.0.tar.gz/source/common/wavefront.h
Changed
@@ -52,6 +52,8 @@ int m_numRows; + int m_sLayerId; + protected: uint32_t *m_row_to_idx; uint32_t *m_idx_to_row; @@ -95,7 +97,9 @@ // Start or resume encode processing of this row, must be implemented by // derived classes. - virtual void processRow(int row, int threadId) = 0; + virtual void processRow(int row, int threadId, int layer) = 0; + + void setLayerId(int layer); }; } // end namespace X265_NS
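The WaveFront change above makes the row callback layer-aware: processRow() now receives the layer index that was previously stored through setLayerId(). A minimal sketch of what a derived row processor looks like after this change; only processRow()'s new signature and setLayerId() come from the diff, the class name and the bookkeeping helper are hypothetical:

    // assumes #include "wavefront.h" and the X265_NS namespace
    class LayerRowProc : public WaveFront   // hypothetical subclass for illustration
    {
    public:
        void startLayer(int layer)
        {
            setLayerId(layer);   // stored in m_sLayerId, handed back by findJob()
            enableAllRows();
            enqueueRow(0);       // workers will now be called as processRow(0, tid, layer)
        }

    protected:
        void processRow(int row, int threadId, int layer) override
        {
            (void)threadId;
            // per-row work now knows which layer (base, alpha, second view) it
            // belongs to; the real FrameEncoder indexes its per-layer frame state here
            markRowDone(row, layer);
        }

        void markRowDone(int, int) {}    // placeholder so the sketch stands alone
    };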
View file
x265_3.6.tar.gz/source/encoder/analysis.cpp -> x265_4.0.tar.gz/source/encoder/analysis.cpp
Changed
@@ -223,7 +223,12 @@ } ProfileCUScope(ctu, totalCTUTime, totalCTUs); - if (m_slice->m_sliceType == I_SLICE) +#if ENABLE_SCC_EXT + memset(m_ibc.m_BVs, 0, sizeof(m_ibc.m_BVs)); + memset(m_ibc.m_lastIntraBCMv, 0, sizeof(m_ibc.m_lastIntraBCMv)); + m_ibc.m_numBV16s = 0; m_ibc.m_numBVs = 0; +#endif + if (m_slice->m_sliceType == I_SLICE || (m_param->bEnableSCC && (m_slice->m_numRefIdx0 == 1) && m_slice->m_refPOCList00 == m_slice->m_poc)) { x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData; if (m_param->analysisLoadReuseLevel > 1) @@ -233,7 +238,11 @@ memcpy(ctu.m_partSize, &intraDataCTU->partSizesctu.m_cuAddr * numPartition, sizeof(char) * numPartition); memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModesctu.m_cuAddr * numPartition, sizeof(uint8_t) * numPartition); } +#if ENABLE_SCC_EXT + compressIntraCU(ctu, cuGeom, qp, &m_ibc); +#else compressIntraCU(ctu, cuGeom, qp); +#endif } else { @@ -271,7 +280,7 @@ { /* In RD Level 0/1, copy source pixels into the reconstructed block so * they are available for intra predictions */ - m_modeDepth0.fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0); + m_modeDepth0.fencYuv.copyToPicYuv(*m_frame->m_reconPic0, ctu.m_cuAddr, 0); compressInterCU_rd0_4(ctu, cuGeom, qp); @@ -304,7 +313,11 @@ else if (m_param->rdLevel <= 4) compressInterCU_rd0_4(ctu, cuGeom, qp); else +#if ENABLE_SCC_EXT + compressInterCU_rd5_6(ctu, cuGeom, qp, &m_ibc); +#else compressInterCU_rd5_6(ctu, cuGeom, qp); +#endif } if (m_param->bEnableRdRefine || m_param->bOptCUDeltaQP) @@ -508,15 +521,22 @@ /* Copy best data to encData CTU and recon */ md.bestMode->cu.copyToPic(depth); - md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx); + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic0, parentCTU.m_cuAddr, cuGeom.absPartIdx); } +#if ENABLE_SCC_EXT +uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc) +#else uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp) +#endif { uint32_t depth = cuGeom.depth; ModeDepth& md = m_modeDepthdepth; md.bestMode = NULL; + MV iMVCandList410; + memset(iMVCandList, 0, sizeof(MV) * 4 * 10); + bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); @@ -567,6 +587,43 @@ checkBestMode(md.predPRED_INTRA_NxN, depth); } +#if ENABLE_SCC_EXT + bool intraBlockCopyFastSearch = (m_param->bEnableSCC == 1) ? 
true : false, bUse1DSearchFor8x8 = false; + if (m_param->bEnableSCC) + { + md.predPRED_MERGE_IBC.cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv); + checkRDCostIntraBCMerge2Nx2N(md.predPRED_MERGE_IBC, cuGeom); + + md.predPRED_IBC_2Nx2N.cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv); + checkIntraBC_rd5_6(md.predPRED_IBC_2Nx2N, cuGeom, SIZE_2Nx2N, false, bUse1DSearchFor8x8, *ibc); + checkBestMode(md.predPRED_IBC_2Nx2N, depth); + + if (intraBlockCopyFastSearch) + { + if ((int)depth == m_slice->m_sps->log2DiffMaxMinCodingBlockSize) + { + md.predPRED_IBC_Nx2N.cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv); + checkIntraBC_rd5_6(md.predPRED_IBC_Nx2N, cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandListSIZE_Nx2N + 8)); + checkBestMode(md.predPRED_IBC_Nx2N, depth); + + md.predPRED_IBC_2NxN.cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv); + checkIntraBC_rd5_6(md.predPRED_IBC_2NxN, cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandListSIZE_2NxN + 8)); + checkBestMode(md.predPRED_IBC_2NxN, depth); + } + } + else + { + md.predPRED_IBC_2NxN.cu.initSubCU(parentCTU, cuGeom, qp); + checkIntraBC_rd5_6(md.predPRED_IBC_2NxN, cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandListSIZE_2NxN + 8)); + checkBestMode(md.predPRED_IBC_2NxN, depth); + + md.predPRED_IBC_Nx2N.cu.initSubCU(parentCTU, cuGeom, qp); + checkIntraBC_rd5_6(md.predPRED_IBC_Nx2N, cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandListSIZE_Nx2N + 8)); + checkBestMode(md.predPRED_IBC_Nx2N, depth); + } + } +#endif + if (m_bTryLossless) tryLossless(cuGeom); @@ -574,6 +631,91 @@ addSplitFlagCost(*md.bestMode, cuGeom.depth); } +#if ENABLE_SCC_EXT + // If Intra BC keep last coded Mv + if (md.bestMode && md.bestMode->cu.isInter(0)) + { + MVField mvField; + const CUData* cu = &md.bestMode->cu; + md.bestMode->cu.getMvField(cu, 0, 0, mvField); + int iRefIdxFirst = mvField.refIdx; + md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField); + int iRefIdxLast = mvField.refIdx; + bool isIntraBCFirst = (iRefIdxFirst >= 0) ? cu->m_slice->m_refFrameList0iRefIdxFirst->m_poc == cu->m_slice->m_poc : false; + bool isIntraBCLast = (iRefIdxLast >= 0) ? 
cu->m_slice->m_refFrameList0iRefIdxLast->m_poc == cu->m_slice->m_poc : false; + + if (isIntraBCFirst || isIntraBCLast) + { + if (cu->m_partSize0 == SIZE_2Nx2N) + { + md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField); + if (mvField.mv != cu->m_lastIntraBCMv0) + { + md.bestMode->cu.m_lastIntraBCMv1 = cu->m_lastIntraBCMv0; + md.bestMode->cu.m_lastIntraBCMv0 = mvField.mv; + } + } + else if (cu->m_partSize0 == SIZE_2NxN || cu->m_partSize0 == SIZE_Nx2N) + { + // mixed PU, only one partition is IntraBC coded + if (isIntraBCFirst != isIntraBCLast) + { + if (isIntraBCFirst) + { + // Part 0 + md.bestMode->cu.getMvField(cu, 0, 0, mvField); + if (mvField.mv != cu->m_lastIntraBCMv0) + { + md.bestMode->cu.m_lastIntraBCMv1 = cu->m_lastIntraBCMv0; + md.bestMode->cu.m_lastIntraBCMv0 = mvField.mv; + } + } + else if (isIntraBCLast) + { + // Part 1 + md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField); + if (mvField.mv != cu->m_lastIntraBCMv0) + { + md.bestMode->cu.m_lastIntraBCMv1 = cu->m_lastIntraBCMv0; + md.bestMode->cu.m_lastIntraBCMv0 = mvField.mv; + } + } + } + else // normal IntraBC CU + { + // Part 0 + md.bestMode->cu.getMvField(cu, 0, 0, mvField); + if (mvField.mv != cu->m_lastIntraBCMv0) + { + md.bestMode->cu.m_lastIntraBCMv1 = cu->m_lastIntraBCMv0; + md.bestMode->cu.m_lastIntraBCMv0 = mvField.mv; + } + // Part 1 + md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField); + if (mvField.mv != cu->m_lastIntraBCMv0) + { + md.bestMode->cu.m_lastIntraBCMv1 = cu->m_lastIntraBCMv0; + md.bestMode->cu.m_lastIntraBCMv0 = mvField.mv; + } + } + } + else + { + // NxN + for (int part = 0; part < 4; part++) + { + md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 4 + part, 0, mvField); + if (mvField.mv != cu->m_lastIntraBCMv0) + { + md.bestMode->cu.m_lastIntraBCMv1 = cu->m_lastIntraBCMv0; + md.bestMode->cu.m_lastIntraBCMv0 = mvField.mv; + } + } + }
View file
x265_3.6.tar.gz/source/encoder/analysis.h -> x265_4.0.tar.gz/source/encoder/analysis.h
Changed
@@ -75,6 +75,14 @@ PRED_nRx2N, PRED_INTRA_NxN, /* 4x4 intra PU blocks for 8x8 CU */ PRED_LOSSLESS, /* lossless encode of best mode */ +#if ENABLE_SCC_EXT + PRED_IBC_2Nx2N, + PRED_IBC_Nx2N, + PRED_IBC_2NxN, + PRED_MIXED_IBC_NX2N, + PRED_MIXED_IBC_2NXN, + PRED_MERGE_IBC, +#endif MAX_PRED_TYPES }; @@ -113,6 +121,7 @@ bool m_modeFlag[2]; bool m_checkMergeAndSkipOnly[2]; + IBC m_ibc; Analysis(); bool create(ThreadLocalData* tld); @@ -120,6 +129,7 @@ Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext); int32_t loadTUDepth(CUGeom cuGeom, CUData parentCTU); + protected: /* Analysis data for save/load mode, writes/reads data based on absPartIdx */ x265_analysis_inter_data* m_reuseInterDataCTU; @@ -162,12 +172,20 @@ void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp); /* full analysis for an I-slice CU */ +#if ENABLE_SCC_EXT + uint64_t compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc = NULL); +#else uint64_t compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp); +#endif /* full analysis for a P or B slice CU */ uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp); SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp); +#if ENABLE_SCC_EXT + SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc = NULL); +#else SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp); +#endif void recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t origqp = -1); @@ -177,10 +195,15 @@ /* measure inter options */ void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]); - void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]); + void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2], MV* iMVCandList = NULL); void checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom); +#if ENABLE_SCC_EXT + void checkRDCostIntraBCMerge2Nx2N(Mode& merge, const CUGeom& cuGeom); + void checkIntraBC_rd5_6(Mode& intraBCMode, const CUGeom& cuGeom, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc, MV* iMVCandList = NULL); +#endif + /* encode current bestMode losslessly, pick best RD cost */ void tryLossless(const CUGeom& cuGeom);
View file
x265_3.6.tar.gz/source/encoder/api.cpp -> x265_4.0.tar.gz/source/encoder/api.cpp
Changed
@@ -20,7 +20,6 @@ * This program is also available under a commercial proprietary license. * For more information, contact us at license @ x265.com. *****************************************************************************/ - #include "common.h" #include "bitstream.h" #include "param.h" @@ -185,7 +184,7 @@ // will detect and set profile/tier/level in VPS determineLevel(*param, encoder->m_vps); - if (!param->bAllowNonConformance && encoder->m_vps.ptl.profileIdc == Profile::NONE) + if (!param->bAllowNonConformance && encoder->m_vps.ptl.profileIdc0 == Profile::NONE) { x265_log(param, X265_LOG_INFO, "non-conformant bitstreams not allowed (--allow-non-conformance)\n"); goto fail; @@ -357,11 +356,11 @@ VPS saveVPS; memcpy(&saveVPS.ptl, &encoder->m_vps.ptl, sizeof(saveVPS.ptl)); determineLevel(*encoder->m_latestParam, encoder->m_vps); - if (saveVPS.ptl.profileIdc != encoder->m_vps.ptl.profileIdc || saveVPS.ptl.levelIdc != encoder->m_vps.ptl.levelIdc + if (saveVPS.ptl.profileIdc0 != encoder->m_vps.ptl.profileIdc0 || saveVPS.ptl.levelIdc != encoder->m_vps.ptl.levelIdc || saveVPS.ptl.tierFlag != encoder->m_vps.ptl.tierFlag) { x265_log(encoder->m_param, X265_LOG_WARNING, "Profile/Level/Tier has changed from %d/%d/%s to %d/%d/%s.Cannot reconfigure rate-control.\n", - saveVPS.ptl.profileIdc, saveVPS.ptl.levelIdc, saveVPS.ptl.tierFlag ? "High" : "Main", encoder->m_vps.ptl.profileIdc, + saveVPS.ptl.profileIdc0, saveVPS.ptl.levelIdc, saveVPS.ptl.tierFlag ? "High" : "Main", encoder->m_vps.ptl.profileIdc0, encoder->m_vps.ptl.levelIdc, encoder->m_vps.ptl.tierFlag ? "High" : "Main"); x265_copy_params(encoder->m_latestParam, &save); memcpy(&encoder->m_vps.ptl, &saveVPS.ptl, sizeof(saveVPS.ptl)); @@ -406,7 +405,7 @@ return 0; } -int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out) +int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture **pic_out) { if (!enc) return -1; @@ -602,7 +601,10 @@ *pi_nal = 0; if (numEncoded && encoder->m_param->csvLogLevel && encoder->m_outputCount >= encoder->m_latestParam->chunkStart) - x265_csvlog_frame(encoder->m_param, pic_out); + { + for (int layer = 0; layer < encoder->m_param->numLayers; layer++) + x265_csvlog_frame(encoder->m_param, pic_outlayer); + } if (numEncoded < 0) encoder->m_aborted = true; @@ -653,11 +655,14 @@ if (enc) { Encoder *encoder = static_cast<Encoder*>(enc); - x265_stats stats; - encoder->fetchStats(&stats, sizeof(stats)); + x265_stats statsMAX_LAYERS; int padx = encoder->m_sps.conformanceWindow.rightOffset; int pady = encoder->m_sps.conformanceWindow.bottomOffset; - x265_csvlog_encode(encoder->m_param, &stats, padx, pady, argc, argv); + for (int layer = 0; layer < encoder->m_param->numLayers; layer++) + { + encoder->fetchStats(stats, sizeof(statslayer), layer); + x265_csvlog_encode(encoder->m_param, &stats0, padx, pady, argc, argv); + } } } @@ -744,7 +749,7 @@ if (!enc) return -1; Encoder *encoder = static_cast<Encoder*>(enc); - if (!encoder->copySlicetypePocAndSceneCut(slicetype, poc, sceneCut)) + if (!encoder->copySlicetypePocAndSceneCut(slicetype, poc, sceneCut, 0)) return 0; return -1; } @@ -1295,7 +1300,7 @@ { if (param->csvLogLevel) { - fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, "); + fprintf(csvfp, "Layer , Encode Order, Type, POC, QP, Bits, Scenecut, "); if (!!param->bEnableTemporalSubLayers) fprintf(csvfp, "Temporal Sub Layer ID, "); if (param->csvLogLevel >= 2) @@ -1409,7 +1414,7 @@ return; const 
x265_frame_stats* frameStats = &pic->frameData; - fprintf(param->csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, + fprintf(param->csvfpt, "%d, %d, %c-SLICE, %4d, %2.2lf, %10d, %d,", pic->layerID, frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits, frameStats->bScenecut); if (!!param->bEnableTemporalSubLayers) fprintf(param->csvfpt, "%d,", frameStats->tLayer); @@ -1806,6 +1811,219 @@ return ret; } +static enum VmafOutputFormat log_fmt_map(const char *log_fmt) +{ + if (log_fmt) { + if (!strcmp(log_fmt, "xml")) + return VMAF_OUTPUT_FORMAT_XML; + if (!strcmp(log_fmt, "json")) + return VMAF_OUTPUT_FORMAT_JSON; + if (!strcmp(log_fmt, "csv")) + return VMAF_OUTPUT_FORMAT_CSV; + if (!strcmp(log_fmt, "sub")) + return VMAF_OUTPUT_FORMAT_SUB; + } + + return VMAF_OUTPUT_FORMAT_NONE; +} + +static enum VmafPoolingMethod pool_method_map(const char *pool_method) +{ + if (pool_method) { + if (!strcmp(pool_method, "min")) + return VMAF_POOL_METHOD_MIN; + if (!strcmp(pool_method, "mean")) + return VMAF_POOL_METHOD_MEAN; + if (!strcmp(pool_method, "harmonic_mean")) + return VMAF_POOL_METHOD_HARMONIC_MEAN; + } + return VMAF_POOL_METHOD_MEAN; +} + +static enum VmafPixelFormat pix_fmt_map(const char *fmt) +{ + if (fmt) { + if (!strcmp(fmt, "yuv420p") || !strcmp(fmt, "yuv420p10le") || !strcmp(fmt, "yuv420p12le") || !strcmp(fmt, "yuv420p16le")) + return VMAF_PIX_FMT_YUV420P; + if (!strcmp(fmt, "yuv422p") || !strcmp(fmt, "yuv422p10le")) + return VMAF_PIX_FMT_YUV422P; + if (!strcmp(fmt, "yuv444p") || !strcmp(fmt, "yuv444p10le")) + return VMAF_PIX_FMT_YUV444P; + } + return VMAF_PIX_FMT_UNKNOWN; +} + +static void copy_picture(float *src, VmafPicture *dst, unsigned width, unsigned height, int src_stride, unsigned bpc) +{ + const int bytes_per_value = bpc > 8 ? 2 : 1; + const int dst_stride = dst->stride0 / bytes_per_value; + const unsigned b_shift = (bpc > 8) ? (bpc - 8) : 0; + + uint8_t *dst_data = static_cast<uint8_t*>(dst->data0); + + for (unsigned i = 0; i < height; i++) { + if (bpc > 8) { + uint16_t *dst_row = reinterpret_cast<uint16_t*>(dst_data); + for (unsigned j = 0; j < width; j++) { + dst_rowj = static_cast<uint16_t>(srcj * (1 << b_shift)); + } + } else { + for (unsigned j = 0; j < width; j++) { + dst_dataj = static_cast<uint8_t>(srcj); + } + } + src += src_stride / sizeof(float); + dst_data += dst_stride * bytes_per_value; + } +} + +int load_feature(VmafContext *vmaf, const char *feature_name, VmafFeatureDictionary *d) { + int err = vmaf_use_feature(vmaf, feature_name, d); + if (err) { + printf("problem loading feature extractor: %s\n", feature_name); + } + return err; +} + +int compute_vmaf(double* vmaf_score, char* fmt, int width, int height, int bitdepth, int(*read_frame)(float *ref_data, float *main_data, float *temp_data, int stride_byte, void *user_data), + void *user_data, char *model_path, char *log_path, char *log_fmt, int disable_clip, int disable_avx, int enable_transform, int phone_model, int do_psnr, int do_ssim, int do_ms_ssim, + char *pool_method, int n_thread, int n_subsample) +{ + int err = 0; + + VmafConfiguration cfg = { + .log_level = VMAF_LOG_LEVEL_INFO, + .n_threads = n_thread, + .n_subsample = n_subsample, + .cpumask = disable_avx ? 
-1 : 0, + .gpumask = 0, + }; + + VmafContext *vmaf; + err = vmaf_init(&vmaf, cfg); + if (err) { + printf("problem initializing VMAF context\n"); + return -1; + } + + uint64_t flags = VMAF_MODEL_FLAGS_DEFAULT; + if (disable_clip) + flags |= VMAF_MODEL_FLAG_DISABLE_CLIP; + if (enable_transform || phone_model)
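The api.cpp change above turns the last argument of x265_encoder_encode() into x265_picture **pic_out, so callers that want reconstructed pictures or per-frame stats now hand in one output picture per layer. A minimal caller-side sketch, assuming two layers (for example base plus alpha) and that encoder, param, pic_in, p_nal and nal_count are already set up as in a 3.6-era caller; the layer count, storage and use_recon() consumer are illustrative assumptions, not values taken from the diff:

    x265_picture recon_storage[2];
    x265_picture* pic_recon[2];           // one output picture per layer
    for (int i = 0; i < 2; i++)
    {
        x265_picture_init(param, &recon_storage[i]);
        pic_recon[i] = &recon_storage[i];
    }

    int ret = x265_encoder_encode(encoder, &p_nal, &nal_count, &pic_in, pic_recon);
    if (ret > 0)
    {
        // per the csvlog change above, each layer now carries its own frame stats
        for (int layer = 0; layer < 2; layer++)
            use_recon(pic_recon[layer]);  // hypothetical consumer
    }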
View file
x265_3.6.tar.gz/source/encoder/dpb.cpp -> x265_4.0.tar.gz/source/encoder/dpb.cpp
Changed
@@ -53,8 +53,8 @@ FrameData* next = m_frameDataFreeList->m_freeListNext; m_frameDataFreeList->destroy(); - m_frameDataFreeList->m_reconPic->destroy(); - delete m_frameDataFreeList->m_reconPic; + m_frameDataFreeList->m_reconPic0->destroy(); + delete m_frameDataFreeList->m_reconPic0; delete m_frameDataFreeList; m_frameDataFreeList = next; @@ -75,7 +75,7 @@ if (curFrame->m_param->bEnableTemporalFilter) isMCSTFReferenced =!!(curFrame->m_refPicCnt1); - if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced) + if (curFrame->m_valid && !curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders && !isMCSTFReferenced) { curFrame->m_bChromaExtended = false; @@ -95,6 +95,12 @@ // iterator is invalidated by remove, restart scan m_picList.remove(*curFrame); +#if ENABLE_MULTIVIEW + if (curFrame->m_param->numViews > 1 && !curFrame->m_viewId && m_picList.getPOC(curFrame->m_poc, 1) && curFrame == m_picList.getPOC(curFrame->m_poc, 1)->refPicSetInterLayer0.getPOC(curFrame->m_poc, curFrame->m_viewId)) + { + m_picList.getPOC(curFrame->m_poc, 1)->refPicSetInterLayer0.removeSubDPB(*curFrame); + } +#endif iterFrame = m_picList.first(); m_freeList.pushBack(*curFrame); @@ -126,7 +132,8 @@ curFrame->m_prevCtuInfoChange = NULL; } curFrame->m_encData = NULL; - curFrame->m_reconPic = NULL; + for (int i = 0; i < !!curFrame->m_param->bEnableSCC + 1; i++) + curFrame->m_reconPici = NULL; } } } @@ -145,6 +152,11 @@ m_lastIDR = pocCurr; slice->m_lastIDR = m_lastIDR; slice->m_sliceType = IS_X265_TYPE_B(type) ? B_SLICE : (type == X265_TYPE_P) ? P_SLICE : I_SLICE; +#if ENABLE_SCC_EXT + if (slice->m_param->bEnableSCC) slice->m_origSliceType = slice->m_sliceType; + if (slice->m_param->bEnableSCC && IS_X265_TYPE_I(type)) + slice->m_sliceType = P_SLICE; +#endif if (type == X265_TYPE_B) { @@ -177,7 +189,8 @@ m_picList.pushFront(*newFrame); - if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag()) + int layer = slice->m_param->numViews > 1 ? newFrame->m_viewId : (slice->m_param->numScalableLayers > 1) ? newFrame->m_sLayerId : 0; + if (m_bTemporalSublayer && getTemporalLayerNonReferenceFlag(layer)) { switch (slice->m_nalUnitType) { @@ -195,12 +208,13 @@ } } // Do decoding refresh marking if any - decodingRefreshMarking(pocCurr, slice->m_nalUnitType); + decodingRefreshMarking(pocCurr, slice->m_nalUnitType, layer); - computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBufferingnewFrame->m_tempLayer); + uint32_t maxDecBuffer = (slice->m_sps->maxDecPicBufferingnewFrame->m_tempLayer >= 8 && slice->m_param->bEnableSCC) ? 7 : slice->m_sps->maxDecPicBufferingnewFrame->m_tempLayer; + computeRPS(pocCurr, newFrame->m_tempLayer, slice->isIRAP(), &slice->m_rps, maxDecBuffer, layer); bool isTSAPic = ((slice->m_nalUnitType == 2) || (slice->m_nalUnitType == 3)) ? 
true : false; // Mark pictures in m_piclist as unreferenced if they are not included in RPS - applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic); + applyReferencePictureSet(&slice->m_rps, pocCurr, newFrame->m_tempLayer, isTSAPic, layer); if (m_bTemporalSublayer && newFrame->m_tempLayer > 0 @@ -210,9 +224,9 @@ || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_RASL_R) ) { - if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer) || (slice->m_sps->maxTempSubLayers == 1)) + if (isTemporalLayerSwitchingPoint(pocCurr, newFrame->m_tempLayer, layer) || (slice->m_sps->maxTempSubLayers == 1)) { - if (getTemporalLayerNonReferenceFlag()) + if (getTemporalLayerNonReferenceFlag(layer)) { slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_N; } @@ -221,7 +235,7 @@ slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TSA_R; } } - else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer)) + else if (isStepwiseTemporalLayerSwitchingPoint(&slice->m_rps, pocCurr, newFrame->m_tempLayer, layer)) { bool isSTSA = true; int id = newFrame->m_gopOffset % x265_gop_ra_lengthnewFrame->m_gopId; @@ -254,7 +268,7 @@ } if (isSTSA == true) { - if (getTemporalLayerNonReferenceFlag()) + if (getTemporalLayerNonReferenceFlag(layer)) { slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_STSA_N; } @@ -266,12 +280,22 @@ } } +#if ENABLE_MULTIVIEW + if (newFrame->m_viewId) + slice->createInterLayerReferencePictureSet(m_picList, newFrame->refPicSetInterLayer0, newFrame->refPicSetInterLayer1); +#endif + int numRef = slice->m_param->bEnableSCC ? slice->m_rps.numberOfNegativePictures + 1 : slice->m_rps.numberOfNegativePictures; if (slice->m_sliceType != I_SLICE) - slice->m_numRefIdx0 = x265_clip3(1, newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures); + slice->m_numRefIdx0 = x265_clip3(1, newFrame->m_param->maxNumReferences, numRef + newFrame->refPicSetInterLayer0.size() + newFrame->refPicSetInterLayer1.size()); + else + slice->m_numRefIdx0 = X265_MIN(newFrame->m_param->maxNumReferences, numRef); // Ensuring L0 contains just the -ve POC +#if ENABLE_MULTIVIEW || ENABLE_SCC_EXT + if(slice->m_param->numViews > 1 || !!slice->m_param->bEnableSCC) + slice->m_numRefIdx1 = X265_MIN(newFrame->m_param->bBPyramid ? 3 : 2, slice->m_rps.numberOfPositivePictures + newFrame->refPicSetInterLayer0.size() + newFrame->refPicSetInterLayer1.size()); else - slice->m_numRefIdx0 = X265_MIN(newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures); // Ensuring L0 contains just the -ve POC - slice->m_numRefIdx1 = X265_MIN(newFrame->m_param->bBPyramid ? 2 : 1, slice->m_rps.numberOfPositivePictures); - slice->setRefPicList(m_picList); +#endif + slice->m_numRefIdx1 = X265_MIN(newFrame->m_param->bBPyramid ? 2 : 1, slice->m_rps.numberOfPositivePictures); + slice->setRefPicList(m_picList, newFrame->refPicSetInterLayer0, newFrame->refPicSetInterLayer1, layer); X265_CHECK(slice->m_sliceType != B_SLICE || slice->m_numRefIdx1, "B slice without L1 references (non-fatal)\n"); @@ -280,9 +304,29 @@ /* TODO: the lookahead should be able to tell which reference picture * had the least motion residual. 
We should be able to use that here to * select a colocation reference list and index */ - slice->m_colFromL0Flag = false; + + bool bLowDelay = true; + int iCurrPOC = slice->m_poc; + int iRefIdx = 0; + + for (iRefIdx = 0; iRefIdx < slice->m_numRefIdx0 && bLowDelay; iRefIdx++) + { + if (slice->m_refPOCList0iRefIdx > iCurrPOC) + { + bLowDelay = false; + } + } + for (iRefIdx = 0; iRefIdx < slice->m_numRefIdx1 && bLowDelay; iRefIdx++) + { + if (slice->m_refPOCList1iRefIdx > iCurrPOC) + { + bLowDelay = false; + } + } + + slice->m_bCheckLDC = bLowDelay; + slice->m_colFromL0Flag = bLowDelay; slice->m_colRefIdx = 0; - slice->m_bCheckLDC = false; } else { @@ -291,6 +335,59 @@ slice->m_colRefIdx = 0; } + slice->m_bTemporalMvp = slice->m_sps->bTemporalMVPEnabled; +#if ENABLE_SCC_EXT + bool bGPBcheck = false; + if (slice->m_sliceType == B_SLICE) + { + if (slice->m_param->bEnableSCC) + { + if (slice->m_numRefIdx0 - 1 == slice->m_numRefIdx1) + { + bGPBcheck = true; + for (int i = 0; i < slice->m_numRefIdx1; i++) + { + if (slice->m_refPOCList1i != slice->m_refPOCList0i) + { + bGPBcheck = false; + break; + } + } + } + } + else if (slice->m_numRefIdx0 == slice->m_numRefIdx1) + { + bGPBcheck = true; + int i; + for (i = 0; i < slice->m_numRefIdx1; i++)
View file
x265_3.6.tar.gz/source/encoder/dpb.h -> x265_4.0.tar.gz/source/encoder/dpb.h
Changed
@@ -79,13 +79,13 @@ protected: - void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer); + void computeRPS(int curPoc,int tempId, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer, int sLayerId); - void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture); - bool getTemporalLayerNonReferenceFlag(); - void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType); - bool isTemporalLayerSwitchingPoint(int curPoc, int tempId); - bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId); + void applyReferencePictureSet(RPS *rps, int curPoc, int tempId, bool isTSAPicture, int sLayerId); + bool getTemporalLayerNonReferenceFlag(int sLayerId); + void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType, int sLayerId); + bool isTemporalLayerSwitchingPoint(int curPoc, int tempId, int sLayerId); + bool isStepwiseTemporalLayerSwitchingPoint(RPS *rps, int curPoc, int tempId, int sLayerId); NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame); };
View file
x265_3.6.tar.gz/source/encoder/encoder.cpp -> x265_4.0.tar.gz/source/encoder/encoder.cpp
Changed
@@ -134,7 +134,6 @@ m_lookahead = NULL; m_rateControl = NULL; m_dpb = NULL; - m_exportedPic = NULL; m_numDelayedPic = 0; m_outputCount = 0; m_param = NULL; @@ -150,6 +149,8 @@ m_rpsInSpsCount = 0; m_cB = 1.0; m_cR = 1.0; + for (int i = 0; i < MAX_LAYERS; i++) + m_exportedPici = NULL; for (int i = 0; i < X265_MAX_FRAME_THREADS; i++) m_frameEncoderi = NULL; for (uint32_t i = 0; i < DUP_BUFFER; i++) @@ -597,9 +598,9 @@ } } -int Encoder::copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut) +int Encoder::copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut, int sLayer) { - Frame *FramePtr = m_dpb->m_picList.getCurFrame(); + Frame *FramePtr = m_dpb->m_picList.getCurFrame(sLayer); if (FramePtr != NULL) { *slicetype = FramePtr->m_lowres.sliceType; @@ -618,31 +619,36 @@ { if (!(IS_X265_TYPE_I(sliceType))) { - Frame *framePtr = m_dpb->m_picList.getPOC(poc); + Frame *framePtr = m_dpb->m_picList.getPOC(poc, 0); if (framePtr != NULL) { for (int j = 0; j < framePtr->m_encData->m_slice->m_numRefIdx0; j++) // check only for --ref=n number of frames. { - if (framePtr->m_encData->m_slice->m_refFrameList0j && framePtr->m_encData->m_slice->m_refFrameList0j->m_reconPic != NULL) + if (framePtr->m_encData->m_slice->m_refFrameList0j && framePtr->m_encData->m_slice->m_refFrameList0j->m_reconPic0 != NULL) { int l0POC = framePtr->m_encData->m_slice->m_refFrameList0j->m_poc; pocL0j = l0POC; - Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC); - while (l0Fp->m_reconRowFlagl0Fp->m_numRows - 1.get() == 0) - l0Fp->m_reconRowFlagl0Fp->m_numRows - 1.waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */ - l0j = l0Fp->m_reconPic; + Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC, 0); +#if ENABLE_SCC_EXT + if (l0POC != poc) +#endif + { + while (l0Fp->m_reconRowFlagl0Fp->m_numRows - 1.get() == 0) + l0Fp->m_reconRowFlagl0Fp->m_numRows - 1.waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */ + } + l0j = l0Fp->m_reconPic0; } } for (int j = 0; j < framePtr->m_encData->m_slice->m_numRefIdx1; j++) // check only for --ref=n number of frames. { - if (framePtr->m_encData->m_slice->m_refFrameList1j && framePtr->m_encData->m_slice->m_refFrameList1j->m_reconPic != NULL) + if (framePtr->m_encData->m_slice->m_refFrameList1j && framePtr->m_encData->m_slice->m_refFrameList1j->m_reconPic0 != NULL) { int l1POC = framePtr->m_encData->m_slice->m_refFrameList1j->m_poc; pocL1j = l1POC; - Frame* l1Fp = m_dpb->m_picList.getPOC(l1POC); + Frame* l1Fp = m_dpb->m_picList.getPOC(l1POC, 0); while (l1Fp->m_reconRowFlagl1Fp->m_numRows - 1.get() == 0) l1Fp->m_reconRowFlagl1Fp->m_numRows - 1.waitForChange(0); /* If recon is not ready, current frame encoder has to wait. 
*/ - l1j = l1Fp->m_reconPic; + l1j = l1Fp->m_reconPic0; } } } @@ -762,7 +768,7 @@ uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; - Frame* curFrame = m_dpb->m_picList.getPOC(poc); + Frame* curFrame = m_dpb->m_picList.getPOC(poc, 0); if (curFrame != NULL) { curFrame->m_analysisData = (*analysis_data); @@ -861,10 +867,13 @@ X265_FREE(m_rdCost); X265_FREE(m_trainingCount); } - if (m_exportedPic) + for (int layer = 0; layer < m_param->numLayers; layer++) { - ATOMIC_DEC(&m_exportedPic->m_countRefEncoders); - m_exportedPic = NULL; + if (m_exportedPiclayer) + { + ATOMIC_DEC(&m_exportedPiclayer->m_countRefEncoders); + m_exportedPiclayer = NULL; + } } if (m_param->bEnableFrameDuplication) @@ -1359,6 +1368,10 @@ memcpy(dest->planes0, src->planes0, src->framesize * sizeof(char)); dest->planes1 = (char*)dest->planes0 + src->stride0 * src->height; dest->planes2 = (char*)dest->planes1 + src->stride1 * (src->height >> x265_cli_cspssrc->colorSpace.height1); +#if ENABLE_ALPHA + if(m_param->bEnableAlpha) + dest->planes3 = (char*)dest->planes2 + src->stride2 * (src->height >> x265_cli_cspssrc->colorSpace.height2); +#endif } bool Encoder::isFilterThisframe(uint8_t sliceTypeConfig, int curSliceType) @@ -1458,7 +1471,7 @@ * returns 0 if no frames are currently available for output * 1 if frame was output, m_nalList contains access unit * negative on malloc error or abort */ -int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) +int Encoder::encode(const x265_picture* pic_in, x265_picture** pic_out) { #if CHECKED_BUILD || _DEBUG if (g_checkFailures) @@ -1470,19 +1483,21 @@ if (m_aborted) return -1; - const x265_picture* inputPic = NULL; + const x265_picture* inputPicMAX_VIEWS = { NULL }; static int written = 0, read = 0; bool dontRead = false; bool dropflag = false; - if (m_exportedPic) + if (*m_exportedPic) { if (!m_param->bUseAnalysisFile && m_param->analysisSave) - x265_free_analysis_data(m_param, &m_exportedPic->m_analysisData); - - ATOMIC_DEC(&m_exportedPic->m_countRefEncoders); + x265_free_analysis_data(m_param, &m_exportedPic0->m_analysisData); - m_exportedPic = NULL; + for (int i = 0; i < m_param->numLayers; i++) + { + ATOMIC_DEC(&m_exportedPici->m_countRefEncoders); + m_exportedPici = NULL; + } m_dpb->recycleUnreferenced(); if (m_param->bEnableTemporalFilter) @@ -1566,143 +1581,194 @@ if (read < written) { - inputPic = m_dupBuffer0->dupPic; + inputPic0 = m_dupBuffer0->dupPic; read++; } } else - inputPic = pic_in; + { + for (int view = 0; view < m_param->numViews; view++) + inputPicview = pic_in + view; + } - Frame *inFrame; - x265_param *p = (m_reconfigure || m_reconfigureRc) ? m_latestParam : m_param; - if (m_dpb->m_freeList.empty()) - { - inFrame = new Frame; - inFrame->m_encodeStartTime = x265_mdate(); - if (inFrame->create(p, inputPic->quantOffsets)) - { - /* the first PicYuv created is asked to generate the CU and block unit offset - * arrays which are then shared with all subsequent PicYuv (orig and recon) - * allocated by this top level encoder */ - if (m_sps.cuOffsetY) - { - inFrame->m_fencPic->m_cuOffsetY = m_sps.cuOffsetY; - inFrame->m_fencPic->m_buOffsetY = m_sps.buOffsetY; - if (m_param->internalCsp != X265_CSP_I400) - { - inFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC; - inFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC; - } - } - else + x265_param* p = (m_reconfigure || m_reconfigureRc) ? 
m_latestParam : m_param; + Frame* inFrameMAX_LAYERS; + for (int layer = 0; layer < m_param->numLayers; layer++) + { + if (m_dpb->m_freeList.empty()) + { + inFramelayer = new Frame; + inFramelayer->m_encodeStartTime = x265_mdate(); +#if ENABLE_MULTIVIEW + inFramelayer->m_viewId = m_param->numViews > 1 ? layer : 0; +#endif +#if ENABLE_ALPHA + inFramelayer->m_sLayerId = m_param->numScalableLayers > 1 ? layer : 0;
View file
x265_3.6.tar.gz/source/encoder/encoder.h -> x265_4.0.tar.gz/source/encoder/encoder.h
Changed
@@ -202,7 +202,7 @@ ThreadPool* m_threadPool; FrameEncoder* m_frameEncoder[X265_MAX_FRAME_THREADS]; DPB* m_dpb; - Frame* m_exportedPic; + Frame* m_exportedPic[MAX_LAYERS]; FILE* m_analysisFileIn; FILE* m_analysisFileOut; FILE* m_naluFile; @@ -217,10 +217,10 @@ bool m_externalFlush; /* Collect statistics globally */ - EncStats m_analyzeAll; - EncStats m_analyzeI; - EncStats m_analyzeP; - EncStats m_analyzeB; + EncStats m_analyzeAll[MAX_LAYERS]; + EncStats m_analyzeI[MAX_LAYERS]; + EncStats m_analyzeP[MAX_LAYERS]; + EncStats m_analyzeB[MAX_LAYERS]; VPS m_vps; SPS m_sps; PPS m_pps; @@ -300,7 +300,7 @@ void stopJobs(); void destroy(); - int encode(const x265_picture* pic, x265_picture *pic_out); + int encode(const x265_picture* pic, x265_picture **pic_out); int reconfigureParam(x265_param* encParam, x265_param* param); @@ -308,7 +308,7 @@ void copyCtuInfo(x265_ctu_info_t** frameCtuInfo, int poc); - int copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut); + int copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut, int sLayer); int getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc, int* pocL0, int* pocL1); @@ -320,7 +320,7 @@ void getEndNalUnits(NALList& list, Bitstream& bs); - void fetchStats(x265_stats* stats, size_t statsSizeBytes); + void fetchStats(x265_stats* stats, size_t statsSizeBytes, int layer = 0); void printSummary(); @@ -352,7 +352,7 @@ void copyDistortionData(x265_analysis_data* analysis, FrameData &curEncData); - void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc); + void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc, int layer); int validateAnalysisData(x265_analysis_validate* param, int readWriteFlag);
View file
x265_3.6.tar.gz/source/encoder/entropy.cpp -> x265_4.0.tar.gz/source/encoder/entropy.cpp
Changed
@@ -230,11 +230,12 @@ X265_CHECK(sizeof(m_contextState) >= sizeof(m_contextState0) * MAX_OFF_CTX_MOD, "context state table is too small\n"); } -void Entropy::codeVPS(const VPS& vps) +void Entropy::codeVPS(const VPS& vps, const SPS& sps) { + int maxLayers = (vps.m_numLayers > 1 || vps.m_numViews > 1) + 1; WRITE_CODE(0, 4, "vps_video_parameter_set_id"); WRITE_CODE(3, 2, "vps_reserved_three_2bits"); - WRITE_CODE(0, 6, "vps_reserved_zero_6bits"); + WRITE_CODE(maxLayers - 1, 6, "vps_reserved_zero_6bits"); WRITE_CODE(vps.maxTempSubLayers - 1, 3, "vps_max_sub_layers_minus1"); WRITE_FLAG(vps.maxTempSubLayers == 1, "vps_temporal_id_nesting_flag"); WRITE_CODE(0xffff, 16, "vps_reserved_ffff_16bits"); @@ -250,50 +251,320 @@ WRITE_UVLC(vps.maxLatencyIncreasei + 1, "vps_max_latency_increase_plus1i"); } +#if ENABLE_ALPHA || ENABLE_MULTIVIEW + if (vps.m_numLayers > 1 || vps.m_numViews > 1) + { + WRITE_CODE(maxLayers - 1, 6, "vps_max_nuh_reserved_zero_layer_id"); + WRITE_UVLC(vps.m_vpsNumLayerSetsMinus1, "vps_num_layer_sets_minus1"); + for (int i = 1; i <= vps.m_vpsNumLayerSetsMinus1; i++) + { +#if ENABLE_MULTIVIEW + if (vps.m_numViews > 1) + { + for (int j = 0; j < vps.m_numViews; j++) + { + WRITE_FLAG(1, "layer_id_included_flagopsIdxi"); + } + } +#endif +#if ENABLE_ALPHA + if (vps.m_numLayers > 1) + { + for (int j = 0; j < vps.m_numLayers; j++) + { + WRITE_FLAG(1, "layer_id_included_flagopsIdxi"); + } + } +#endif + } + } + else + { + WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id"); + WRITE_UVLC(0, "vps_max_op_sets_minus1"); + } +#else WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id"); - WRITE_UVLC(0, "vps_max_op_sets_minus1"); + WRITE_UVLC(0, "vps_max_op_sets_minus1"); +#endif + WRITE_FLAG(0, "vps_timing_info_present_flag"); /* we signal timing info in SPS-VUI */ - WRITE_FLAG(0, "vps_extension_flag"); + +#if ENABLE_ALPHA || ENABLE_MULTIVIEW + if (vps.m_numLayers > 1 || vps.m_numViews > 1) + { + WRITE_FLAG(vps.vps_extension_flag, "vps_extension_flag"); + + if (vps.vps_extension_flag) + { + while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0) + { + WRITE_FLAG(1, "vps_extension_alignment_bit_equal_to_one"); + } + + WRITE_CODE(vps.ptl.levelIdc, 8, "general_level_idc"); + if (vps.maxTempSubLayers > 1) + { + for (int i = 0; i < vps.maxTempSubLayers - 1; i++) + { + WRITE_FLAG(0, "sub_layer_profile_present_flagi"); + WRITE_FLAG(0, "sub_layer_level_present_flagi"); + } + for (int i = vps.maxTempSubLayers - 1; i < 8; i++) + WRITE_CODE(0, 2, "reserved_zero_2bits"); + } + + WRITE_FLAG(vps.splitting_flag, "splitting flag"); + for (int i = 0; i < MAX_VPS_NUM_SCALABILITY_TYPES; i++) + { + WRITE_FLAG(vps.m_scalabilityMaski, "scalability_maski"); + } + for (int i = 0; i < vps.scalabilityTypes - vps.splitting_flag; i++) + { + WRITE_CODE(vps.m_dimensionIdLeni - 1, 3, "dimension_id_len_minus1i"); + } + WRITE_FLAG(vps.m_nuhLayerIdPresentFlag, "vps_nuh_layer_id_present_flag"); + for (int i = 1; i < maxLayers; i++) + { + if (vps.m_nuhLayerIdPresentFlag) + WRITE_CODE(vps.m_layerIdInNuhi, 6, "layer_id_in_nuhi"); + + if (!vps.splitting_flag) + { + for (int j = 0; j < vps.scalabilityTypes; j++) + { + uint8_t bits = vps.m_dimensionIdLenj; + WRITE_CODE(vps.m_dimensionIdij, bits, "dimension_idij"); + } + } + } + WRITE_CODE(vps.m_viewIdLen, 4, "view_id_len"); + +#if ENABLE_ALPHA + if (vps.m_numLayers > 1) + { + WRITE_FLAG(0, "direct_dependency_flag10"); + WRITE_UVLC(0, "num_add_layer_sets"); + WRITE_FLAG(0, "vps_sub_layers_max_minus1_present_flag"); + WRITE_FLAG(0, "max_tid_ref_present_flag"); + WRITE_FLAG(0, 
"default_ref_layers_active_flag"); + WRITE_UVLC(2, "vps_num_profile_tier_level_minus1"); + WRITE_FLAG(1, "vps_profile_present_flag"); + codeProfileTier(vps.ptl, vps.maxTempSubLayers, 1); + + WRITE_UVLC(0, "num_add_olss"); + WRITE_CODE(0, 2, "default_output_layer_idc"); + WRITE_CODE(1, 2, "profile_tier_level_idx i j "); + WRITE_CODE(2, 2, "profile_tier_level_idx i j "); + + WRITE_UVLC(0, "vps_num_rep_formats_minus1"); + + WRITE_CODE(sps.picWidthInLumaSamples, 16, "pic_width_vps_in_luma_samples"); + WRITE_CODE(sps.picHeightInLumaSamples, 16, "pic_height_vps_in_luma_samples"); + WRITE_FLAG(1, "chroma_and_bit_depth_vps_present_flag"); + + WRITE_CODE(sps.chromaFormatIdc, 2, "chroma_format_vps_idc"); + + if (sps.chromaFormatIdc == X265_CSP_I444) + WRITE_FLAG(0, "separate_colour_plane_vps_flag"); + + WRITE_CODE(X265_DEPTH - 8, 4, "bit_depth_vps_luma_minus8"); + WRITE_CODE(X265_DEPTH - 8, 4, "bit_depth_vps_chroma_minus8"); + + const Window& conf = sps.conformanceWindow; + WRITE_FLAG(conf.bEnabled, "conformance_window_vps_flag"); + if (conf.bEnabled) + { + int hShift = CHROMA_H_SHIFT(sps.chromaFormatIdc), vShift = CHROMA_V_SHIFT(sps.chromaFormatIdc); + WRITE_UVLC(conf.leftOffset >> hShift, "conf_win_vps_left_offset"); + WRITE_UVLC(conf.rightOffset >> hShift, "conf_win_vps_right_offset"); + WRITE_UVLC(conf.topOffset >> vShift, "conf_win_vps_top_offset"); + WRITE_UVLC(conf.bottomOffset >> vShift, "conf_win_vps_bottom_offset"); + } + + WRITE_FLAG(1, "max_one_active_ref_layer_flag"); + WRITE_FLAG(0, "vps_poc_lsb_aligned_flag"); + WRITE_FLAG(1, "poc_lsb_not_present_flag"); + + for (int i = 1; i < vps.m_vpsNumLayerSetsMinus1 + 1; i++) + { + WRITE_FLAG(vps.maxTempSubLayers > 1, "sub_layer_flag_info_present_flag"); + for (int j = 0; j < vps.maxTempSubLayers ; j++) + { + if(j > 0) + WRITE_FLAG(vps.maxTempSubLayers > 1, "sub_layer_dpb_info_present_flag"); + + for(int k = 0; k < vps.m_numLayersInIdListi; k++) + WRITE_UVLC(vps.maxDecPicBufferingj - 1, "vps_max_dec_pic_buffering_minus1i"); + + WRITE_UVLC(vps.numReorderPics0, "vps_num_reorder_picsi"); + WRITE_UVLC(vps.maxLatencyIncrease0 + 1, "vps_max_latency_increase_plus1i"); + } + } + + WRITE_UVLC(0, "direct_dep_type_len_minus2"); + + WRITE_FLAG(0, "default_direct_dependency_flag"); + WRITE_UVLC(0, "vps_non_vui_extension_length"); + WRITE_FLAG(0, "vps_vui_present_flag"); + WRITE_FLAG(0, "vps_extension2_flag"); + } +#endif + +#if ENABLE_MULTIVIEW + if (vps.m_numViews > 1) + { + for (uint8_t i = 0; i < vps.m_numViews; i++) + WRITE_CODE(i, vps.m_viewIdLen, "view_id_vali"); + + for (int i = 1; i < vps.m_numViews; i++) + { + for (int j = 0; j < i; j++) + { + if (j == 0) + WRITE_FLAG(1, "direct_dependency_flag10"); + else + WRITE_FLAG(0, "direct_dependency_flag10"); + } + } + WRITE_FLAG(0, "vps_sub_layers_max_minus1_present_flag"); + WRITE_FLAG(0, "max_tid_ref_present_flag"); + WRITE_FLAG(1, "default_ref_layers_active_flag");
View file
x265_3.6.tar.gz/source/encoder/entropy.h -> x265_4.0.tar.gz/source/encoder/entropy.h
Changed
@@ -141,14 +141,14 @@ void loadIntraDirModeLuma(const Entropy& src); void copyState(const Entropy& other); - void codeVPS(const VPS& vps); - void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl); - void codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26 ); - void codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo); + void codeVPS(const VPS& vps, const SPS& sps); + void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl, int layer = 0); + void codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26, int layer = 0); + void codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo, int layer = 0); void codeAUD(const Slice& slice); void codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers); - void codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp); + void codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp, int layer = 0); void codeSliceHeaderWPPEntryPoints(const uint32_t *substreamSizes, uint32_t numSubStreams, uint32_t maxOffset); void codeShortTermRefPicSet(const RPS& rps, int idx); void finishSlice() { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); } @@ -234,7 +234,7 @@ void writeEpExGolomb(uint32_t symbol, uint32_t count); void writeCoefRemainExGolomb(uint32_t symbol, const uint32_t absGoRice); - void codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers); + void codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers, int layer = 0); void codeScalingList(const ScalingList&); void codeScalingList(const ScalingList& scalingList, uint32_t sizeId, uint32_t listId);
View file
x265_3.6.tar.gz/source/encoder/frameencoder.cpp -> x265_4.0.tar.gz/source/encoder/frameencoder.cpp
Changed
@@ -41,11 +41,9 @@ FrameEncoder::FrameEncoder() { - m_prevOutputTime = x265_mdate(); m_reconfigure = false; m_isFrameEncoder = true; m_threadActive = true; - m_slicetypeWaitTime = 0; m_activeWorkerCount = 0; m_completionCount = 0; m_outStreams = NULL; @@ -56,11 +54,16 @@ m_rows = NULL; m_top = NULL; m_param = NULL; - m_frame = NULL; m_cuGeoms = NULL; m_ctuGeomMap = NULL; m_localTldIdx = 0; memset(&m_rce, 0, sizeof(RateControlEntry)); + for (int layer = 0; layer < MAX_LAYERS; layer++) + { + m_prevOutputTimelayer = x265_mdate(); + m_slicetypeWaitTimelayer = 0; + m_framelayer = NULL; + } } void FrameEncoder::destroy() @@ -94,6 +97,7 @@ X265_FREE(m_ctuGeomMap); X265_FREE(m_substreamSizes); X265_FREE(m_nr); + X265_FREE(m_retFrameBuffer); m_frameFilter.destroy(); @@ -216,6 +220,9 @@ ok &= !!m_frameEncTF->createRefPicInfo(&m_mcstfRefListi, m_param); } + m_retFrameBuffer = X265_MALLOC(Frame*, m_param->numLayers); + for (int layer = 0; layer < m_param->numLayers; layer++) + m_retFrameBufferlayer = NULL; return ok; } @@ -282,14 +289,17 @@ return true; } -bool FrameEncoder::startCompressFrame(Frame* curFrame) +bool FrameEncoder::startCompressFrame(Frame* curFrameMAX_LAYERS) { - m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime; - m_frame = curFrame; - m_sliceType = curFrame->m_lowres.sliceType; - curFrame->m_encData->m_frameEncoderID = m_jpId; - curFrame->m_encData->m_jobProvider = this; - curFrame->m_encData->m_slice->m_mref = m_mref; + for (int layer = 0; layer < m_param->numLayers; layer++) + { + m_slicetypeWaitTimelayer = x265_mdate() - m_prevOutputTimelayer; + m_framelayer = curFramelayer; + curFramelayer->m_encData->m_frameEncoderID = m_jpId; + curFramelayer->m_encData->m_jobProvider = this; + curFramelayer->m_encData->m_slice->m_mref = m_mref; + } + m_sliceType = curFrame0->m_lowres.sliceType; if (!m_cuGeoms) { @@ -355,15 +365,17 @@ { if (m_param->bCTUInfo) { - while (!m_frame->m_ctuInfo) - m_frame->m_copied.wait(); + while (!m_frame0->m_ctuInfo) + m_frame0->m_copied.wait(); } - if ((m_param->bAnalysisType == AVC_INFO) && !m_param->analysisSave && !m_param->analysisLoad && !(IS_X265_TYPE_I(m_frame->m_lowres.sliceType))) + if ((m_param->bAnalysisType == AVC_INFO) && !m_param->analysisSave && !m_param->analysisLoad && !(IS_X265_TYPE_I(m_frame0->m_lowres.sliceType))) { - while (((m_frame->m_analysisData.interData == NULL && m_frame->m_analysisData.intraData == NULL) || (uint32_t)m_frame->m_poc != m_frame->m_analysisData.poc)) - m_frame->m_copyMVType.wait(); + while (((m_frame0->m_analysisData.interData == NULL && m_frame0->m_analysisData.intraData == NULL) || (uint32_t)m_frame0->m_poc != m_frame0->m_analysisData.poc)) + m_frame0->m_copyMVType.wait(); } - compressFrame(); + + for (int layer = 0; layer < m_param->numLayers; layer++) + compressFrame(layer); m_done.trigger(); /* FrameEncoder::getEncodedPicture() blocks for this event */ m_enable.wait(); } @@ -371,7 +383,7 @@ void FrameEncoder::WeightAnalysis::processTasks(int /* workerThreadId */) { - Frame* frame = master.m_frame; + Frame* frame = master.m_framemaster.m_sLayerId; weightAnalyse(*frame->m_encData->m_slice, *frame, *master.m_param); } @@ -411,13 +423,13 @@ memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize); } - bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR; + bool isIDR = m_frame0->m_lowres.sliceType == X265_TYPE_IDR; return (payloadChange || isIDR); } -void FrameEncoder::writeTrailingSEIMessages() +void FrameEncoder::writeTrailingSEIMessages(int layer) { - Slice* slice = 
m_frame->m_encData->m_slice; + Slice* slice = m_framelayer->m_encData->m_slice; int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1; int32_t payloadSize = 0; @@ -444,21 +456,21 @@ } m_seiReconPictureDigest.setSize(payloadSize); - m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false); + m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false, layer); } -void FrameEncoder::compressFrame() +void FrameEncoder::compressFrame(int layer) { ProfileScopeEvent(frameThread); - m_startCompressTime = x265_mdate(); + m_startCompressTimelayer = x265_mdate(); m_totalActiveWorkerCount = 0; m_activeWorkerCountSamples = 0; - m_totalWorkerElapsedTime = 0; - m_totalNoWorkerTime = 0; + m_totalWorkerElapsedTimelayer = 0; + m_totalNoWorkerTimelayer = 0; m_countRowBlocks = 0; - m_allRowsAvailableTime = 0; - m_stallStartTime = 0; + m_allRowsAvailableTimelayer = 0; + m_stallStartTimelayer = 0; m_completionCount = 0; memset((void*)m_bAllRowsStop, 0, sizeof(bool) * m_param->maxSlices); @@ -466,18 +478,19 @@ m_rowSliceTotalBits0 = 0; m_rowSliceTotalBits1 = 0; - m_SSDY = m_SSDU = m_SSDV = 0; - m_ssim = 0; - m_ssimCnt = 0; - memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats)); + m_SSDYlayer = m_SSDUlayer = m_SSDVlayer = 0; + m_ssimlayer = 0; + m_ssimCntlayer = 0; + memset(&(m_framelayer->m_encData->m_frameStats), 0, sizeof(m_framelayer->m_encData->m_frameStats)); + m_sLayerId = layer; if (m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP) { - int height = m_frame->m_fencPic->m_picHeight; - int width = m_frame->m_fencPic->m_picWidth; - intptr_t stride = m_frame->m_fencPic->m_stride; + int height = m_framelayer->m_fencPic->m_picHeight; + int width = m_framelayer->m_fencPic->m_picWidth; + intptr_t stride = m_framelayer->m_fencPic->m_stride; - if (!computeEdge(m_frame->m_edgeBitPic, m_frame->m_fencPic->m_picOrg0, NULL, stride, height, width, false, 1)) + if (!computeEdge(m_framelayer->m_edgeBitPic, m_framelayer->m_fencPic->m_picOrg0, NULL, stride, height, width, false, 1)) { x265_log(m_param, X265_LOG_ERROR, " Failed to compute edge !"); } @@ -486,15 +499,15 @@ /* Emit access unit delimiter unless this is the first frame and the user is * not repeating headers (since AUD is supposed to be the first NAL in the access * unit) */ - Slice* slice = m_frame->m_encData->m_slice; + Slice* slice = m_framelayer->m_encData->m_slice; - if (m_param->bEnableEndOfSequence && m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_frame->m_poc) + if (m_param->bEnableEndOfSequence && m_framelayer->m_lowres.sliceType == X265_TYPE_IDR && m_framelayer->m_poc) { m_bs.resetBits(); m_nalList.serialize(NAL_UNIT_EOS, m_bs); } - if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders)) + if (m_param->bEnableAccessUnitDelimiters && (m_framelayer->m_poc || m_param->bRepeatHeaders)) { m_bs.resetBits(); m_entropyCoder.setBitstream(&m_bs); @@ -504,7 +517,7 @@
View file
x265_3.6.tar.gz/source/encoder/frameencoder.h -> x265_4.0.tar.gz/source/encoder/frameencoder.h
Changed
@@ -156,12 +156,12 @@ void destroy(); /* triggers encode of a new frame by the worker thread */ - bool startCompressFrame(Frame* curFrame); + bool startCompressFrame(Frame* curFrame[MAX_LAYERS]); /* blocks until worker thread is done, returns access unit */ - Frame *getEncodedPicture(NALList& list); + Frame **getEncodedPicture(NALList& list); - void initDecodedPictureHashSEI(int row, int cuAddr, int height); + void initDecodedPictureHashSEI(int row, int cuAddr, int height, int layer); Event m_enable; Event m_done; @@ -190,34 +190,35 @@ RateControlEntry m_rce; SEIDecodedPictureHash m_seiReconPictureDigest; - uint64_t m_SSDY; - uint64_t m_SSDU; - uint64_t m_SSDV; - double m_ssim; - uint64_t m_accessUnitBits; - uint32_t m_ssimCnt; + uint64_t m_SSDY[MAX_LAYERS]; + uint64_t m_SSDU[MAX_LAYERS]; + uint64_t m_SSDV[MAX_LAYERS]; + double m_ssim[MAX_LAYERS]; + uint64_t m_accessUnitBits[MAX_LAYERS]; + uint32_t m_ssimCnt[MAX_LAYERS]; volatile int m_activeWorkerCount; // count of workers currently encoding or filtering CTUs volatile int m_totalActiveWorkerCount; // sum of m_activeWorkerCount sampled at end of each CTU volatile int m_activeWorkerCountSamples; // count of times m_activeWorkerCount was sampled (think vbv restarts) volatile int m_countRowBlocks; // count of workers forced to abandon a row because of top dependency - int64_t m_startCompressTime; // timestamp when frame encoder is given a frame - int64_t m_row0WaitTime; // timestamp when row 0 is allowed to start - int64_t m_allRowsAvailableTime; // timestamp when all reference dependencies are resolved - int64_t m_endCompressTime; // timestamp after all CTUs are compressed - int64_t m_endFrameTime; // timestamp after RCEnd, NR updates, etc - int64_t m_stallStartTime; // timestamp when worker count becomes 0 - int64_t m_prevOutputTime; // timestamp when prev frame was retrieved by API thread - int64_t m_slicetypeWaitTime; // total elapsed time waiting for decided frame - int64_t m_totalWorkerElapsedTime; // total elapsed time spent by worker threads processing CTUs - int64_t m_totalNoWorkerTime; // total elapsed time without any active worker threads + int64_t m_startCompressTime[MAX_LAYERS]; // timestamp when frame encoder is given a frame + int64_t m_row0WaitTime[MAX_LAYERS]; // timestamp when row 0 is allowed to start + int64_t m_allRowsAvailableTime[MAX_LAYERS]; // timestamp when all reference dependencies are resolved + int64_t m_endCompressTime[MAX_LAYERS]; // timestamp after all CTUs are compressed + int64_t m_endFrameTime[MAX_LAYERS]; // timestamp after RCEnd, NR updates, etc + int64_t m_stallStartTime[MAX_LAYERS]; // timestamp when worker count becomes 0 + int64_t m_prevOutputTime[MAX_LAYERS]; // timestamp when prev frame was retrieved by API thread + int64_t m_slicetypeWaitTime[MAX_LAYERS]; // total elapsed time waiting for decided frame + int64_t m_totalWorkerElapsedTime[MAX_LAYERS]; // total elapsed time spent by worker threads processing CTUs + int64_t m_totalNoWorkerTime[MAX_LAYERS]; // total elapsed time without any active worker threads #if DETAILED_CU_STATS CUStats m_cuStats; #endif Encoder* m_top; x265_param* m_param; - Frame* m_frame; + Frame* m_frame[MAX_LAYERS]; + Frame** m_retFrameBuffer; NoiseReduction* m_nr; ThreadLocalData* m_tld; /* for --no-wpp */ Bitstream* m_outStreams; @@ -238,6 +239,8 @@ TemporalFilter* m_frameEncTF; TemporalFilterRefPicInfo m_mcstfRefList[MAX_MCSTF_TEMPORAL_WINDOW_LENGTH]; + int m_sLayerId; + class WeightAnalysis : public BondedTaskGroup { public: @@ -258,20 +261,20 @@ bool initializeGeoms(); /* analyze / compress frame, can be run in parallel within reference constraints */ - void compressFrame(); + void compressFrame(int layer); /* called by compressFrame to generate final per-row bitstreams */ - void encodeSlice(uint32_t sliceAddr); + void encodeSlice(uint32_t sliceAddr, int layer); void threadMain(); int collectCTUStatistics(const CUData& ctu, FrameStats* frameLog); void noiseReductionUpdate(); - void writeTrailingSEIMessages(); + void writeTrailingSEIMessages(int layer); bool writeToneMapInfo(x265_sei_payload *payload); /* Called by WaveFront::findJob() */ - virtual void processRow(int row, int threadId); - virtual void processRowEncoder(int row, ThreadLocalData& tld); + virtual void processRow(int row, int threadId, int layer); + virtual void processRowEncoder(int row, ThreadLocalData& tld, int layer); void enqueueRowEncoder(int row) { WaveFront::enqueueRow(row * 2 + 0); } void enqueueRowFilter(int row) { WaveFront::enqueueRow(row * 2 + 1); } @@ -280,8 +283,8 @@ #if ENABLE_LIBVMAF void vmafFrameLevelScore(); #endif - void collectDynDataFrame(); - void computeAvgTrainingData(); + void collectDynDataFrame(int layer); + void computeAvgTrainingData(int layer); void collectDynDataRow(CUData& ctu, FrameStats* rowStats); void readModel(FilmGrainCharacteristics* m_filmGrain, FILE* filmgrain); };
View file
x265_3.6.tar.gz/source/encoder/framefilter.cpp -> x265_4.0.tar.gz/source/encoder/framefilter.cpp
Changed
@@ -256,7 +256,7 @@ const int size = cu->m_log2CUSizeabsPartIdx - 2; const uint32_t cuAddr = cu->m_cuAddr; - PicYuv* reconPic = frame.m_reconPic; + PicYuv* reconPic = frame.m_reconPic0; PicYuv* fencPic = frame.m_fencPic; pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx); @@ -337,7 +337,7 @@ uint32_t cuAddr = m_rowAddr + col; const CUData* ctu = m_encData->getPicCTU(cuAddr); - assert(m_frameFilter->m_frame->m_reconPic == m_encData->m_reconPic); + assert(m_frameFilter->m_frame->m_reconPic0 == m_encData->m_reconPic0); origCUSampleRestoration(ctu, cuGeomsctuGeomMapcuAddr, *m_frameFilter->m_frame); } } @@ -352,7 +352,7 @@ if ((col != 0) & (col != m_frameFilter->m_numCols - 1) & (m_row != 0) & (m_row != m_frameFilter->m_numRows - 1)) return; - PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic; + PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic0; const uint32_t lineStartCUAddr = m_rowAddr + col; const int realH = getCUHeight(); const int realW = m_frameFilter->getCUWidth(col); @@ -441,7 +441,7 @@ SAOParam* saoParam = m_encData->m_saoParam; const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms; const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap; - PicYuv* reconPic = m_encData->m_reconPic; + PicYuv* reconPic = m_encData->m_reconPic0; const int colStart = m_lastCol.get(); const int numCols = m_frameFilter->m_numCols; // TODO: Waiting previous row finish or simple clip on it? @@ -561,7 +561,7 @@ } } -void FrameFilter::processRow(int row) +void FrameFilter::processRow(int row, int layer) { ProfileScopeEvent(filterCTURow); @@ -572,7 +572,7 @@ if (!m_param->bEnableLoopFilter && !m_useSao) { - processPostRow(row); + processPostRow(row, layer); return; } FrameData& encData = *m_frame->m_encData; @@ -616,7 +616,7 @@ // this row of CTUs has been encoded if (!ctu->m_bFirstRowInSlice) - processPostRow(row - 1); + processPostRow(row - 1, layer); // NOTE: slices parallelism will be execute out-of-order int numRowFinished = 0; @@ -648,12 +648,12 @@ } if (ctu->m_bLastRowInSlice) - processPostRow(row); + processPostRow(row, layer); } -void FrameFilter::processPostRow(int row) +void FrameFilter::processPostRow(int row, int layer) { - PicYuv *reconPic = m_frame->m_reconPic; + PicYuv *reconPic = m_frame->m_reconPic0; const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth; const uint32_t lineStartCUAddr = row * numCols; @@ -673,7 +673,7 @@ uint32_t height = m_parallelFilterrow.getCUHeight(); uint64_t ssdY = m_frameEncoder->m_top->computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height, m_param); - m_frameEncoder->m_SSDY += ssdY; + m_frameEncoder->m_SSDYlayer += ssdY; if (m_param->internalCsp != X265_CSP_I400) { @@ -684,8 +684,8 @@ uint64_t ssdU = m_frameEncoder->m_top->computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height, m_param); uint64_t ssdV = m_frameEncoder->m_top->computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height, m_param); - m_frameEncoder->m_SSDU += ssdU; - m_frameEncoder->m_SSDV += ssdV; + m_frameEncoder->m_SSDUlayer += ssdU; + m_frameEncoder->m_SSDVlayer += ssdV; } } @@ -705,15 +705,15 @@ /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right * to avoid alignment of ssim blocks with DCT blocks. */ minPixY += bStart ? 
2 : -6; - m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2, + m_frameEncoder->m_ssimlayer += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2, m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt); - m_frameEncoder->m_ssimCnt += ssim_cnt; + m_frameEncoder->m_ssimCntlayer += ssim_cnt; } if (m_param->maxSlices == 1) { uint32_t height = m_parallelFilterrow.getCUHeight(); - m_frameEncoder->initDecodedPictureHashSEI(row, cuAddr, height); + m_frameEncoder->initDecodedPictureHashSEI(row, cuAddr, height, layer); } // end of (m_param->maxSlices == 1) if (ATOMIC_INC(&m_frameEncoder->m_completionCount) == 2 * (int)m_frameEncoder->m_numRows) @@ -737,7 +737,7 @@ } } - int stride = (int)m_frame->m_reconPic->m_stride; + int stride = (int)m_frame->m_reconPic0->m_stride; int padX = m_param->maxCUSize + 32; int padY = m_param->maxCUSize + 16; int numCuInHeight = m_frame->m_encData->m_slice->m_sps->numCuInHeight; @@ -763,7 +763,7 @@ for (int y = startRow; y < height; y++) { - pixel *pix = m_frame->m_reconPic->m_picOrg0 + y * stride - padX; + pixel *pix = m_frame->m_reconPic0->m_picOrg0 + y * stride - padX; uint32_t *sum32x32 = m_frame->m_encData->m_meIntegral0 + (y + 1) * stride - padX; uint32_t *sum32x24 = m_frame->m_encData->m_meIntegral1 + (y + 1) * stride - padX; uint32_t *sum32x8 = m_frame->m_encData->m_meIntegral2 + (y + 1) * stride - padX;
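Note that the distortion accumulators are now indexed per layer (m_SSDY[layer], m_ssim[layer], m_ssimCnt[layer]), so alpha or auxiliary-view distortion no longer mixes into base-layer statistics. For orientation, a hedged sketch of the standard PSNR-from-SSD computation that these sums ultimately feed; the function is illustrative, not a quote of x265 code:

    // Requires <cmath>. Global PSNR from an SSD accumulated over 'samples' pixels.
    double psnrFromSSD(uint64_t ssd, uint64_t samples, int bitDepth)
    {
        if (!ssd)
            return 99.99;                                  // conventional clamp for a perfect match
        double peak = (double)((1 << bitDepth) - 1);
        return 10.0 * log10(peak * peak * (double)samples / (double)ssd);
    }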
View file
x265_3.6.tar.gz/source/encoder/framefilter.h -> x265_4.0.tar.gz/source/encoder/framefilter.h
Changed
@@ -128,8 +128,8 @@
     void start(Frame *pic, Entropy& initState);

-    void processRow(int row);
-    void processPostRow(int row);
+    void processRow(int row, int layer);
+    void processPostRow(int row, int layer);
     void computeMEIntegral(int row);
 };
 }
View file
x265_3.6.tar.gz/source/encoder/level.cpp -> x265_4.0.tar.gz/source/encoder/level.cpp
Changed
@@ -60,6 +60,42 @@ { MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, 1, Level::LEVEL8_5, "8.5", 85 }, }; +#if ENABLE_SCC_EXT +enum SCCProfileName +{ + NONE = 0, + // The following are SCC profiles, which would map to the MAINSCC profile idc. + // The enumeration indicates the bit-depth constraint in the bottom 2 digits + // the chroma format in the next digit + // the intra constraint in the next digit + // If it is a SCC profile there is a '2' for the next digit. + // If it is a highthroughput , there is a '2' for the top digit else '1' for the top digit + SCC_MAIN = 121108, + SCC_MAIN_10 = 121110, + SCC_MAIN_444 = 121308, + SCC_MAIN_444_10 = 121310, +}; + +static const SCCProfileName validSCCProfileNames14/* bit depth constraint 8=0, 10=1, 12=2, 14=3*/4/*chroma format*/ = +{ + { + { NONE, SCC_MAIN, NONE, SCC_MAIN_444 }, // 8-bit intra for 400, 420, 422 and 444 + { NONE, SCC_MAIN_10, NONE, SCC_MAIN_444_10 }, // 10-bit intra for 400, 420, 422 and 444 + { NONE, NONE, NONE, NONE }, // 12-bit intra for 400, 420, 422 and 444 + { NONE, NONE, NONE, NONE } // 16-bit intra for 400, 420, 422 and 444 + }, +}; +#endif + +static inline int _confirm(x265_param* param, bool bflag, const char* message) +{ + if (!bflag) + return 0; + + x265_log(param, X265_LOG_ERROR, "%s\n", message); + return 1; +} + /* determine minimum decoder level required to decode the described video */ void determineLevel(const x265_param ¶m, VPS& vps) { @@ -80,45 +116,74 @@ if (param.internalBitDepth <= 8) { if (vps.ptl.onePictureOnlyConstraintFlag) - vps.ptl.profileIdc = Profile::MAINSTILLPICTURE; + vps.ptl.profileIdc0 = Profile::MAINSTILLPICTURE; else if (vps.ptl.intraConstraintFlag) - vps.ptl.profileIdc = Profile::MAINREXT; /* Main Intra */ + vps.ptl.profileIdc0 = Profile::MAINREXT; /* Main Intra */ else - vps.ptl.profileIdc = Profile::MAIN; + vps.ptl.profileIdc0 = Profile::MAIN; + +#if ENABLE_ALPHA + if (param.numScalableLayers == 2) + vps.ptl.profileIdc1 = Profile::SCALABLEMAIN; +#endif } else if (param.internalBitDepth <= 10) { /* note there is no 10bit still picture profile */ if (vps.ptl.intraConstraintFlag) - vps.ptl.profileIdc = Profile::MAINREXT; /* Main10 Intra */ + vps.ptl.profileIdc0 = Profile::MAINREXT; /* Main10 Intra */ else - vps.ptl.profileIdc = Profile::MAIN10; + vps.ptl.profileIdc0 = Profile::MAIN10; + +#if ENABLE_ALPHA + if (param.numScalableLayers == 2) + vps.ptl.profileIdc1 = Profile::SCALABLEMAIN10; +#endif } } else - vps.ptl.profileIdc = Profile::MAINREXT; + vps.ptl.profileIdc0 = Profile::MAINREXT; + +#if ENABLE_MULTIVIEW + if (param.numViews == 2) + vps.ptl.profileIdc1 = Profile::MULTIVIEWMAIN; +#endif + +#if ENABLE_SCC_EXT + if (param.bEnableSCC) + vps.ptl.profileIdc0 = Profile::MAINSCC; /* determine which profiles are compatible with this stream */ + if (vps.ptl.profileIdc0 == Profile::MAINSCC) + { + vps.ptl.onePictureOnlyConstraintFlag = false; + vps.ptl.intraConstraintFlag = param.keyframeMax <= 1 || vps.ptl.onePictureOnlyConstraintFlag; + } +#endif memset(vps.ptl.profileCompatibilityFlag, 0, sizeof(vps.ptl.profileCompatibilityFlag)); - vps.ptl.profileCompatibilityFlagvps.ptl.profileIdc = true; - if (vps.ptl.profileIdc == Profile::MAIN10 && param.internalBitDepth == 8) + vps.ptl.profileCompatibilityFlagvps.ptl.profileIdc0 = true; + if (vps.ptl.profileIdc0 == Profile::MAIN10 && param.internalBitDepth == 8) vps.ptl.profileCompatibilityFlagProfile::MAIN = true; - else if (vps.ptl.profileIdc == Profile::MAIN) + else if (vps.ptl.profileIdc0 == Profile::MAIN) 
vps.ptl.profileCompatibilityFlagProfile::MAIN10 = true; - else if (vps.ptl.profileIdc == Profile::MAINSTILLPICTURE) + else if (vps.ptl.profileIdc0 == Profile::MAINSTILLPICTURE) { vps.ptl.profileCompatibilityFlagProfile::MAIN = true; vps.ptl.profileCompatibilityFlagProfile::MAIN10 = true; } - else if (vps.ptl.profileIdc == Profile::MAINREXT) + else if (vps.ptl.profileIdc0 == Profile::MAINREXT) vps.ptl.profileCompatibilityFlagProfile::MAINREXT = true; +#if ENABLE_SCC_EXT + else if (vps.ptl.profileIdc0 == Profile::MAINSCC) + vps.ptl.profileCompatibilityFlagProfile::MAINSCC = true; +#endif uint32_t lumaSamples = param.sourceWidth * param.sourceHeight; uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom)); uint32_t bitrate = param.rc.vbvMaxBitrate ? param.rc.vbvMaxBitrate : param.rc.bitrate; - const uint32_t MaxDpbPicBuf = 6; + const uint32_t MaxDpbPicBuf = param.bEnableSCC ? 7 : 6; vps.ptl.levelIdc = Level::NONE; vps.ptl.tierFlag = Level::MAIN; @@ -174,7 +239,7 @@ if (levelsi.levelEnum >= Level::LEVEL5 && param.maxCUSize < 32) { x265_log(¶m, X265_LOG_WARNING, "level %s detected, but CTU size 16 is non-compliant\n", levelsi.name); - vps.ptl.profileIdc = Profile::NONE; + vps.ptl.profileIdc0 = Profile::NONE; vps.ptl.levelIdc = Level::NONE; vps.ptl.tierFlag = Level::MAIN; x265_log(¶m, X265_LOG_INFO, "NONE profile, Level-NONE (Main tier)\n"); @@ -186,7 +251,7 @@ if (numPocTotalCurr > 10) { x265_log(¶m, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is non-compliant\n", levelsi.name); - vps.ptl.profileIdc = Profile::NONE; + vps.ptl.profileIdc0 = Profile::NONE; vps.ptl.levelIdc = Level::NONE; vps.ptl.tierFlag = Level::MAIN; x265_log(¶m, X265_LOG_INFO, "NONE profile, Level-NONE (Main tier)\n"); @@ -217,14 +282,32 @@ break; } - static const char *profiles = { "None", "Main", "Main 10", "Main Still Picture", "RExt" }; +#if ENABLE_SCC_EXT + x265_param m_param = param; +#define CHECK(expr, msg) check_failed |= _confirm(&m_param, expr, msg) + int check_failed = 0; /* abort if there is a fatal configuration problem */ + + if (vps.ptl.profileIdc0 == Profile::MAINSCC) + { + CHECK(vps.ptl.lowerBitRateConstraintFlag == false && vps.ptl.intraConstraintFlag == false, "The lowerBitRateConstraint flag cannot be false when intraConstraintFlag is false"); + CHECK(param.bEnableSCC && !(vps.ptl.profileIdc0 == Profile::MAINSCC), "UseIntraBlockCopy must not be enabled unless the SCC profile is being used."); + CHECK(vps.ptl.intraConstraintFlag, "intra constraint flag must be 0 for SCC profiles"); + CHECK(vps.ptl.onePictureOnlyConstraintFlag, "one-picture-only constraint flag shall be 0 for SCC profiles"); + const uint32_t bitDepthIdx = (vps.ptl.bitDepthConstraint == 8 ? 0 : (vps.ptl.bitDepthConstraint == 10 ? 1 : (vps.ptl.bitDepthConstraint == 12 ? 2 : (vps.ptl.bitDepthConstraint == 16 ? 3 : 4)))); + const uint32_t chromaFormatIdx = uint32_t(vps.ptl.chromaFormatConstraint); + const bool bValidProfile = (bitDepthIdx > 2 || chromaFormatIdx > 3) ? 
false : (validSCCProfileNames0bitDepthIdxchromaFormatIdx != NONE); + CHECK(!bValidProfile, "Invalid intra constraint flag, bit depth constraint flag and chroma format constraint flag combination for a RExt profile"); + } +#endif + + static const char* profiles = { "None", "Main", "Main 10", "Main Still Picture", "RExt", "", "", "", "", "Main Scc" }; static const char *tiers = { "Main", "High" }; char profbuf64; - strcpy(profbuf, profilesvps.ptl.profileIdc); + strcpy(profbuf, profilesvps.ptl.profileIdc0); bool bStillPicture = false; - if (vps.ptl.profileIdc == Profile::MAINREXT) + if (vps.ptl.profileIdc0 == Profile::MAINREXT) { if (vps.ptl.bitDepthConstraint > 12 && vps.ptl.intraConstraintFlag) { @@ -277,6 +360,27 @@ if (vps.ptl.intraConstraintFlag && !bStillPicture) strcat(profbuf, " Intra"); } + +#if ENABLE_SCC_EXT + if (vps.ptl.profileIdc0 == Profile::MAINSCC) + { + if (param.internalCsp == X265_CSP_I420) + { + if (vps.ptl.bitDepthConstraint <= 8) + strcpy(profbuf, "Main Scc"); + else if (vps.ptl.bitDepthConstraint <= 10) + strcpy(profbuf, "Main 10 Scc"); + } + else if (param.internalCsp == X265_CSP_I444)
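The SCC profile enumerators above pack their constraints into decimal digits exactly as the accompanying comment describes: the bottom two digits carry the bit-depth constraint, the next digit the chroma format, then the intra-constraint digit, the SCC marker and the throughput digit. A small hedged decoder, purely for illustration:

    // Illustrative decode of the digit scheme, e.g. SCC_MAIN_444_10 = 121310
    // -> bitDepth = 10, chromaIdx = 3 (4:4:4). Requires <cstdio>.
    void decodeSccProfile(int profile)
    {
        int bitDepth  = profile % 100;          // bottom two digits
        int chromaIdx = (profile / 100) % 10;   // 0=4:0:0, 1=4:2:0, 2=4:2:2, 3=4:4:4
        int intraBit  = (profile / 1000) % 10;  // intra-constraint digit
        printf("bitDepth=%d chromaIdx=%d intra=%d\n", bitDepth, chromaIdx, intraBit);
    }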
View file
x265_3.6.tar.gz/source/encoder/motion.cpp -> x265_4.0.tar.gz/source/encoder/motion.cpp
Changed
@@ -770,6 +770,7 @@
                                    int merange,
                                    MV & outQMv,
                                    uint32_t maxSlices,
+                                   bool m_vertRestriction,
                                    pixel * srcReferencePlane)
 {
     ALIGN_VAR_16(int, costs[16]);

@@ -794,6 +795,13 @@
     // measure SAD cost at clipped QPEL MVP
     MV pmv = qmvp.clipped(qmvmin, qmvmax);
+    if (m_vertRestriction)
+    {
+        if (pmv.y > mvmax.y << 2)
+        {
+            pmv.y = (mvmax.y << 2);
+        }
+    }
     MV bestpre = pmv;
     int bprecost;
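The new m_vertRestriction path applies when the current picture itself is in the reference list (the SCC self-reference case wired up in search.cpp below): the MVP's vertical component is clamped so the search cannot reach into rows that are not yet reconstructed. The << 2 converts the full-pel bound in mvmax.y to the quarter-pel units of pmv. A tiny worked example under that assumption:

    int16_t mvmaxY = 12;                 // vertical bound in full pels
    int16_t pmvY   = 50;                 // MVP vertical component in quarter-pel units
    if (pmvY > (mvmaxY << 2))            // 12 full pels == 48 quarter pels
        pmvY = (int16_t)(mvmaxY << 2);   // pmvY clamped from 50 to 48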
View file
x265_3.6.tar.gz/source/encoder/motion.h -> x265_4.0.tar.gz/source/encoder/motion.h
Changed
@@ -95,7 +95,7 @@
     }
     void refineMV(ReferencePlanes* ref, const MV& mvmin, const MV& mvmax, const MV& qmvp, MV& outQMv);

-    int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv, uint32_t maxSlices, pixel *srcReferencePlane = 0);
+    int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv, uint32_t maxSlices, bool m_vertRestriction, pixel *srcReferencePlane = 0);

     int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
View file
x265_3.6.tar.gz/source/encoder/nal.cpp -> x265_4.0.tar.gz/source/encoder/nal.cpp
Changed
@@ -57,7 +57,7 @@
         other.m_buffer = X265_MALLOC(uint8_t, m_allocSize);
 }

-void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID)
+void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs, int layerId, uint8_t temporalID)
 {
     static const char startCodePrefix[] = { 0, 0, 0, 1 };

@@ -114,7 +114,7 @@
      * nuh_reserved_zero_6bits  6-bits
      * nuh_temporal_id_plus1    3-bits */
     out[bytes++] = (uint8_t)nalUnitType << 1;
-    out[bytes++] = temporalID;
+    out[bytes++] = (layerId << 3) | (temporalID);

     /* 7.4.1 ...
      * Within the NAL unit, the following three-byte sequences shall not occur at
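Folding layerId into the second header byte matches the 16-bit HEVC NAL unit header layout: byte 0 carries forbidden_zero_bit, the six nal_unit_type bits and the top bit of nuh_layer_id; byte 1 carries the remaining five nuh_layer_id bits and the three nuh_temporal_id_plus1 bits. The shift above therefore assumes layerId fits in five bits, which holds for the two-layer alpha/multiview configurations. An illustrative packing for layer 1 of a VPS NAL (type 32) with temporal_id_plus1 = 1:

    uint8_t byte0 = (uint8_t)(32 << 1);   // 0x40: forbidden=0, nal_unit_type=32, nuh_layer_id MSB=0
    uint8_t byte1 = (1 << 3) | 1;         // 0x09: nuh_layer_id low bits=1, temporal_id_plus1=1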
View file
x265_3.6.tar.gz/source/encoder/nal.h -> x265_4.0.tar.gz/source/encoder/nal.h
Changed
@@ -35,7 +35,11 @@
 class NALList
 {
 public:
+#if ENABLE_MULTIVIEW || ENABLE_ALPHA
+    static const int MAX_NAL_UNITS = 32;
+#else
     static const int MAX_NAL_UNITS = 16;
+#endif

 public:

@@ -56,7 +60,7 @@
     void takeContents(NALList& other);

-    void serialize(NalUnitType nalUnitType, const Bitstream& bs, uint8_t temporalID = 1);
+    void serialize(NalUnitType nalUnitType, const Bitstream& bs, int layerId = 0, uint8_t temporalID = 1);

     uint32_t serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams);
 };
View file
x265_3.6.tar.gz/source/encoder/ratecontrol.cpp -> x265_4.0.tar.gz/source/encoder/ratecontrol.cpp
Changed
@@ -1349,6 +1349,10 @@ FrameData& curEncData = *curFrame->m_encData; m_curSlice = curEncData.m_slice; m_sliceType = m_curSlice->m_sliceType; +#if ENABLE_SCC_EXT + if(m_param->bEnableSCC) + m_sliceType = m_curSlice->m_origSliceType; +#endif rce->sliceType = m_sliceType; if (!m_2pass) rce->keptAsRef = IS_REFERENCED(curFrame); @@ -1466,7 +1470,7 @@ int mincr = enc->m_vps.ptl.minCrForLevel; /* Profiles above Main10 don't require maxAU size check, so just set the maximum to a large value. */ - if (enc->m_vps.ptl.profileIdc > Profile::MAIN10 || enc->m_vps.ptl.levelIdc == Level::NONE) + if (enc->m_vps.ptl.profileIdc0 > Profile::MAIN10 || enc->m_vps.ptl.levelIdc == Level::NONE) rce->frameSizeMaximum = 1e9; else {
View file
x265_3.6.tar.gz/source/encoder/sao.cpp -> x265_4.0.tar.gz/source/encoder/sao.cpp
Changed
@@ -36,12 +36,6 @@ return num >= 0 ? ((num * 2 + den) / (den * 2)) : -((-num * 2 + den) / (den * 2)); } -/* get the sign of input variable (TODO: this is a dup, make common) */ -inline int8_t signOf(int x) -{ - return (x >> 31) | ((int)((((uint32_t)-x)) >> 31)); -} - inline int signOf2(const int a, const int b) { // NOTE: don't reorder below compare, both ICL, VC, GCC optimize strong depends on order! @@ -273,7 +267,7 @@ // CTU-based SAO process without slice granularity void SAO::applyPixelOffsets(int addr, int typeIdx, int plane) { - PicYuv* reconPic = m_frame->m_reconPic; + PicYuv* reconPic = m_frame->m_reconPic0; pixel* rec = reconPic->getPlaneAddr(plane, addr); intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride; uint32_t picWidth = m_param->sourceWidth; @@ -328,10 +322,10 @@ { for (int y = 0; y < ctuHeight; y++, rec += stride) { - int signLeft = signOf(recstartX - tmpLy); + int signLeft = x265_signOf(recstartX - tmpLy); for (int x = startX; x < endX; x++) { - int signRight = signOf(recx - recx + 1); + int signRight = x265_signOf(recx - recx + 1); int edgeType = signRight + signLeft + 2; signLeft = -signRight; @@ -343,8 +337,8 @@ { for (int y = 0; y < ctuHeight; y += 2, rec += 2 * stride) { - signLeft10 = signOf(recstartX - tmpLy); - signLeft11 = signOf(recstride + startX - tmpLy + 1); + signLeft10 = x265_signOf(recstartX - tmpLy); + signLeft11 = x265_signOf(recstride + startX - tmpLy + 1); if (!lpelx) { @@ -385,13 +379,13 @@ if (ctuWidth & 15) { for (int x = 0; x < ctuWidth; x++) - upBuff1x = signOf(recx - tmpUx); + upBuff1x = x265_signOf(recx - tmpUx); for (int y = startY; y < endY; y++, rec += stride) { for (int x = 0; x < ctuWidth; x++) { - int8_t signDown = signOf(recx - recx + stride); + int8_t signDown = x265_signOf(recx - recx + stride); int edgeType = signDown + upBuff1x + 2; upBuff1x = -signDown; @@ -445,17 +439,17 @@ else { for (int x = startX; x < endX; x++) - upBuff1x = signOf(recx - tmpUx - 1); + upBuff1x = x265_signOf(recx - tmpUx - 1); } if (ctuWidth & 15) { for (int y = startY; y < endY; y++, rec += stride) { - upBufftstartX = signOf(recstride + startX - tmpLy); + upBufftstartX = x265_signOf(recstride + startX - tmpLy); for (int x = startX; x < endX; x++) { - int8_t signDown = signOf(recx - recx + stride + 1); + int8_t signDown = x265_signOf(recx - recx + stride + 1); int edgeType = signDown + upBuff1x + 2; upBufftx + 1 = -signDown; recx = m_clipTablerecx + offsetEoedgeType; @@ -468,7 +462,7 @@ { for (int y = startY; y < endY; y++, rec += stride) { - int8_t iSignDown2 = signOf(recstride + startX - tmpLy); + int8_t iSignDown2 = x265_signOf(recstride + startX - tmpLy); primitives.saoCuOrgE2endX > 16(rec + startX, upBufft + startX, upBuff1 + startX, offsetEo, endX - startX, stride); @@ -493,25 +487,25 @@ if (ctuWidth & 15) { for (int x = startX - 1; x < endX; x++) - upBuff1x = signOf(recx - tmpUx + 1); + upBuff1x = x265_signOf(recx - tmpUx + 1); for (int y = startY; y < endY; y++, rec += stride) { int x = startX; - int8_t signDown = signOf(recx - tmpLy + 1); + int8_t signDown = x265_signOf(recx - tmpLy + 1); int edgeType = signDown + upBuff1x + 2; upBuff1x - 1 = -signDown; recx = m_clipTablerecx + offsetEoedgeType; for (x = startX + 1; x < endX; x++) { - signDown = signOf(recx - recx + stride - 1); + signDown = x265_signOf(recx - recx + stride - 1); edgeType = signDown + upBuff1x + 2; upBuff1x - 1 = -signDown; recx = m_clipTablerecx + offsetEoedgeType; } - upBuff1endX - 1 = signOf(recendX - 1 + stride - recendX); + upBuff1endX - 1 = x265_signOf(recendX - 1 
+ stride - recendX); } } else @@ -519,7 +513,7 @@ int8_t firstSign, lastSign; if (lpelx) - firstSign = signOf(rec-1 - tmpU0); + firstSign = x265_signOf(rec-1 - tmpU0); if (rpelx == picWidth) lastSign = upBuff1ctuWidth - 1; @@ -533,14 +527,14 @@ for (int y = startY; y < endY; y++, rec += stride) { int x = startX; - int8_t signDown = signOf(recx - tmpLy + 1); + int8_t signDown = x265_signOf(recx - tmpLy + 1); int edgeType = signDown + upBuff1x + 2; upBuff1x - 1 = -signDown; recx = m_clipTablerecx + offsetEoedgeType; primitives.saoCuOrgE3endX > 16(rec, upBuff1, offsetEo, stride - 1, startX, endX); - upBuff1endX - 1 = signOf(recendX - 1 + stride - recendX); + upBuff1endX - 1 = x265_signOf(recendX - 1 + stride - recendX); } } @@ -571,7 +565,7 @@ /* Process SAO unit */ void SAO::generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX) { - PicYuv* reconPic = m_frame->m_reconPic; + PicYuv* reconPic = m_frame->m_reconPic0; intptr_t stride = reconPic->m_stride; int ctuWidth = m_param->maxCUSize; int ctuHeight = m_param->maxCUSize; @@ -631,7 +625,7 @@ /* Process SAO unit (Chroma only) */ void SAO::generateChromaOffsets(SaoCtuParam* ctuParam3, int idxY, int idxX) { - PicYuv* reconPic = m_frame->m_reconPic; + PicYuv* reconPic = m_frame->m_reconPic0; intptr_t stride = reconPic->m_strideC; int ctuWidth = m_param->maxCUSize; int ctuHeight = m_param->maxCUSize; @@ -735,7 +729,7 @@ void SAO::calcSaoStatsCTU(int addr, int plane) { Slice* slice = m_frame->m_encData->m_slice; - const PicYuv* reconPic = m_frame->m_reconPic; + const PicYuv* reconPic = m_frame->m_reconPic0; const CUData* cu = m_frame->m_encData->getPicCTU(addr); const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr); const pixel* rec0 = reconPic->getPlaneAddr(plane, addr); @@ -922,7 +916,7 @@ int x, y; const CUData* cu = frame->m_encData->getPicCTU(addr); - const PicYuv* reconPic = m_frame->m_reconPic; + const PicYuv* reconPic = m_frame->m_reconPic0; const pixel* fenc; const pixel* rec; intptr_t stride = reconPic->m_stride; @@ -1030,10 +1024,10 @@ for (y = 0; y < ctuHeight; y++) { x = (y < startY ? startX : firstX); - int signLeft = signOf(recx - recx - 1); + int signLeft = x265_signOf(recx - recx - 1); for (; x < endX; x++) { - int signRight = signOf(recx - recx + 1); + int signRight = x265_signOf(recx - recx + 1); int edgeType = signRight + signLeft + 2; signLeft = -signRight; @@ -1069,13 +1063,13 @@ }
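All of the SAO edge-offset loops now call the shared x265_signOf() instead of the file-local signOf() deleted at the top of this diff. The removed branchless form, reproduced here as a standalone reference (the shared helper in common code is presumably equivalent), returns -1, 0 or +1:

    // Branchless three-way sign, as removed above: -1, 0 or +1.
    static inline int8_t signOfExample(int x)
    {
        return (int8_t)((x >> 31) | ((int)(((uint32_t)-x) >> 31)));
    }
    // signOfExample(-7) == -1, signOfExample(0) == 0, signOfExample(7) == +1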
View file
x265_3.6.tar.gz/source/encoder/search.cpp -> x265_4.0.tar.gz/source/encoder/search.cpp
Changed
@@ -76,6 +76,9 @@ m_param = ¶m; m_bFrameParallel = param.frameNumThreads > 1; m_numLayers = g_log2Sizeparam.maxCUSize - 2; +#if ENABLE_SCC_EXT + m_ibcEnabled = param.bEnableSCC; +#endif m_rdCost.setPsyRdScale(param.psyRd); m_rdCost.setSsimRd(param.bSsimRd); @@ -171,6 +174,11 @@ CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE); CHECKED_MALLOC(m_tsRecon, pixel, MAX_TS_SIZE * MAX_TS_SIZE); +#if ENABLE_SCC_EXT + m_numBVs = 0; + m_numBV16s = 0; +#endif + return ok; fail: @@ -496,7 +504,7 @@ } // set reconstruction for next intra prediction blocks if full TU prediction won - PicYuv* reconPic = m_frame->m_reconPic; + PicYuv* reconPic = m_frame->m_reconPic0; pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); intptr_t picStride = reconPic->m_stride; primitives.cusizeIdx.copy_pp(picReconY, picStride, reconQt, reconQtStride); @@ -672,7 +680,7 @@ } // set reconstruction for next intra prediction blocks - PicYuv* reconPic = m_frame->m_reconPic; + PicYuv* reconPic = m_frame->m_reconPic0; pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); intptr_t picStride = reconPic->m_stride; primitives.cusizeIdx.copy_pp(picReconY, picStride, reconQt, reconQtStride); @@ -723,7 +731,7 @@ uint32_t sizeIdx = log2TrSize - 2; primitives.cusizeIdx.calcresidualstride % 64 == 0(fenc, pred, residual, stride); - PicYuv* reconPic = m_frame->m_reconPic; + PicYuv* reconPic = m_frame->m_reconPic0; pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); intptr_t picStride = reconPic->m_stride; @@ -887,7 +895,7 @@ coeff_t* coeffC = m_rqtqtLayer.coeffRQTchromaId + coeffOffsetC; pixel* reconQt = m_rqtqtLayer.reconQtYuv.getChromaAddr(chromaId, absPartIdxC); uint32_t reconQtStride = m_rqtqtLayer.reconQtYuv.m_csize; - PicYuv* reconPic = m_frame->m_reconPic; + PicYuv* reconPic = m_frame->m_reconPic0; pixel* picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC); intptr_t picStride = reconPic->m_strideC; @@ -1078,7 +1086,7 @@ cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep); - PicYuv* reconPic = m_frame->m_reconPic; + PicYuv* reconPic = m_frame->m_reconPic0; pixel* reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC); intptr_t picStride = reconPic->m_strideC; primitives.cusizeIdxC.copy_pp(reconPicC, picStride, reconQt, reconQtStride); @@ -1185,7 +1193,7 @@ int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC); uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); coeff_t* coeffC = cu.m_trCoeffttype + coeffOffsetC; - PicYuv* reconPic = m_frame->m_reconPic; + PicYuv* reconPic = m_frame->m_reconPic0; pixel* picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC); intptr_t picStride = reconPic->m_strideC; @@ -1284,6 +1292,11 @@ updateModeCost(intraMode); checkDQP(intraMode, cuGeom); + +#if ENABLE_SCC_EXT + if (m_param->bEnableSCC) + intraMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic1, cu.m_cuAddr, cuGeom.absPartIdx); +#endif } /* Note that this function does not save the best intra prediction, it must @@ -1671,7 +1684,7 @@ * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also * it is not updating m_rdContextsdepth.cur for the later PUs which I suspect is slightly wrong. 
I think * that the contexts should be tracked through each PU */ - PicYuv* reconPic = m_frame->m_reconPic; + PicYuv* reconPic = m_frame->m_reconPic0; pixel* dst = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); uint32_t dststride = reconPic->m_stride; const pixel* src = reconYuv->getLumaAddr(absPartIdx); @@ -1844,7 +1857,7 @@ if (!tuIterator.isLastSection()) { uint32_t zorder = cuGeom.absPartIdx + absPartIdxC; - PicYuv* reconPic = m_frame->m_reconPic; + PicYuv* reconPic = m_frame->m_reconPic0; uint32_t dststride = reconPic->m_strideC; const pixel* src; pixel* dst; @@ -1895,7 +1908,9 @@ MVField candMvFieldMRG_MAX_NUM_CANDS2; uint8_t candDirMRG_MAX_NUM_CANDS; uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir); - +#if ENABLE_SCC_EXT + restrictBipredMergeCand(&cu, 0, candMvField, candDir, numMergeCand); +#else if (cu.isBipredRestriction()) { /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */ @@ -1908,6 +1923,7 @@ } } } +#endif Yuv& tempYuv = m_rqtcuGeom.depth.tmpPredYuv; @@ -1936,6 +1952,12 @@ continue; } +#if ENABLE_SCC_EXT + if ((candDirmergeCand == 1 || candDirmergeCand == 3) && (m_slice->m_refPOCList0candMvFieldmergeCand0.refIdx == m_slice->m_poc)) + { + continue; + } +#endif cu.m_mv0pu.puAbsPartIdx = candMvFieldmergeCand0.mv; cu.m_refIdx0pu.puAbsPartIdx = (int8_t)candMvFieldmergeCand0.refIdx; cu.m_mv1pu.puAbsPartIdx = candMvFieldmergeCand1.mv; @@ -2015,7 +2037,12 @@ continue; } cu.clipMv(mvCand); - predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicListlistref, mvCand); +#if ENABLE_SCC_EXT + if (m_slice->m_param->bEnableSCC && !list && ref == m_slice->m_numRefIdx0 - 1) + predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refFrameListlistref->m_reconPic1, mvCand); + else +#endif + predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicListlistref, mvCand); costsi = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size); } @@ -2086,13 +2113,18 @@ void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref) { uint32_t bits = master.m_listSelBitslist + MVP_IDX_BITS; - bits += getTUBits(ref, m_slice->m_numRefIdxlist); + int numIdx = m_slice->m_numRefIdxlist; +#if ENABLE_SCC_EXT + if (!list && m_ibcEnabled) + numIdx--; +#endif + bits += getTUBits(ref, numIdx); MotionData* bestME = interMode.bestMEpart; // 12 mv candidates including lowresMV MV mvc(MD_ABOVE_LEFT + 1) * 2 + 2; - int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCandlistref, mvc); + int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCandlistref, mvc, 0, pu.puAbsPartIdx); const MV* amvp = interMode.amvpCandlistref; int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref); @@ -2102,22 +2134,24 @@ if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging if lowresMV is not available */ { MV lmv = getLowresMV(interMode.cu, pu, list, ref); - if (lmv.notZero()) + int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? 
m_frame->m_sLayerId : 0; + if (lmv.notZero() && !layer) mvcnumMvc++ = lmv; if (m_param->bEnableHME) mvp_lowres = lmv; } + m_vertRestriction = interMode.cu.m_slice->m_refPOCListlistref == interMode.cu.m_slice->m_poc; setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax); - int satdCost = m_me.motionEstimate(&m_slice->m_mreflistref, mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, + int satdCost = m_me.motionEstimate(&m_slice->m_mreflistref, mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction, m_param->bSourceReferenceEstimation ? m_slice->m_refFrameListlistref->m_fencPic->getLumaAddr(0) : 0); if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp) { MV outmv_lowres; setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax); - int lowresMvCost = m_me.motionEstimate(&m_slice->m_mreflistref, mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, + int lowresMvCost = m_me.motionEstimate(&m_slice->m_mreflistref, mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction, m_param->bSourceReferenceEstimation ? m_slice->m_refFrameListlistref->m_fencPic->getLumaAddr(0) : 0); if (lowresMvCost < satdCost)
View file
x265_3.6.tar.gz/source/encoder/search.h -> x265_4.0.tar.gz/source/encoder/search.h
Changed
@@ -286,6 +286,16 @@ int32_t m_sliceMaxY; int32_t m_sliceMinY; + bool m_vertRestriction; + +#if ENABLE_SCC_EXT + int m_ibcEnabled; + int m_numBVs; + int m_numBV16s; + MV m_BVs64; + uint32_t m_lastCandCost; +#endif + #if DETAILED_CU_STATS /* Accumulate CU statistics separately for each frame encoder */ CUStats m_statsX265_MAX_FRAME_THREADS; @@ -309,7 +319,7 @@ void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom); // estimation inter prediction (non-skip) - void predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks2); + void predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks2, MV* iMVCandList = NULL); void searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp3, int numMvc, MV* mvc); // encode residual and compute rd-cost for inter mode void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom); @@ -329,6 +339,25 @@ MV getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref); +#if ENABLE_SCC_EXT + bool predIntraBCSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc); + void intraBlockCopyEstimate(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, MV* pred, MV& mv, uint32_t& cost, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc); + void setIntraSearchRange(Mode& intraBCMode, MV& pred, int puIdx, int roiWidth, int roiHeight, MV& searchRangeLT, MV& searchRangeRB); + void intraPatternSearch(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, uint32_t partAddr, pixel* refY, int refStride, MV* searchRangeLT, MV* searchRangeRB, + MV& mv, uint32_t& cost, int roiwidth, int roiheight, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc); + bool isValidIntraBCSearchArea(CUData* cu, int predX, int predY, int roiWidth, int roiHeight, int partOffset); + bool isBlockVectorValid(int xPos, int yPos, int width, int height, CUData* pcCU, + int xStartInCU, int yStartInCU, int xBv, int yBv, int ctuSize); + void intraBCSearchMVCandUpdate(uint32_t sad, int x, int y, uint32_t* sadBestCand, MV* cMVCand); + void updateBVMergeCandLists(int roiWidth, int roiHeight, MV* mvCand, IBC& ibc); + int intraBCSearchMVChromaRefine(Mode& intraBCMode, const CUGeom& cuGeom, int roiWidth, int roiHeight, int cuPelX, int cuPelY, uint32_t* sadBestCand, MV* cMVCand, + uint32_t partOffset, int puIdx); + static uint32_t mergeCandLists(MV* dst, uint32_t dn, MV* src, uint32_t sn, bool isSrcQuarPel); + uint32_t getSAD(pixel* ref, int refStride, const pixel* curr, int currStride, int width, int height); + bool predMixedIntraBCInterSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, MV* iMVCandList); + void restrictBipredMergeCand(CUData* cu, uint32_t puIdx, MVField(*mvFieldNeighbours)2, uint8_t* interDirNeighbours, uint32_t numValidMergeCand); +#endif + class PME : public BondedTaskGroup { public:
View file
x265_3.6.tar.gz/source/encoder/sei.cpp -> x265_4.0.tar.gz/source/encoder/sei.cpp
Changed
@@ -36,7 +36,7 @@
 /* marshal a single SEI message sei, storing the marshalled representation
  * in bitstream bs */
-void SEI::writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested)
+void SEI::writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested, int layer)
 {
     if (!isNested)
         bs.resetBits();

@@ -68,7 +68,7 @@
     {
         if (nalUnitType != NAL_UNIT_UNSPECIFIED)
             bs.writeByteAlignment();
-        list.serialize(nalUnitType, bs, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
+        list.serialize(nalUnitType, bs, layer, (1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N)));
     }
 }
View file
x265_3.6.tar.gz/source/encoder/sei.h -> x265_4.0.tar.gz/source/encoder/sei.h
Changed
@@ -38,7 +38,7 @@ public: /* SEI users call writeSEImessages() to marshal an SEI to a bitstream. * The writeSEImessages() method calls writeSEI() which encodes the header */ - void writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested); + void writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested, int layerId = 0); void setSize(uint32_t size); static char* base64Decode(char encodedString, int base64EncodeLength); virtual ~SEI() {} @@ -189,6 +189,228 @@ } }; +#if ENABLE_ALPHA +class SEIAlphaChannelInfo : public SEI +{ +public: + SEIAlphaChannelInfo() + { + m_payloadType = ALPHA_CHANNEL_INFO; + m_payloadSize = 0; + } + + bool alpha_channel_cancel_flag; + void writeSEI(const SPS&) + { + WRITE_CODE(alpha_channel_cancel_flag, 1, "alpha_channel_cancel_flag"); + if (!alpha_channel_cancel_flag) + { + WRITE_CODE(0, 3, "alpha_channel_use_idc"); + WRITE_CODE(0, 3, "alpha_channel_bit_depth_minus8"); + WRITE_CODE(0, 9, "alpha_transparent_value"); + WRITE_CODE(255, 9, "alpha_opaque_value"); + WRITE_CODE(0, 1, "alpha_channel_incr_flag"); + WRITE_CODE(0, 1, "alpha_channel_clip_flag"); + } + if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0) + { + WRITE_FLAG(1, "payload_bit_equal_to_one"); + while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0) + { + WRITE_FLAG(0, "payload_bit_equal_to_zero"); + } + } + } +}; +#endif + +#if ENABLE_MULTIVIEW +class SEIThreeDimensionalReferenceDisplaysInfo : public SEI +{ +public: + SEIThreeDimensionalReferenceDisplaysInfo() + { + m_payloadType = THREE_DIMENSIONAL_REFERENCE_DISPLAYS_INFO; + m_payloadSize = 0; + } + + int m_numRefDisplaysMinus1 = 0; + bool m_refViewingDistanceFlag = false; + bool m_additionalShiftPresentFlag = false; + void writeSEI(const SPS&) + { + WRITE_UVLC(31, "prec_ref_display_width"); + WRITE_FLAG(m_refViewingDistanceFlag, "ref_viewing_distance_flag"); + if (m_refViewingDistanceFlag) + { + WRITE_UVLC(0, "prec_ref_viewing_dist"); + } + WRITE_UVLC(0, "num_ref_displays_minus1"); + for (int i = 0; i <= m_numRefDisplaysMinus1; i++) + { + WRITE_UVLC(0, "left_view_id"); + WRITE_UVLC(1, "right_view_id"); + WRITE_CODE(0, 6, "exponent_ref_display_width"); + WRITE_CODE(0, 2, "mantissa_ref_display_width"); + if (m_refViewingDistanceFlag) + { + WRITE_CODE(0, 6, "exponent_ref_viewing_distance"); + WRITE_CODE(0, 1, "mantissa_ref_viewing_distance"); + } + WRITE_FLAG(m_additionalShiftPresentFlag, "additional_shift_present_flag"); + if (m_additionalShiftPresentFlag) + { + WRITE_CODE(0, 10, "num_sample_shift_plus512"); + } + } + WRITE_FLAG(0, "three_dimensional_reference_displays_extension_flag"); + + if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0) + { + WRITE_FLAG(1, "payload_bit_equal_to_one"); + while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0) + { + WRITE_FLAG(0, "payload_bit_equal_to_zero"); + } + } + } + +}; + +class SEIMultiviewSceneInfo : public SEI +{ +public: + SEIMultiviewSceneInfo() + { + m_payloadType = MULTIVIEW_SCENE_INFO; + m_payloadSize = 0; + } + void writeSEI(const SPS&) + { + WRITE_SVLC(-333, "min_disparity"); + WRITE_UVLC(2047, "max_disparity_range"); + + if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0) + { + WRITE_FLAG(1, "payload_bit_equal_to_one"); + while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0) + { + WRITE_FLAG(0, "payload_bit_equal_to_zero"); + } + } + } +}; + +class SEIMultiviewAcquisitionInfo : public SEI +{ +public: + SEIMultiviewAcquisitionInfo() + { + m_payloadType = MULTIVIEW_ACQUISITION_INFO; + 
m_payloadSize = 0; + } + + int sign_r33 = { {0,1,0},{1,0,0},{0,1,1} }; + int exponent_r33 = { {10,20,11},{10,5,11},{2,20,11} }; + int mantissa_r33 = { {4,9,1},{0,3,4},{3,3,7} }; + int sign_t13 = { 0,1,0 }; + int exponent_t13 = { 0,10,5 }; + int mantissa_t13 = { 1,8,9 }; + int lenght_mantissa_r33 = { {10,20,11},{10,5,11},{2,20,11} }; + int length_mantissa_t13 = { 1,10,5 }; + bool m_intrinsicParamFlag = true; + bool m_extrinsicParamFlag = true; + bool m_intrinsicParamsEqualFlag = true; + void writeSEI(const SPS& sps) + { + WRITE_FLAG(m_intrinsicParamFlag, "intrinsic_param_flag"); + WRITE_FLAG(m_extrinsicParamFlag, "extrinsic_param_flag"); + if (m_intrinsicParamFlag) + { + WRITE_FLAG(m_intrinsicParamsEqualFlag, "intrinsic_params_equal_flag"); + WRITE_UVLC(31, "prec_focal_length"); + WRITE_UVLC(31, "prec_principal_point"); + WRITE_UVLC(31, "prec_skew_factor"); + + for (int i = 0; i <= (m_intrinsicParamsEqualFlag ? 0 : sps.maxViews - 1); i++) + { + WRITE_FLAG(0, "sign_focal_length_x"); + WRITE_CODE(0, 6, "exponent_focal_length_x"); + WRITE_CODE(0, 1, "mantissa_focal_length_x"); + WRITE_FLAG(0, "sign_focal_length_y"); + WRITE_CODE(0, 6, "exponent_focal_length_y"); + WRITE_CODE(0, 1, "mantissa_focal_length_y"); + WRITE_FLAG(0, "sign_principal_point_x"); + WRITE_CODE(0, 6, "exponent_principal_point_x"); + WRITE_CODE(0, 1, "mantissa_principal_point_x"); + WRITE_FLAG(0, "sign_principal_point_y"); + WRITE_CODE(0, 6, "exponent_principal_point_y"); + WRITE_CODE(0, 1, "mantissa_principal_point_y"); + WRITE_FLAG(0, "sign_skew_factor"); + WRITE_CODE(0, 6, "exponent_skew_factor"); + WRITE_CODE(0, 1, "mantissa_skew_factor"); + } + } + + if (m_extrinsicParamFlag) + { + WRITE_UVLC(31, "prec_rotation_param"); + WRITE_UVLC(31, "prec_translation_param"); + for (int i = 0; i <= 0; i++) + { + for (int j = 0; j <= 2; j++) /* row */ + { + for (int k = 0; k <= 2; k++) /* column */ + { + WRITE_FLAG(sign_rjk, "sign_r"); + WRITE_CODE(exponent_rjk, 6, "exponent_r"); + WRITE_CODE(mantissa_rjk, lenght_mantissa_rjk, "mantissa_r"); + } + WRITE_FLAG(sign_tij, "sign_t"); + WRITE_CODE(exponent_tij, 6, "exponent_t"); + WRITE_CODE(mantissa_tij, length_mantissa_tij, "mantissa_t"); + } + } + } + if (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0) + { + WRITE_FLAG(1, "payload_bit_equal_to_one"); + while (m_bitIf->getNumberOfWrittenBits() % X265_BYTE != 0)
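Each of the new SEI writers above finishes with the same byte-alignment tail: if the payload is not already byte-aligned, emit one payload_bit_equal_to_one bit and pad with zero bits to the next byte boundary. A hedged sketch of that idiom factored into a helper; the helper name and the plain write() calls are illustrative, the classes above inline the pattern through the WRITE_FLAG macros:

    // Illustrative alignment tail; 'bs' stands in for the SEI bit interface.
    void writeSeiAlignment(Bitstream& bs)
    {
        if (bs.getNumberOfWrittenBits() % 8 != 0)
        {
            bs.write(1, 1);                              // payload_bit_equal_to_one
            while (bs.getNumberOfWrittenBits() % 8 != 0)
                bs.write(0, 1);                          // payload_bit_equal_to_zero
        }
    }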
View file
x265_3.6.tar.gz/source/encoder/slicetype.cpp -> x265_4.0.tar.gz/source/encoder/slicetype.cpp
Changed
@@ -1324,7 +1324,7 @@ int l0poc = slice->m_rps.numberOfNegativePictures ? slice->m_refPOCList00 : -1; int l1poc = slice->m_refPOCList10; - switch (slice->m_sliceType) + switch (slice->m_origSliceType) { case I_SLICE: framesp0 = &curFrame->m_lowres; @@ -4160,9 +4160,9 @@ /* ME will never return a cost larger than the cost @MVP, so we do not * have to check that ME cost is more than the estimated merge cost */ if(!hme) - fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices); + fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, 0); else - fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane0); + fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, 0, fref->lowerResPlane0); if (skipCost < 64 && skipCost < fencCost && bBidir) { fencCost = skipCost;
View file
x265_3.6.tar.gz/source/encoder/weightPrediction.cpp -> x265_4.0.tar.gz/source/encoder/weightPrediction.cpp
Changed
@@ -491,8 +491,14 @@
             lumaDenom = weights[0].log2WeightDenom;
             chromaDenom = weights[1].log2WeightDenom;

+            int numIdx = slice.m_numRefIdx[list];
+#if ENABLE_SCC_EXT
+            if (!list && param.bEnableSCC)
+                numIdx--;
+#endif
+
             /* reset weight states */
-            for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
+            for (int ref = 1; ref < numIdx; ref++)
             {
                 SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
                 SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
View file
x265_3.6.tar.gz/source/input/input.cpp -> x265_4.0.tar.gz/source/input/input.cpp
Changed
@@ -27,12 +27,12 @@

 using namespace X265_NS;

-InputFile* InputFile::open(InputFileInfo& info, bool bForceY4m)
+InputFile* InputFile::open(InputFileInfo& info, bool bForceY4m, bool alpha, int format)
 {
     const char * s = strrchr(info.filename, '.');

     if (bForceY4m || (s && !strcmp(s, ".y4m")))
-        return new Y4MInput(info);
+        return new Y4MInput(info, alpha, format);
     else
-        return new YUVInput(info);
+        return new YUVInput(info, alpha, format);
 }
View file
x265_3.6.tar.gz/source/input/input.h -> x265_4.0.tar.gz/source/input/input.h
Changed
@@ -66,7 +66,7 @@

     InputFile() {}

-    static InputFile* open(InputFileInfo& info, bool bForceY4m);
+    static InputFile* open(InputFileInfo& info, bool bForceY4m, bool alpha, int format);

     virtual void startReader() = 0;
View file
x265_3.6.tar.gz/source/input/y4m.cpp -> x265_4.0.tar.gz/source/input/y4m.cpp
Changed
@@ -40,13 +40,14 @@ using namespace X265_NS; using namespace std; static const char header = {'F','R','A','M','E'}; -Y4MInput::Y4MInput(InputFileInfo& info) +Y4MInput::Y4MInput(InputFileInfo& info, bool alpha, int format) { for (int i = 0; i < QUEUE_SIZE; i++) bufi = NULL; threadActive = false; colorSpace = info.csp; + alphaAvailable = alpha; sarWidth = info.sarWidth; sarHeight = info.sarHeight; width = info.width; @@ -68,11 +69,13 @@ ifs = x265_fopen(info.filename, "rb"); if (ifs && !ferror(ifs) && parseHeader()) { + if (format == 1) width /= 2; + if (format == 2) height /= 2; int pixelbytes = depth > 8 ? 2 : 1; - for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++) + for (int i = 0; i < x265_cli_cspscolorSpace.planes + alphaAvailable; i++) { - int stride = (width >> x265_cli_cspscolorSpace.widthi) * pixelbytes; - framesize += (stride * (height >> x265_cli_cspscolorSpace.heighti)); + int stride = ((width * (format == 1 ? 2 : 1)) >> x265_cli_cspscolorSpace.widthi) * pixelbytes; + framesize += (stride * ((height * (format == 2 ? 2 : 1)) >> x265_cli_cspscolorSpace.heighti)); } threadActive = true; @@ -390,12 +393,19 @@ pic.height = height; pic.width = width; pic.colorSpace = colorSpace; - pic.stride0 = width * pixelbytes; + pic.stride0 = width * pixelbytes * (pic.format == 1 ? 2 : 1); pic.stride1 = pic.stride0 >> x265_cli_cspscolorSpace.width1; pic.stride2 = pic.stride0 >> x265_cli_cspscolorSpace.width2; pic.planes0 = bufread % QUEUE_SIZE; - pic.planes1 = (char*)pic.planes0 + pic.stride0 * height; - pic.planes2 = (char*)pic.planes1 + pic.stride1 * (height >> x265_cli_cspscolorSpace.height1); + pic.planes1 = (char*)pic.planes0 + pic.stride0 * (height * (pic.format == 2 ? 2 : 1)); + pic.planes2 = (char*)pic.planes1 + pic.stride1 * ((height * (pic.format == 2 ? 2 : 1)) >> x265_cli_cspscolorSpace.height1); +#if ENABLE_ALPHA + if (alphaAvailable) + { + pic.stride3 = pic.stride0 >> x265_cli_cspscolorSpace.width3; + pic.planes3 = (char*)pic.planes2 + pic.stride2 * (height >> x265_cli_cspscolorSpace.height2); + } +#endif readCount.incr(); return true; }
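The frame size computed here now covers an optional alpha plane and understands the packed two-view inputs: format 1 (side-by-side) halves the per-view width and format 2 (over-under) halves the per-view height, while the strides keep spanning the full stored picture. A worked example, assuming the alpha plane is carried at full luma resolution as the extended csp table implies, for an 8-bit 1920x1080 4:2:0 frame with alpha:

    size_t luma   = 1920 * 1080;                // Y plane: 2,073,600 bytes
    size_t chroma = 2 * (960 * 540);            // Cb + Cr at 4:2:0: 1,036,800 bytes
    size_t alpha  = 1920 * 1080;                // assumed full-resolution alpha plane
    size_t framesize = luma + chroma + alpha;   // 5,184,000 bytes per frame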
View file
x265_3.6.tar.gz/source/input/y4m.h -> x265_4.0.tar.gz/source/input/y4m.h
Changed
@@ -55,6 +55,8 @@
     int colorSpace;

+    bool alphaAvailable;
+
     bool threadActive;

     ThreadSafeInteger readCount;

@@ -69,7 +71,7 @@
 public:

-    Y4MInput(InputFileInfo& info);
+    Y4MInput(InputFileInfo& info, bool alpha, int format);

     virtual ~Y4MInput();
     void release();
View file
x265_3.6.tar.gz/source/input/yuv.cpp -> x265_4.0.tar.gz/source/input/yuv.cpp
Changed
@@ -40,7 +40,7 @@ using namespace X265_NS; using namespace std; -YUVInput::YUVInput(InputFileInfo& info) +YUVInput::YUVInput(InputFileInfo& info, bool alpha, int format) { for (int i = 0; i < QUEUE_SIZE; i++) bufi = NULL; @@ -49,15 +49,16 @@ width = info.width; height = info.height; colorSpace = info.csp; + alphaAvailable = alpha; threadActive = false; ifs = NULL; uint32_t pixelbytes = depth > 8 ? 2 : 1; framesize = 0; - for (int i = 0; i < x265_cli_cspscolorSpace.planes; i++) + for (int i = 0; i < x265_cli_cspscolorSpace.planes + alphaAvailable; i++) { - uint32_t w = width >> x265_cli_cspscolorSpace.widthi; - uint32_t h = height >> x265_cli_cspscolorSpace.heighti; + int32_t w = (width * (format == 1 ? 2 : 1)) >> x265_cli_cspscolorSpace.widthi; + uint32_t h = (height * (format == 2 ? 2 : 1)) >> x265_cli_cspscolorSpace.heighti; framesize += w * h * pixelbytes; } @@ -205,12 +206,19 @@ pic.framesize = framesize; pic.height = height; pic.width = width; - pic.stride0 = width * pixelbytes; + pic.stride0 = width * pixelbytes * (pic.format == 1 ? 2 : 1); pic.stride1 = pic.stride0 >> x265_cli_cspscolorSpace.width1; pic.stride2 = pic.stride0 >> x265_cli_cspscolorSpace.width2; pic.planes0 = bufread % QUEUE_SIZE; - pic.planes1 = (char*)pic.planes0 + pic.stride0 * height; - pic.planes2 = (char*)pic.planes1 + pic.stride1 * (height >> x265_cli_cspscolorSpace.height1); + pic.planes1 = (char*)pic.planes0 + pic.stride0 * (height * (pic.format == 2 ? 2 : 1)); + pic.planes2 = (char*)pic.planes1 + pic.stride1 * ((height * (pic.format == 2 ? 2 : 1)) >> x265_cli_cspscolorSpace.height1); +#if ENABLE_ALPHA + if (alphaAvailable) + { + pic.stride3 = pic.stride0 >> x265_cli_cspscolorSpace.width3; + pic.planes3 = (char*)pic.planes2 + pic.stride2 * (height >> x265_cli_cspscolorSpace.height2); + } +#endif readCount.incr(); return true; }
View file
x265_3.6.tar.gz/source/input/yuv.h -> x265_4.0.tar.gz/source/input/yuv.h
Changed
@@ -47,6 +47,8 @@
     uint32_t framesize;

+    bool alphaAvailable;
+
     bool threadActive;

     ThreadSafeInteger readCount;

@@ -61,7 +63,7 @@
 public:

-    YUVInput(InputFileInfo& info);
+    YUVInput(InputFileInfo& info, bool alpha, int format);

     virtual ~YUVInput();
     void release();
View file
x265_3.6.tar.gz/source/test/ipfilterharness.cpp -> x265_4.0.tar.gz/source/test/ipfilterharness.cpp
Changed
@@ -67,7 +67,7 @@ { int index = i % TEST_CASES; - for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++) + for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++) { rand_srcStride = rand() % 100 + 2; rand_dstStride = rand() % 100 + 64; @@ -102,7 +102,7 @@ { int index = i % TEST_CASES; - for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++) + for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++) { rand_srcStride = rand() % 100; rand_dstStride = rand() % 100 + 64; @@ -144,7 +144,7 @@ { int index = i % TEST_CASES; - for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++) + for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++) { // 0 : Interpolate W x H, 1 : Interpolate W x (H + 7) for (int isRowExt = 0; isRowExt < 2; isRowExt++) @@ -185,7 +185,7 @@ { int index = i % TEST_CASES; - for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++) + for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++) { rand_srcStride = rand() % 100; rand_dstStride = rand() % 100 + 64; @@ -220,7 +220,7 @@ { int index = i % TEST_CASES; - for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++) + for (int coeffIdx = 1; coeffIdx < 8; coeffIdx++) { rand_srcStride = rand() % 100; rand_dstStride = rand() % 100 + 64; @@ -255,7 +255,7 @@ { int index = i % TEST_CASES; - for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++) + for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++) { rand_srcStride = rand() % 100; rand_dstStride = rand() % 100 + 64; @@ -290,7 +290,7 @@ { int index = i % TEST_CASES; - for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++) + for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++) { rand_srcStride = rand() % 100; rand_dstStride = rand() % 100 + 64; @@ -325,7 +325,7 @@ { int index = i % TEST_CASES; - for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++) + for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++) { // 0 : Interpolate W x H, 1 : Interpolate W x (H + 7) for (int isRowExt = 0; isRowExt < 2; isRowExt++) @@ -366,7 +366,7 @@ { int index = i % TEST_CASES; - for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++) + for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++) { rand_srcStride = rand() % 100; rand_dstStride = rand() % 100 + 64; @@ -401,7 +401,7 @@ { int index = i % TEST_CASES; - for (int coeffIdx = 0; coeffIdx < 4; coeffIdx++) + for (int coeffIdx = 1; coeffIdx < 4; coeffIdx++) { rand_srcStride = rand() % 100; rand_dstStride = rand() % 100 + 64; @@ -436,9 +436,9 @@ { int index = i % TEST_CASES; - for (int coeffIdxX = 0; coeffIdxX < 4; coeffIdxX++) + for (int coeffIdxX = 1; coeffIdxX < 4; coeffIdxX++) { - for (int coeffIdxY = 0; coeffIdxY < 4; coeffIdxY++) + for (int coeffIdxY = 1; coeffIdxY < 4; coeffIdxY++) { rand_srcStride = rand() % 100; rand_dstStride = rand() % 100 + 64;
View file
x265_3.6.tar.gz/source/test/mbdstharness.cpp -> x265_4.0.tar.gz/source/test/mbdstharness.cpp
Changed
@@ -260,8 +260,14 @@
     uint32_t optReturnValue = 0;
     uint32_t refReturnValue = 0;

-    int bits = rand() % 32;
-    int valueToAdd = rand() % (1 << bits);
+    int log2TrSize = rand() % 4 + 2;
+    const int qp = rand() % (QP_MAX_SPEC + QP_BD_OFFSET + 1);
+    const int per = qp / 6;
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
+
+    /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
+    int bits = QUANT_SHIFT + per + transformShift;
+    int valueToAdd = (1 << (bits - 1));
     int cmp_size = sizeof(short) * height * width;
     int numCoeff = height * width;
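Instead of a random shift, the harness now derives the right-shift exactly as the non-RDOQ quantizer does, with the rounding offset at half that range. With the usual HEVC constants (QUANT_SHIFT = 14, MAX_TR_DYNAMIC_RANGE = 15, quoted here as assumptions), an 8-bit encode at qp 22 on an 8x8 TU works out to:

    // qp = 22            -> per = 22 / 6 = 3
    // log2TrSize = 3     -> transformShift = 15 - 8 - 3 = 4
    // bits = 14 + 3 + 4  = 21
    // valueToAdd = 1 << (21 - 1) = 1,048,576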
View file
x265_3.6.tar.gz/source/test/pixelharness.cpp -> x265_4.0.tar.gz/source/test/pixelharness.cpp
Changed
@@ -1373,8 +1373,7 @@ ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref); checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec); - if ( memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref)) - || memcmp(stats_ref, stats_vec, sizeof(stats_ref)) + if ( memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref))) return false; @@ -1425,10 +1424,7 @@ ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref); checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec); - // TODO: don't check upBuff*, the latest output pixels different, and can move into stack temporary buffer in future - if ( memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref)) - || memcmp(_upBufft_ref, _upBufft_vec, sizeof(_upBufft_ref)) - || memcmp(stats_ref, stats_vec, sizeof(stats_ref)) + if ( memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref))) return false; @@ -1476,8 +1472,7 @@ ref(sbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref); checked(opt, sbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec); - if ( memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref)) - || memcmp(stats_ref, stats_vec, sizeof(stats_ref)) + if ( memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref))) return false;
View file
x265_3.6.tar.gz/source/test/testbench.cpp -> x265_4.0.tar.gz/source/test/testbench.cpp
Changed
@@ -159,10 +159,11 @@ struct test_arch_t { - char name12; + char name13; int flag; } test_arch = { +#if X265_ARCH_X86 { "SSE2", X265_CPU_SSE2 }, { "SSE3", X265_CPU_SSE3 }, { "SSSE3", X265_CPU_SSSE3 }, @@ -172,11 +173,15 @@ { "AVX2", X265_CPU_AVX2 }, { "BMI2", X265_CPU_AVX2 | X265_CPU_BMI1 | X265_CPU_BMI2 }, { "AVX512", X265_CPU_AVX512 }, +#else { "ARMv6", X265_CPU_ARMV6 }, { "NEON", X265_CPU_NEON }, { "SVE2", X265_CPU_SVE2 }, { "SVE", X265_CPU_SVE }, + { "Neon_DotProd", X265_CPU_NEON_DOTPROD }, + { "Neon_I8MM", X265_CPU_NEON_I8MM }, { "FastNeonMRC", X265_CPU_FAST_NEON_MRC }, +#endif { "", 0 }, }; @@ -190,10 +195,10 @@ else continue; -#if X265_ARCH_X86 +#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64) EncoderPrimitives vecprim; memset(&vecprim, 0, sizeof(vecprim)); - setupInstrinsicPrimitives(vecprim, test_archi.flag); + setupIntrinsicPrimitives(vecprim, test_archi.flag); setupAliasPrimitives(vecprim); for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++) { @@ -231,8 +236,8 @@ EncoderPrimitives optprim; memset(&optprim, 0, sizeof(optprim)); -#if X265_ARCH_X86 - setupInstrinsicPrimitives(optprim, cpuid); +#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64) + setupIntrinsicPrimitives(optprim, cpuid); #endif setupAssemblyPrimitives(optprim, cpuid);
View file
x265_3.6.tar.gz/source/test/testharness.h -> x265_4.0.tar.gz/source/test/testharness.h
Changed
@@ -88,6 +88,7 @@
     // TO-DO: replace clock() function with appropriate ARM cpu instructions
     a = clock();
 #elif X265_ARCH_ARM64
+    asm volatile("isb" : : : "memory");
     asm volatile("mrs %0, cntvct_el0" : "=r"(a));
 #endif
     return a;
View file
x265_3.6.tar.gz/source/x265.h -> x265_4.0.tar.gz/source/x265.h
Changed
@@ -371,6 +371,11 @@ MASTERING_DISPLAY_INFO = 137, CONTENT_LIGHT_LEVEL_INFO = 144, ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147, + ALPHA_CHANNEL_INFO = 165, + THREE_DIMENSIONAL_REFERENCE_DISPLAYS_INFO = 176, + MULTIVIEW_SCENE_INFO = 178, + MULTIVIEW_ACQUISITION_INFO = 179, + MULTIVIEW_VIEW_POSITION = 180 } SEIPayloadType; typedef struct x265_sei_payload @@ -410,10 +415,10 @@ /* Must be specified on input pictures, the number of planes is determined * by the colorSpace value */ - void* planes3; + void* planes4; /* Stride is the number of bytes between row starts */ - int stride3; + int stride4; /* Must be specified on input pictures. x265_picture_init() will set it to * the encoder's internal bit depth, but this field must describe the depth @@ -487,6 +492,9 @@ uint32_t picStruct; int width; + + int layerID; + int format; } x265_picture; typedef enum @@ -536,11 +544,13 @@ #define X265_CPU_SLOW_PALIGNR (1 << 25) /* such as on the AMD Bobcat */ /* ARM */ -#define X265_CPU_ARMV6 0x0000001 -#define X265_CPU_NEON 0x0000002 /* ARM NEON */ -#define X265_CPU_SVE2 0x0000008 /* ARM SVE2 */ -#define X265_CPU_SVE 0x0000010 /* ARM SVE2 */ -#define X265_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ +#define X265_CPU_ARMV6 (1 << 0) +#define X265_CPU_NEON (1 << 1) /* ARM NEON */ +#define X265_CPU_FAST_NEON_MRC (1 << 2) /* Transfer from NEON to ARM register is fast (Cortex-A9) */ +#define X265_CPU_SVE2 (1 << 3) /* AArch64 SVE2 */ +#define X265_CPU_SVE (1 << 4) /* AArch64 SVE2 */ +#define X265_CPU_NEON_DOTPROD (1 << 5) /* AArch64 Neon DotProd */ +#define X265_CPU_NEON_I8MM (1 << 6) /* AArch64 Neon I8MM */ /* IBM Power8 */ #define X265_CPU_ALTIVEC 0x0000001 @@ -623,13 +633,49 @@ #define X265_MAX_GOP_LENGTH 16 #define MAX_T_LAYERS 7 +#if ENABLE_MULTIVIEW +#define MAX_VIEWS 2 +#define MAX_VPS_NUM_SCALABILITY_TYPES 16 +#define MAX_VPS_LAYER_ID_PLUS1 MAX_VIEWS +#define MULTIVIEW_SCALABILITY_IDX 1 +#else +#define MAX_VIEWS 1 +#endif + +#if ENABLE_ALPHA +#define MAX_SCALABLE_LAYERS 2 +#define MAX_VPS_NUM_SCALABILITY_TYPES 16 +#define MAX_VPS_LAYER_ID_PLUS1 MAX_SCALABLE_LAYERS +#else +#define MAX_SCALABLE_LAYERS 1 +#endif + +#if ENABLE_ALPHA || ENABLE_MULTIVIEW +#define MAX_LAYERS 2 +#else +#define MAX_LAYERS 1 +#endif + +#if ENABLE_SCC_EXT +/* SCC Extension Options */ +#define SCC_EXT_IDX 3 +#define NUM_EXTENSION_FLAGS 8 +#define SCM_S0067_NUM_CANDIDATES 64 +#define CHROMA_REFINEMENT_CANDIDATES 8 +#define SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU 2 ///< Do full horizontal/vertical search for Nx2N +#define SCM_S0067_MAX_CAND_SIZE 32 ///< 32 or 64, 16 by default +#define NUM_RECON_VERSION 2 +#else +#define NUM_RECON_VERSION 1 +#endif + #define X265_IPRATIO_STRENGTH 1.43 typedef struct x265_cli_csp { int planes; - int width3; - int height3; + int width4; + int height4; } x265_cli_csp; static const x265_cli_csp x265_cli_csps = @@ -754,10 +800,9 @@ char *pool; int thread; int subsample; - int enable_conf_interval; }x265_vmaf_commondata; -static const x265_vmaf_commondata vcd = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1, 0 } }; +static x265_vmaf_commondata vcd = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.json", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL, 0, 1} }; typedef struct x265_temporal_layer { int poc_offset; /* POC offset */ @@ -2268,6 +2313,20 @@ /*SBRC*/ int bEnableSBRC; + int mcstfFrameRange; + + /*Alpha channel encoding*/ + int bEnableAlpha; + int numScalableLayers; + + /*Multi View Encoding*/ + int numViews; 
+ int format; + + int numLayers; + + /*Screen Content Coding*/ + int bEnableSCC; } x265_param; /* x265_param_alloc: @@ -2320,6 +2379,10 @@ "main444-12", "main444-12-intra", "main444-16-intra", "main444-16-stillpicture", /* Not Supported! */ + +#if ENABLE_SCC_EXT + "main-scc", "main10-scc", "main444-scc", "main444-10-scc", /* Screen content coding */ +#endif 0 }; @@ -2430,7 +2493,7 @@ * the payloads of all output NALs are guaranteed to be sequential in memory. * To flush the encoder and retrieve delayed output pictures, pass pic_in as NULL. * Once flushing has begun, all subsequent calls must pass pic_in as NULL. */ -int x265_encoder_encode(x265_encoder *encoder, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out); +int x265_encoder_encode(x265_encoder *encoder, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture **pic_out); /* x265_encoder_reconfig: * various parameters from x265_param are copied. @@ -2537,7 +2600,7 @@ /* x265_calculate_vmaf_framelevelscore: * returns VMAF score for each frame in a given input video. */ -double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*); +double x265_calculate_vmaf_framelevelscore(x265_param*, x265_vmaf_framedata*); /* x265_vmaf_encoder_log: * write a line to the configured CSV file. If a CSV filename was not * configured, or file open failed, this function will perform no write. @@ -2584,7 +2647,7 @@ int (*encoder_reconfig)(x265_encoder*, x265_param*); int (*encoder_reconfig_zone)(x265_encoder*, x265_zone*); int (*encoder_headers)(x265_encoder*, x265_nal**, uint32_t*); - int (*encoder_encode)(x265_encoder*, x265_nal**, uint32_t*, x265_picture*, x265_picture*); + int (*encoder_encode)(x265_encoder*, x265_nal**, uint32_t*, x265_picture*, x265_picture**); void (*encoder_get_stats)(x265_encoder*, x265_stats*, uint32_t); void (*encoder_log)(x265_encoder*, int, char**); void (*encoder_close)(x265_encoder*); @@ -2602,7 +2665,7 @@ int (*set_analysis_data)(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes); #if ENABLE_LIBVMAF double (*calculate_vmafscore)(x265_param *, x265_vmaf_data *); - double (*calculate_vmaf_framelevelscore)(x265_vmaf_framedata *); + double (*calculate_vmaf_framelevelscore)(x265_param *, x265_vmaf_framedata *); void (*vmaf_encoder_log)(x265_encoder*, int, char**, x265_param *, x265_vmaf_data *); #endif int (*zone_param_parse)(x265_param*, const char*, const char*);
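The most visible public API change is that x265_encoder_encode() now takes x265_picture** for the output picture, so multi-layer encodes can return one output picture per layer. A hedged usage sketch against the new prototype; error handling is omitted, and numLayers and MAX_LAYERS come from the additions above:

    // Sketch only: 'encoder', 'param' and 'pic_in' are assumed to be set up already.
    x265_picture recon[MAX_LAYERS];
    x265_picture* pic_out[MAX_LAYERS];
    for (int i = 0; i < param->numLayers; i++)
    {
        x265_picture_init(param, &recon[i]);
        pic_out[i] = &recon[i];
    }
    x265_nal* nals = NULL;
    uint32_t numNal = 0;
    int ret = x265_encoder_encode(encoder, &nals, &numNal, pic_in, pic_out);  // > 0 when NALs are returned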
View file
x265_3.6.tar.gz/source/x265cli.cpp -> x265_4.0.tar.gz/source/x265cli.cpp
Changed
@@ -374,6 +374,17 @@
     H0(" --no-frame-dup Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
     H0(" --dup-threshold <integer> PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
     H0(" --no-mcstf Enable GOP based temporal filter. Default %d\n", param->bEnableTemporalFilter);
+#if ENABLE_ALPHA
+    H0(" --alpha Enable alpha channel support. Default %d\n", param->bEnableAlpha);
+#endif
+#if ENABLE_MULTIVIEW
+    H0(" --num-views Number of Views for Multiview Encoding. Default %d\n", param->numViews);
+    H0(" --format Format of the input video 0 : normal, 1 : side-by-side, 2 : over-under Default %d\n", param->format);
+    H0(" --multiview-config Configuration file for Multiview Encoding\n");
+#endif
+#if ENABLE_SCC_EXT
+    H0(" --scc <integer> Enable screen content coding. 0: Diabled, 1:Intrablockcopy fast search with 1x2 CTUs search range, 2: Intrablockcopy Full search. Default %d\n", param->bEnableSCC);
+#endif
 #ifdef SVT_HEVC
     H0(" --nosvt Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
     H0(" --no-svt-hme Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
@@ -416,12 +427,18 @@
     free(argString);
 }

-    if (input)
-        input->release();
-    input = NULL;
-    if (recon)
-        recon->release();
-    recon = NULL;
+    for (int i = 0; i < MAX_VIEWS; i++)
+    {
+        if (input[i])
+            input[i]->release();
+        input[i] = NULL;
+    }
+    for (int i = 0; i < MAX_LAYERS; i++)
+    {
+        if (recon[i])
+            recon[i]->release();
+        recon[i] = NULL;
+    }
     if (qpfile)
         fclose(qpfile);
     qpfile = NULL;
@@ -577,8 +594,12 @@
     int inputBitDepth = 8;
     int outputBitDepth = 0;
     int reconFileBitDepth = 0;
-    const char *inputfn = NULL;
-    const char *reconfn = NULL;
+    char* inputfn[MAX_VIEWS] = { NULL };
+    for (int view = 0; view < MAX_VIEWS; view++)
+    {
+        inputfn[view] = X265_MALLOC(char, sizeof(char) * 1024);
+    }
+    const char* reconfn[MAX_LAYERS] = { NULL };
     const char *outputfn = NULL;
     const char *preset = NULL;
     const char *tune = NULL;
@@ -717,8 +738,8 @@
     OPT("frames") this->framesToBeEncoded = (uint32_t)x265_atoi(optarg, bError);
     OPT("no-progress") this->bProgress = false;
     OPT("output") outputfn = optarg;
-    OPT("input") inputfn = optarg;
-    OPT("recon") reconfn = optarg;
+    OPT("input") strcpy(inputfn[0], optarg);
+    OPT("recon") reconfn[0] = optarg;
     OPT("input-depth") inputBitDepth = (uint32_t)x265_atoi(optarg, bError);
     OPT("dither") this->bDither = true;
     OPT("recon-depth") reconFileBitDepth = (uint32_t)x265_atoi(optarg, bError);
@@ -750,6 +771,14 @@
     if (!this->scenecutAwareQpConfig)
         x265_log_file(param, X265_LOG_ERROR, "%s scenecut aware qp config file not found or error in opening config file\n", optarg);
 }
+#if ENABLE_MULTIVIEW
+    OPT("multiview-config")
+    {
+        this->multiViewConfig = x265_fopen(optarg, "rb");
+        if (!this->multiViewConfig)
+            x265_log_file(param, X265_LOG_ERROR, "%s Multiview config file not found or error in opening config file\n", optarg);
+    }
+#endif
     OPT("zonefile")
     {
         this->zoneFile = x265_fopen(optarg, "rb");
@@ -776,8 +805,10 @@
         }
     }

-    if (optind < argc && !inputfn)
-        inputfn = argv[optind++];
+#if !ENABLE_MULTIVIEW
+    if (optind < argc && !inputfn[0])
+        inputfn[0] = argv[optind++];
+#endif
     if (optind < argc && !outputfn)
         outputfn = argv[optind++];
     if (optind < argc)
@@ -793,9 +824,29 @@
         showHelp(param);
     }

-    if (!inputfn || !outputfn)
+#if ENABLE_MULTIVIEW
+    if (this->multiViewConfig)
+    {
+        if (!this->parseMultiViewConfig(inputfn))
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Unable to parse multiview config file \n");
+            fclose(this->multiViewConfig);
+            this->multiViewConfig = NULL;
+        }
+    }
+#endif
+    param->numLayers = param->numViews > 1 ? param->numViews : (param->numScalableLayers > 1) ? param->numScalableLayers : 1;
+    if (!outputfn)
     {
         x265_log(param, X265_LOG_ERROR, "input or output file not specified, try --help for help\n");
+        for (int view = 0; view < param->numViews; view++)
+        {
+            if (!inputfn[view])
+            {
+                x265_log(param, X265_LOG_ERROR, "input or output file not specified, try --help for help\n");
+                return true;
+            }
+        }
         return true;
     }
@@ -816,51 +867,53 @@
         svtParam->encoderBitDepth = inputBitDepth;
     }
 #endif
-
-    InputFileInfo info;
-    info.filename = inputfn;
-    info.depth = inputBitDepth;
-    info.csp = param->internalCsp;
-    info.width = param->sourceWidth;
-    info.height = param->sourceHeight;
-    info.fpsNum = param->fpsNum;
-    info.fpsDenom = param->fpsDenom;
-    info.sarWidth = param->vui.sarWidth;
-    info.sarHeight = param->vui.sarHeight;
-    info.skipFrames = seek;
-    info.frameCount = 0;
-    getParamAspectRatio(param, info.sarWidth, info.sarHeight);
-
-
-    this->input = InputFile::open(info, this->bForceY4m);
-    if (!this->input || this->input->isFail())
+    InputFileInfo info[MAX_VIEWS];
+    for (int i = 0; i < param->numViews - !!param->format; i++)
     {
-        x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn);
-        return true;
-    }
+        info[i].filename = inputfn[i];
+        info[i].depth = inputBitDepth;
+        info[i].csp = param->internalCsp;
+        info[i].width = param->sourceWidth;
+        info[i].height = param->sourceHeight;
+        info[i].fpsNum = param->fpsNum;
+        info[i].fpsDenom = param->fpsDenom;
+        info[i].sarWidth = param->vui.sarWidth;
+        info[i].sarHeight = param->vui.sarHeight;
+        info[i].skipFrames = seek;
+        info[i].frameCount = 0;
+        getParamAspectRatio(param, info[i].sarWidth, info[i].sarHeight);
+
+        this->input[i] = InputFile::open(info[i], this->bForceY4m, param->numScalableLayers > 1, param->format);
+        if (!this->input[i] || this->input[i]->isFail())
+        {
+            x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn[i]);
+            return true;
+        }

-    if (info.depth < 8 || info.depth > 16)
-    {
-        x265_log(param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n", inputBitDepth);
-        return true;
+        if (info[i].depth < 8 || info[i].depth > 16)
+        {
+            x265_log(param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n", inputBitDepth);
+            return true;
+        }
     }
+    //TODO:Validate info params of both the views to equal values

     /* Unconditionally accept height/width/csp/bitDepth from file info */
-    param->sourceWidth = info.width;
-    param->sourceHeight = info.height;
-    param->internalCsp = info.csp;
-    param->sourceBitDepth = info.depth;
+    param->sourceWidth = info[0].width;
+    param->sourceHeight = info[0].height;
+    param->internalCsp = info[0].csp;
+    param->sourceBitDepth = info[0].depth;

     /* Accept fps and sar from file info if not specified by user */
     if (param->fpsDenom == 0 || param->fpsNum == 0)
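Together with the long options registered in x265cli.h below, the help text added above implies command lines of roughly the following shape. These are illustrative sketches only: each option exists only in a build configured with the matching feature macro (ENABLE_ALPHA, ENABLE_MULTIVIEW, ENABLE_SCC_EXT), the file names are placeholders, and for multiview encoding the per-view input files are read from the configuration file rather than given with --input, as the parsing code above shows.

    # Alpha auxiliary layer (build with ENABLE_ALPHA; input must carry an alpha plane)
    x265 --input clip_with_alpha.y4m --alpha --output out.hevc

    # Two-view MV-HEVC from separate per-view inputs listed in the config file (build with ENABLE_MULTIVIEW)
    x265 --num-views 2 --format 0 --multiview-config multiview_config.txt --output out.hevc

    # Screen content coding with intra block copy fast search (build with ENABLE_SCC_EXT)
    x265 --input screen_capture.y4m --scc 1 --output out.hevc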
View file
x265_3.6.tar.gz/source/x265cli.h -> x265_4.0.tar.gz/source/x265cli.h
Changed
@@ -358,6 +358,17 @@
     { "dup-threshold", required_argument, NULL, 0 },
     { "mcstf", no_argument, NULL, 0 },
     { "no-mcstf", no_argument, NULL, 0 },
+#if ENABLE_ALPHA
+    { "alpha", no_argument, NULL, 0 },
+#endif
+#if ENABLE_MULTIVIEW
+    { "num-views", required_argument, NULL, 0 },
+    { "multiview-config", required_argument, NULL, 0 },
+    { "format", required_argument, NULL, 0 },
+#endif
+#if ENABLE_SCC_EXT
+    { "scc", required_argument, NULL, 0 },
+#endif
 #ifdef SVT_HEVC
     { "svt", no_argument, NULL, 0 },
     { "no-svt", no_argument, NULL, 0 },
@@ -393,13 +404,16 @@
 struct CLIOptions
 {
-    InputFile* input;
-    ReconFile* recon;
+    InputFile* input[MAX_VIEWS];
+    ReconFile* recon[MAX_LAYERS];
     OutputFile* output;
     FILE* qpfile;
     FILE* zoneFile;
     FILE* dolbyVisionRpu; /* File containing Dolby Vision BL RPU metadata */
     FILE* scenecutAwareQpConfig; /* File containing scenecut aware frame quantization related CLI options */
+#if ENABLE_MULTIVIEW
+    FILE* multiViewConfig; /* File containing multi-view related CLI options */
+#endif
     const char* reconPlayCmd;
     const x265_api* api;
     x265_param* param;
@@ -431,13 +445,18 @@
     static const int UPDATE_INTERVAL = 250000;
     CLIOptions()
     {
-        input = NULL;
-        recon = NULL;
+        for (int i = 0; i < MAX_VIEWS; i++)
+            input[i] = NULL;
+        for (int i = 0; i < MAX_LAYERS; i++)
+            recon[i] = NULL;
         output = NULL;
         qpfile = NULL;
         zoneFile = NULL;
         dolbyVisionRpu = NULL;
         scenecutAwareQpConfig = NULL;
+#if ENABLE_MULTIVIEW
+        multiViewConfig = NULL;
+#endif
         reconPlayCmd = NULL;
         api = NULL;
         param = NULL;
@@ -470,6 +489,9 @@
     int rpuParser(x265_picture * pic);
     bool parseScenecutAwareQpConfig();
     bool parseScenecutAwareQpParam(int argc, char **argv, x265_param* globalParam);
+#if ENABLE_MULTIVIEW
+    bool parseMultiViewConfig(char** fn);
+#endif
 };
 #ifdef __cplusplus
 }
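All of the new code paths above are fenced behind ENABLE_ALPHA, ENABLE_MULTIVIEW and ENABLE_SCC_EXT, so the package only exposes the alpha, multiview and SCC options if the corresponding switches are enabled at configure time. A hedged sketch of the configure step, assuming the CMake option names mirror the preprocessor guards (not verified against the 4.0 build scripts):

    cmake "$SOURCE_DIR" $COMMON_FLAGS -DENABLE_ALPHA=ON -DENABLE_MULTIVIEW=ON -DENABLE_SCC_EXT=ON

Left at their defaults, the options above stay off and the CLI behaves as in 3.6 apart from the API changes already noted.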
View file
x265_3.6.tar.gz/x265Version.txt -> x265_4.0.tar.gz/x265Version.txt
Changed
@@ -1,4 +1,4 @@
 #Attribute: Values
-repositorychangeset: aa7f602f7
+repositorychangeset: 6318f22
 releasetagdistance: 1
-releasetag: 3.6
+releasetag: 4.0