Packman Build Service PMBS

We truncated the diff of some files because they were too big. If you want to see the full diff for every file, click here.

Changes of Revision 11

x265.changes Changed

@@ -1,4 +1,47 @@
 -------------------------------------------------------------------
+Fri Nov 27 18:21:04 UTC 2015 - aloisio@gmx.com
+
+- Update to version 1.8:
+  API Changes:
+  * Experimental support for Main12 is now enabled. Partial
+    assembly support exists.
+  * Main12 and Intra/Still picture profiles are now supported.
+    Still picture profile is detected based on
+    x265_param::totalFrames.
+  * Three classes of encoding statistics are now available
+    through the API.
+    + x265_stats - contains encoding statistics, available
+      through x265_encoder_get_stats()
+    + x265_frame_stats and x265_cu_stats - contains frame
+      encoding statistics, available through recon x265_picture
+  * --csv
+    + x265_encoder_log() is now deprecated
+    + x265_param::csvfn is also deprecated
+  * --log-level now controls only console logging, frame
+    level console logging has been removed.
+  * Support added for new color transfer characteristic ARIB
+    STD-B67
+  New Features:
+  * limit-refs
+    + This feature limits the references analysed for
+      individual CUs.
+    + Provides a nice tradeoff between efficiency and
+      performance.
+  * aq-mode 3
+    + A new aq-mode that provides additional biasing for
+      low-light conditions.
+  * An improved scene cut detection logic that allows
+    ratecontrol to manage visual quality at fade-ins and
+    fade-outs better.
+  Preset and Tune Options:
+  * tune grain
+    + Increases psyRdoq strength to 10.0, and rdoq-level to 2.
+  * qg-size
+    + Default value changed to 32.
+- soname bump to 68
+- Reworked arm.patch for 1.8
+
+-------------------------------------------------------------------
 Fri May 29 09:11:02 UTC 2015 - aloisio@gmx.com
 
 - soname bump to 59

x265.spec Changed

arm.patch Changed

@@ -1,9 +1,11 @@
---- source/CMakeLists.txt.orig	2015-04-28 21:43:18.585528552 +0200
-+++ source/CMakeLists.txt	2015-04-28 21:47:14.995334232 +0200
-@@ -50,10 +50,18 @@
-         set(X64 1)
-         add_definitions(-DX86_64=1)
-     endif()
+Index: x265_11047/source/CMakeLists.txt
+===================================================================
+--- x265_11047.orig/source/CMakeLists.txt
++++ x265_11047/source/CMakeLists.txt
+@@ -56,10 +56,22 @@ elseif(POWERMATCH GREATER "-1")
+     message(STATUS "Detected POWER target processor")
+     set(POWER 1)
+     add_definitions(-DX265_ARCH_POWER=1)
 +elseif(${SYSPROC} MATCHES "armv5.*")
 +    message(STATUS "Detected ARMV5 system processor")
 +    set(ARMV5 1)
@@ -19,10 +21,14 @@
 +    message(STATUS "Detected ARMV7 system processor")
 +    set(ARMV7 1)
 +    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
++elseif(${SYSPROC} STREQUAL "aarch64")
++    message(STATUS "Detected AArch64 system processor")
++    set(ARMV7 1)
++    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
  else()
      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
-@@ -155,8 +163,8 @@
+@@ -169,8 +181,8 @@ if(GCC)
      elseif(X86 AND NOT X64)
          add_definitions(-march=i686)
      endif()
@@ -33,8 +39,10 @@
      endif()
      if(FPROFILE_GENERATE)
          if(INTEL_CXX)
---- source/common/cpu.cpp.orig	2015-04-28 21:47:44.634923269 +0200
-+++ source/common/cpu.cpp	2015-04-28 21:49:50.305468867 +0200
+Index: x265_11047/source/common/cpu.cpp
+===================================================================
+--- x265_11047.orig/source/common/cpu.cpp
++++ x265_11047/source/common/cpu.cpp
 @@ -37,7 +37,7 @@
  #include <machine/cpu.h>
  #endif
@@ -44,20 +52,3 @@
  #include <signal.h>
  #include <setjmp.h>
  static sigjmp_buf jmpbuf;
-@@ -340,7 +340,6 @@
-     }
- 
-     canjump = 1;
--    x265_cpu_neon_test();
-     canjump = 0;
-     signal(SIGILL, oldsig);
- #endif // if !HAVE_NEON
-@@ -356,7 +355,7 @@
-     // which may result in incorrect detection and the counters stuck enabled.
-     // right now Apple does not seem to support performance counters for this test
- #ifndef __MACH__
--    flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
-+    //flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
- #endif
-     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
- #endif // if HAVE_ARMV6

baselibs.conf Changed

x265_1.7.tar.gz/source/filters/filters.cpp Deleted

@@ -1,79 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Selvakumar Nithiyaruban <selvakumar@multicorewareinc.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#include "filters.h"
-#include "common.h"
-
-/* The dithering algorithm is based on Sierra-2-4A error diffusion. */
-void ditherPlane(pixel *dst, int dstStride, uint16_t *src, int srcStride,
-                 int width, int height, int16_t *errors, int bitDepth)
-{
-    const int lShift = 16 - bitDepth;
-    const int rShift = 16 - bitDepth + 2;
-    const int half = (1 << (16 - bitDepth + 1));
-    const int pixelMax = (1 << bitDepth) - 1;
-
-    memset(errors, 0, (width + 1) * sizeof(int16_t));
-    int pitch = 1;
-    for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
-    {
-        int16_t err = 0;
-        for (int x = 0; x < width; x++)
-        {
-            err = err * 2 + errors[x] + errors[x + 1];
-            dst[x * pitch] = (pixel)x265_clip3(0, pixelMax, ((src[x * 1] << 2) + err + half) >> rShift);
-            errors[x] = err = src[x * pitch] - (dst[x * pitch] << lShift);
-        }
-    }
-}
-
-void ditherImage(x265_picture& picIn, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth)
-{
-    /* This portion of code is from readFrame in x264. */
-    for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++)
-    {
-        if ((picIn.bitDepth & 7) && (picIn.bitDepth != 16))
-        {
-            /* upconvert non 16bit high depth planes to 16bit */
-            uint16_t *plane = (uint16_t*)picIn.planes[i];
-            uint32_t pixelCount = x265_picturePlaneSize(picIn.colorSpace, picWidth, picHeight, i);
-            int lShift = 16 - picIn.bitDepth;
-
-            /* This loop assumes width is equal to stride which
-               happens to be true for file reader outputs */
-            for (uint32_t j = 0; j < pixelCount; j++)
-            {
-                plane[j] = plane[j] << lShift;
-            }
-        }
-    }
-
-    for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++)
-    {
-        int height = (int)(picHeight >> x265_cli_csps[picIn.colorSpace].height[i]);
-        int width = (int)(picWidth >> x265_cli_csps[picIn.colorSpace].width[i]);
-
-        ditherPlane(((pixel*)picIn.planes[i]), picIn.stride[i] / sizeof(pixel), ((uint16_t*)picIn.planes[i]),
-                    picIn.stride[i] / 2, width, height, errorBuf, bitDepth);
-    }
-}

x265_1.7.tar.gz/source/filters/filters.h Deleted

@@ -1,31 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Selvakumar Nithiyaruban <selvakumar@multicorewareinc.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#ifndef X265_FILTERS_H
-#define X265_FILTERS_H
-
-#include "x265.h"
-
-void ditherImage(x265_picture&, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth);
-
-#endif //X265_FILTERS_H

x265_1.7.tar.gz/.hg_archival.txt -> x265_1.8.tar.gz/.hg_archival.txt Changed

x265_1.7.tar.gz/.hgtags -> x265_1.8.tar.gz/.hgtags Changed

x265_1.8.tar.gz/build/linux/multilib.sh Added

@@ -0,0 +1,41 @@
+#!/bin/sh
+
+mkdir -p 8bit 10bit 12bit
+
+cd 12bit
+cmake ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
+make ${MAKEFLAGS}
+
+cd ../10bit
+cmake ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
+make ${MAKEFLAGS}
+
+cd ../8bit
+ln -sf ../10bit/libx265.a libx265_main10.a
+ln -sf ../12bit/libx265.a libx265_main12.a
+cmake ../../../source -DEXTRA_LIB="x265_main10.a;x265_main12.a" -DEXTRA_LINK_FLAGS=-L. -DLINKED_10BIT=ON -DLINKED_12BIT=ON
+make ${MAKEFLAGS}
+
+# rename the 8bit library, then combine all three into libx265.a
+mv libx265.a libx265_main.a
+
+uname=`uname`
+if [ "$uname" = "Linux" ]
+then
+
+# On Linux, we use GNU ar to combine the static libraries together
+ar -M <<EOF
+CREATE libx265.a
+ADDLIB libx265_main.a
+ADDLIB libx265_main10.a
+ADDLIB libx265_main12.a
+SAVE
+END
+EOF
+
+else
+
+# Mac/BSD libtool
+libtool -static -o libx265.a libx265_main.a libx265_main10.a libx265_main12.a 2>/dev/null
+
+fi

x265_1.8.tar.gz/build/msys/multilib.sh Added

x265_1.8.tar.gz/build/vc10-x86_64/multilib.bat Added

@@ -0,0 +1,44 @@
+@echo off
+if "%VS100COMNTOOLS%" == "" (
+  msg "%username%" "Visual Studio 10 not detected"
+  exit 1
+)
+
+call "%VS100COMNTOOLS%\..\..\VC\vcvarsall.bat"
+
+@mkdir 12bit
+@mkdir 10bit
+@mkdir 8bit
+
+@cd 12bit
+cmake -G "Visual Studio 10 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib
+)
+
+@cd ..\10bit
+cmake -G "Visual Studio 10 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib
+)
+
+@cd ..\8bit
+if not exist x265-static-main10.lib (
+  msg "%username%" "10bit build failed"
+  exit 1
+)
+if not exist x265-static-main12.lib (
+  msg "%username%" "12bit build failed"
+  exit 1
+)
+cmake -G "Visual Studio 10 Win64" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  :: combine static libraries (ignore warnings caused by winxp.cpp hacks)
+  move Release\x265-static.lib x265-static-main.lib
+  LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib
+)
+
+pause

x265_1.8.tar.gz/build/vc11-x86_64/multilib.bat Added

@@ -0,0 +1,44 @@
+@echo off
+if "%VS110COMNTOOLS%" == "" (
+  msg "%username%" "Visual Studio 11 not detected"
+  exit 1
+)
+
+call "%VS110COMNTOOLS%\..\..\VC\vcvarsall.bat"
+
+@mkdir 12bit
+@mkdir 10bit
+@mkdir 8bit
+
+@cd 12bit
+cmake -G "Visual Studio 11 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib
+)
+
+@cd ..\10bit
+cmake -G "Visual Studio 11 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib
+)
+
+@cd ..\8bit
+if not exist x265-static-main10.lib (
+  msg "%username%" "10bit build failed"
+  exit 1
+)
+if not exist x265-static-main12.lib (
+  msg "%username%" "12bit build failed"
+  exit 1
+)
+cmake -G "Visual Studio 11 Win64" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  :: combine static libraries (ignore warnings caused by winxp.cpp hacks)
+  move Release\x265-static.lib x265-static-main.lib
+  LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib
+)
+
+pause

x265_1.8.tar.gz/build/vc12-x86_64/multilib.bat Added

@@ -0,0 +1,44 @@
+@echo off
+if "%VS120COMNTOOLS%" == "" (
+  msg "%username%" "Visual Studio 12 not detected"
+  exit 1
+)
+
+call "%VS120COMNTOOLS%\..\..\VC\vcvarsall.bat"
+
+@mkdir 12bit
+@mkdir 10bit
+@mkdir 8bit
+
+@cd 12bit
+cmake -G "Visual Studio 12 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib
+)
+
+@cd ..\10bit
+cmake -G "Visual Studio 12 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib
+)
+
+@cd ..\8bit
+if not exist x265-static-main10.lib (
+  msg "%username%" "10bit build failed"
+  exit 1
+)
+if not exist x265-static-main12.lib (
+  msg "%username%" "12bit build failed"
+  exit 1
+)
+cmake -G "Visual Studio 12 Win64" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  :: combine static libraries (ignore warnings caused by winxp.cpp hacks)
+  move Release\x265-static.lib x265-static-main.lib
+  LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib
+)
+
+pause

x265_1.8.tar.gz/build/vc9-x86_64/multilib.bat Added

@@ -0,0 +1,44 @@
+@echo off
+if "%VS90COMNTOOLS%" == "" (
+  msg "%username%" "Visual Studio 9 not detected"
+  exit 1
+)
+
+call "%VS90COMNTOOLS%\..\..\VC\vcvarsall.bat"
+
+@mkdir 12bit
+@mkdir 10bit
+@mkdir 8bit
+
+@cd 12bit
+cmake -G "Visual Studio 9 2008 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib
+)
+
+@cd ..\10bit
+cmake -G "Visual Studio 9 2008 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib
+)
+
+@cd ..\8bit
+if not exist x265-static-main10.lib (
+  msg "%username%" "10bit build failed"
+  exit 1
+)
+if not exist x265-static-main12.lib (
+  msg "%username%" "12bit build failed"
+  exit 1
+)
+cmake -G "Visual Studio 9 2008 Win64" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  :: combine static libraries (ignore warnings caused by winxp.cpp hacks)
+  move Release\x265-static.lib x265-static-main.lib
+  LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib
+)
+
+pause

x265_1.7.tar.gz/doc/reST/api.rst -> x265_1.8.tar.gz/doc/reST/api.rst Changed

@@ -41,9 +41,9 @@
 x265 will accept input pixels of any depth between 8 and 16 bits
 regardless of the depth of its internal pixels (8 or 10).  It will shift
 and mask input pixels as required to reach the internal depth. If
-downshifting is being performed using our CLI application, the
-:option:`--dither` option may be enabled to reduce banding. This feature
-is not available through the C interface.
+downshifting is being performed using our CLI application (to 8 bits),
+the :option:`--dither` option may be enabled to reduce banding. This
+feature is not available through the C interface.
 
 Encoder
 =======
@@ -159,7 +159,8 @@
 	helps future-proof your code in many ways, but the x265 API is
 	versioned in such a way that we prevent linkage against a build of
 	x265 that does not match the version of the header you are compiling
-	against. This is function of the X265_BUILD macro.
+	against (unless you use x265_api_query() to acquire the library's
+	interfaces). This is a function of the X265_BUILD macro.
 
 **x265_encoder_parameters()** may be used to get a copy of the param
 structure from the encoder after it has been opened, in order to see the
@@ -190,7 +191,7 @@
 	 *      presets is not recommended without a more fine-grained breakdown of
 	 *      parameters to take this into account. */
 	int x265_encoder_reconfig(x265_encoder *, x265_param *);
-	
+
 Pictures
 ========
 
@@ -320,7 +321,8 @@
 	provided, the encoder will fill it with data pertaining to the
 	output picture corresponding to the output NALs, including the
 	recontructed image, POC and decode timestamp. These pictures will be
-	in encode (or decode) order.
+	in encode (or decode) order. The encoder will also write corresponding 
+	frame encode statistics into **x265_frame_stats**.
 
 When the last of the raw input pictures has been sent to the encoder,
 **x265_encoder_encode()** must still be called repeatedly with a
@@ -338,15 +340,6 @@
 Cleanup
 =======
 
-At the end of the encode, the application will want to trigger logging
-of the final encode statistics, if :option:`--csv` had been specified::
-
-	/* x265_encoder_log:
-	 *       write a line to the configured CSV file.  If a CSV filename was not
-	 *       configured, or file open failed, or the log level indicated frame level
-	 *       logging, this function will perform no write. */
-	void x265_encoder_log(x265_encoder *encoder, int argc, char **argv);
-
 Finally, the encoder must be closed in order to free all of its
 resources. An encoder that has been flushed cannot be restarted and
 reused. Once **x265_encoder_close()** has been called, the encoder
@@ -370,52 +363,150 @@
 Multi-library Interface
 =======================
 
-If your application might want to make a runtime selection between
-a number of libx265 libraries (perhaps 8bpp and 16bpp), then you will
-want to use the multi-library interface.
-
-Instead of directly using all of the **x265_** methods documented
-above, you query an x265_api structure from your libx265 and then use
-the function pointers within that structure of the same name, but
-without the **x265_** prefix. So **x265_param_default()** becomes
-**api->param_default()**. The key method is x265_api_get()::
-
-    /* x265_api_get:
-     *   Retrieve the programming interface for a linked x265 library.
-     *   May return NULL if no library is available that supports the
-     *   requested bit depth. If bitDepth is 0, the function is guarunteed
-     *   to return a non-NULL x265_api pointer from the system default
-     *   libx265 */
-    const x265_api* x265_api_get(int bitDepth);
-
-Note that using this multi-library API in your application is only the
-first step.
-
-Your application must link to one build of libx265 (statically or 
-dynamically) and this linked version of libx265 will support one 
-bit-depth (8 or 10 bits). 
-
-Your application must now request the API for the bitDepth you would 
-prefer the encoder to use (8 or 10). If the requested bitdepth is zero, 
-or if it matches the bitdepth of the system default libx265 (the 
-currently linked library), then this library will be used for encode.
-If you request a different bit-depth, the linked libx265 will attempt 
-to dynamically bind a shared library with a name appropriate for the 
-requested bit-depth:
-
-    8-bit:  libx265_main.dll
-    10-bit: libx265_main10.dll
-
-    (the shared library extension is obviously platform specific. On
-    Linux it is .so while on Mac it is .dylib)
-
-For example on Windows, one could package together an x265.exe
-statically linked against the 8bpp libx265 together with a
-libx265_main10.dll in the same folder, and this executable would be able
-to encode main and main10 bitstreams.
-
-On Linux, x265 packagers could install 8bpp static and shared libraries
-under the name libx265 (so all applications link against 8bpp libx265)
-and then also install libx265_main10.so (symlinked to its numbered solib).
-Thus applications which use x265_api_get() will be able to generate main
-or main10 bitstreams.
+If your application might want to make a runtime bit-depth selection, it
+will need to use one of these bit-depth introspection interfaces which
+returns an API structure containing the public function entry points and
+constants.
+
+Instead of directly using all of the **x265_** methods documented above,
+you query an x265_api structure from your libx265 and then use the
+function pointers of the same name (minus the **x265_** prefix) within
+that structure.  For instance **x265_param_default()** becomes
+**api->param_default()**.
+
+x265_api_get
+------------
+
+The first bit-depth introspection method is x265_api_get(). It is designed
+for applications that might statically link with libx265, or will at
+least be tied to a particular SONAME or API version::
+
+	/* x265_api_get:
+	 *   Retrieve the programming interface for a linked x265 library.
+	 *   May return NULL if no library is available that supports the
+	 *   requested bit depth. If bitDepth is 0, the function is guaranteed
+	 *   to return a non-NULL x265_api pointer from the system default
+	 *   libx265 */
+	const x265_api* x265_api_get(int bitDepth);
+
+Like **x265_encoder_encode()**, this function has the build number
+automatically appended to the function name via macros. This ties your
+application to a particular binary API version of libx265 (the one you
+compile against). If you attempt to link with a libx265 with a different
+API version number, the link will fail.
+
+Obviously this has no meaningful effect on applications which statically
+link to libx265.
+
+x265_api_query
+--------------
+
+The second bit-depth introspection method is designed for applications
+which need more flexibility in API versioning.  If you use
+**x265_api_query()** and dynamically link to libx265 at runtime (using
+dlopen() on POSIX or LoadLibrary() on Windows) your application is no
+longer directly tied to the API version that it was compiled against::
+
+	/* x265_api_query:
+	 *   Retrieve the programming interface for a linked x265 library, like
+	 *   x265_api_get(), except this function accepts X265_BUILD as the second
+	 *   argument rather than using the build number as part of the function name.
+	 *   Applications which dynamically link to libx265 can use this interface to
+	 *   query the library API and achieve a relative amount of version skew
+	 *   flexibility. The function may return NULL if the library determines that
+	 *   the apiVersion that your application was compiled against is not compatible
+	 *   with the library you have linked with.
+	 *
+	 *   api_major_version will be incremented any time non-backward compatible
+	 *   changes are made to any public structures or functions. If
+	 *   api_major_version does not match X265_MAJOR_VERSION from the x265.h your
+	 *   application compiled against, your application must not use the returned
+	 *   x265_api pointer.
+	 *
+	 *   Users of this API *must* also validate the sizes of any structures which
+	 *   are not treated as opaque in application code. For instance, if your
+	 *   application dereferences a x265_param pointer, then it must check that
+	 *   api->sizeof_param matches the sizeof(x265_param) that your application
+	 *   compiled with. */
+	const x265_api* x265_api_query(int bitDepth, int apiVersion, int* err);
+
+A number of validations must be performed on the returned API structure
+in order to determine if it is safe for use by your application. If you
+do not perform these checks, your application is liable to crash::
+
+	if (api->api_major_version != X265_MAJOR_VERSION) /* do not use */
+	if (api->sizeof_param != sizeof(x265_param))      /* do not use */
+	if (api->sizeof_picture != sizeof(x265_picture))  /* do not use */
+	if (api->sizeof_stats != sizeof(x265_stats))      /* do not use */
+	if (api->sizeof_zone != sizeof(x265_zone))        /* do not use */
+	etc.
+
+Note that if your application does not directly allocate or dereference
+one of these structures, if it treats the structure as opaque or does
+not use it at all, then it can skip the size check for that structure.
+
+In particular, if your application uses api->param_alloc(),
+api->param_free(), api->param_parse(), etc and never directly accesses
+any x265_param fields, then it can skip the check on the
+sizeof(x265_param) and thereby ignore changes to that structure (which
+account for a large percentage of X265_BUILD bumps).
+
+Build Implications

x265_1.7.tar.gz/doc/reST/cli.rst -> x265_1.8.tar.gz/doc/reST/cli.rst Changed

@@ -28,7 +28,7 @@
 
 Generally, when an option expects a string value from a list of strings
 the user may specify the integer ordinal of the value they desire. ie:
-:option:`--log-level` 4 is equivalent to :option:`--log-level` debug.
+:option:`--log-level` 3 is equivalent to :option:`--log-level` debug.
 
 Executable Options
 ==================
@@ -52,6 +52,7 @@
 	2. unable to open encoder
 	3. unable to generate stream headers
 	4. encoder abort
+	5. unable to open csv file
 
 Logging/Statistic Options
 =========================
@@ -67,9 +68,8 @@
 	0. error
 	1. warning
 	2. info **(default)**
-	3. frame
-	4. debug
-	5. full
+	3. debug
+	4. full
 
 .. option:: --no-progress
 
@@ -80,9 +80,9 @@
 .. option:: --csv <filename>
 
 	Writes encoding results to a comma separated value log file. Creates
-	the file if it doesnt already exist, else adds one line per run.  if
-	:option:`--log-level` is frame or above, it writes one line per
-	frame. Default none
+	the file if it doesn't already exist. If :option:`--csv-log-level` is 0,
+	it adds one line per run. If :option:`--csv-log-level` is greater than
+	0, it writes one line per frame. Default none
 
 	When frame level logging is enabled, several frame performance
 	statistics are listed:
@@ -123,13 +123,17 @@
 	enough ahead for the necessary reference data to be available. This
 	is more of a problem for P frames where some blocks are much more
 	expensive than others.
+	
+	**CLI ONLY**
 
+.. option:: --csv-log-level <integer>
 
-.. option:: --cu-stats, --no-cu-stats
+        CSV logging level. Default 0
+        0. summary
+        1. frame level logging
+        2. frame level logging with performance statistics
 
-	Records statistics on how each CU was coded (split depths and other
-	mode decisions) and reports those statistics at the end of the
-	encode. Default disabled
+        **CLI ONLY**
 
 .. option:: --ssim, --no-ssim
 
@@ -349,6 +353,13 @@
 
 	**CLI ONLY**
 
+.. option:: --total-frames <integer>
+
+	The number of frames intended to be encoded.  It may be left
+	unspecified, but when it is specified rate control can make use of
+	this information. It is also used to determine if an encode is
+	actually a stillpicture profile encode (single frame)
+
 .. option:: --dither
 
 	Enable high quality downscaling. Dithering is based on the diffusion
@@ -384,7 +395,7 @@
 
 	**Range of values:** positive int or float, or num/denom
 
-.. option:: --interlaceMode <false|tff|bff>, --no-interlaceMode
+.. option:: --interlace <false|tff|bff>, --no-interlace
 
 	0. progressive pictures **(default)**
 	1. top field first 
@@ -419,14 +430,18 @@
 
 	**CLI ONLY**
 
-.. option:: --output-depth, -D 8|10
+.. option:: --output-depth, -D 8|10|12
 
 	Bitdepth of output HEVC bitstream, which is also the internal bit
 	depth of the encoder. If the requested bit depth is not the bit
 	depth of the linked libx265, it will attempt to bind libx265_main
-	for an 8bit encoder, or libx265_main10 for a 10bit encoder, with the
+	for an 8bit encoder, libx265_main10 for a 10bit encoder, or
+	libx265_main12 for a 12bit encoder (EXPERIMENTAL), with the
 	same API version as the linked libx265.
 
+	If the output depth is not specified but :option:`--profile` is
+	specified, the output depth will be derived from the profile name.
+
 	**CLI ONLY**
 
 Profile, Level, Tier
@@ -439,15 +454,44 @@
 	profile.  May abort the encode if the specified profile is
 	impossible to be supported by the compile options chosen for the
 	encoder (a high bit depth encoder will be unable to output
-	bitstreams compliant with Main or Mainstillpicture).
+	bitstreams compliant with Main or MainStillPicture).
+
+	The following profiles are supported in x265.
+
+	8bit profiles::
+
+	main, main-intra, mainstillpicture (or msp for short)
+	main444-8 main444-intra main444-stillpicture
+	See note below on signaling intra and stillpicture profiles.
+	
+	10bit profiles::
+
+	main10, main10-intra
+	main422-10, main422-10-intra
+	main444-10, main444-10-intra
+
+	12bit profiles::
+
+	main12, main12-intra
+	main422-12, main422-12-intra
+	main444-12, main444-12-intra
+
+
+	**CLI ONLY**
 
-	API users must use x265_param_apply_profile() after configuring
+	API users must call x265_param_apply_profile() after configuring
 	their param structure. Any changes made to the param structure after
 	this call might make the encode non-compliant.
 
-	**Values:** main, main10, mainstillpicture, main422-8, main422-10, main444-8, main444-10
+	The CLI application will derive the output bit depth from the
+	profile name if :option:`--output-depth` is not specified.
 
-	**CLI ONLY**
+.. note::
+
+	All 12bit presets are extremely unstable, do not use them yet.
+	16bit is not supported at all, but those profiles are included
+	because it is possible for libx265 to make bitstreams compatible
+	with them.
 
 .. option:: --level-idc <integer|float>
 
@@ -479,6 +523,9 @@
 	specified level, main tier first, turning on high tier only if 
 	necessary and available at that level.
 
+	If :option:`--level-idc` has not been specified, this argument is
+	ignored.
+
 .. option:: --ref <1..16>
 
 	Max number of L0 references to be allowed. This number has a linear
@@ -511,6 +558,7 @@
 	Default: disabled
 
 .. note::
+
 	:option:`--profile`, :option:`--level-idc`, and
 	:option:`--high-tier` are only intended for use when you are
 	targeting a particular decoder (or decoders) with fixed resource
@@ -519,6 +567,29 @@
 	parameters to meet those requirements but it will never raise
 	them. It may enable VBV constraints on a CRF encode.
 
+	Also note that x265 determines the decoder requirement profile and
+	level in three steps.  First, the user configures an x265_param
+	structure with their suggested encoder options and then optionally
+	calls x265_param_apply_profile() to enforce a specific profile
+	(main, main10, etc). Second, an encoder is created from this
+	x265_param instance and the :option:`--level-idc` and
+	:option:`--high-tier` parameters are used to reduce bitrate or other
+	features in order to enforce the target level. Finally, the encoder
+	re-examines the final set of parameters and detects the actual
+	minimum decoder requirement level and this is what is signaled in
+	the bitstream headers. The detected decoder level will only use High
+	tier if the user specified a High tier level.
+
+	The signaled profile will be determined by the encoder's internal
+	bitdepth and input color space. If :option:`--keyint` is 0 or 1,
+	then an intra variant of the profile will be signaled.
+
+	If :option:`--total-frames` is 1, then a stillpicture variant will
+	be signaled, but this parameter is not always set by applications,
+	particularly not when the CLI uses stdin streaming or when libx265
+	is used by third-party applications.

x265_1.7.tar.gz/doc/reST/presets.rst -> x265_1.8.tar.gz/doc/reST/presets.rst Changed

x265_1.7.tar.gz/doc/reST/threading.rst -> x265_1.8.tar.gz/doc/reST/threading.rst Changed

@@ -28,7 +28,7 @@
 providers are recommended to call this method when they make new jobs
 available.
 
-Worker jobs are not allowed to block except when abosultely necessary
+Worker jobs are not allowed to block except when absolutely necessary
 for data locking. If a job becomes blocked, the work function is
 expected to drop that job so the worker thread may go back to the pool
 and find more work.
@@ -94,10 +94,10 @@
 
 If a worker thread job has work which can be performed in parallel by
 many threads, it may allocate a bonded task group and enlist the help of
-other idle worker threads in the same pool. Those threads will cooperate
-to complete the work of the bonded task group and then return to their
-idle states. The larger and more uniform those tasks are, the better the
-bonded task group will perform.
+other idle worker threads from the same thread pool. Those threads will
+cooperate to complete the work of the bonded task group and then return
+to their idle states. The larger and more uniform those tasks are, the
+better the bonded task group will perform.
 
 Parallel Mode Analysis
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -105,19 +105,20 @@
 When :option:`--pmode` is enabled, each CU (at all depths from 64x64 to
 8x8) will distribute its analysis work to the thread pool via a bonded
 task group. Each analysis job will measure the cost of one prediction
-for the CU: merge, skip, intra, inter (2Nx2N, Nx2N, 2NxN, and AMP). At
-slower presets, the amount of increased parallelism is often enough to
-be able to reduce frame parallelism while achieving the same overall CPU
-utilization. Reducing frame threads is often beneficial to ABR and VBV
-rate control.
+for the CU: merge, skip, intra, inter (2Nx2N, Nx2N, 2NxN, and AMP).
+
+At slower presets, the amount of increased parallelism from pmode is
+often enough to be able to reduce or disable frame parallelism while
+achieving the same overall CPU utilization. Reducing frame threads is
+often beneficial to ABR and VBV rate control.
 
 Parallel Motion Estimation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 When :option:`--pme` is enabled all of the analysis functions which
 perform motion searches to reference frames will distribute those motion
-searches as jobs for worker threads via a bonded task group (if more
-than two motion searches are required).
+searches to other worker threads via a bonded task group (if more than
+two motion searches are required).
 
 Frame Threading
 ===============
@@ -241,7 +242,7 @@
 bonded task groups to measure single frame cost estimates using slices.
 (see :option:`--lookahead-slices`)
 
-The function slicetypeDecide() itself is also be performed by a worker
+The main slicetypeDecide() function itself is also performed by a worker
 thread if your encoder has a thread pool, else it runs within the
 context of the thread which calls the x265_encoder_encode().

x265_1.7.tar.gz/source/CMakeLists.txt -> x265_1.8.tar.gz/source/CMakeLists.txt Changed

@@ -30,7 +30,7 @@
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 59)
+set(X265_BUILD 68)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -42,6 +42,8 @@
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
+set(POWER_ALIASES ppc64 ppc64le)
+list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
     message(STATUS "Detected x86 target processor")
     set(X86 1)
@@ -50,6 +52,10 @@
         set(X64 1)
         add_definitions(-DX86_64=1)
     endif()
+elseif(POWERMATCH GREATER "-1")
+    message(STATUS "Detected POWER target processor")
+    set(POWER 1)
+    add_definitions(-DX265_ARCH_POWER=1)
 elseif(${SYSPROC} STREQUAL "armv6l")
     message(STATUS "Detected ARM target processor")
     set(ARM 1)
@@ -82,6 +88,10 @@
         endif()
     endif()
     mark_as_advanced(LIBRT NUMA_FOUND)
+    option(NO_ATOMICS "Use a slow mutex to replace atomics" OFF)
+    if(NO_ATOMICS)
+        add_definitions(-DNO_ATOMICS=1)
+    endif(NO_ATOMICS)
 endif(UNIX)
 
 if(X64 AND NOT WIN32)
@@ -260,6 +270,8 @@
         message(STATUS "Found Yasm ${YASM_VERSION_STRING} to build assembly primitives")
         option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
     endif()
+else()
+    option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF)
 endif()
 
 option(CHECKED_BUILD "Enable run-time sanity checks (debugging)" OFF)
@@ -270,23 +282,59 @@
 # Build options
 set(LIB_INSTALL_DIR lib CACHE STRING "Install location of libraries")
 set(BIN_INSTALL_DIR bin CACHE STRING "Install location of executables")
+set(EXTRA_LIB "" CACHE STRING "Extra libraries to link against")
+set(EXTRA_LINK_FLAGS "" CACHE STRING "Extra link flags")
+if(EXTRA_LINK_FLAGS)
+    list(APPEND LINKER_OPTIONS ${EXTRA_LINK_FLAGS})
+endif()
+if(EXTRA_LIB)
+    option(LINKED_8BIT  "8bit libx265 is being linked with this library" OFF)
+    option(LINKED_10BIT "10bit libx265 is being linked with this library" OFF)
+    option(LINKED_12BIT "12bit libx265 is being linked with this library" OFF)
+endif(EXTRA_LIB)
+mark_as_advanced(EXTRA_LIB EXTRA_LINK_FLAGS)
 
 if(X64)
-    # NOTE: We only officially support 16bit-per-pixel compiles of x265
-    # on 64bit architectures. 16bpp plus large resolution plus slow
+    # NOTE: We only officially support high-bit-depth compiles of x265
+    # on 64bit architectures. Main10 plus large resolution plus slow
     # preset plus 32bit address space usually means malloc failure.  You
     # can disable this if(X64) check if you desperately need a 32bit
     # build with 10bit/12bit support, but this violates the "shrink wrap
     # license" so to speak.  If it breaks you get to keep both halves.
-    # You will likely need to compile without assembly
-    option(HIGH_BIT_DEPTH "Store pixels as 16bit values" OFF)
+    # You will need to disable assembly manually.
+    option(HIGH_BIT_DEPTH "Store pixel samples as 16bit values (Main10/Main12)" OFF)
 endif(X64)
 if(HIGH_BIT_DEPTH)
-    add_definitions(-DHIGH_BIT_DEPTH=1)
+    option(MAIN12 "Support Main12 instead of Main10" OFF)
+    if(MAIN12)
+        add_definitions(-DHIGH_BIT_DEPTH=1 -DX265_DEPTH=12)
+    else()
+        add_definitions(-DHIGH_BIT_DEPTH=1 -DX265_DEPTH=10)
+    endif()
 else(HIGH_BIT_DEPTH)
-    add_definitions(-DHIGH_BIT_DEPTH=0)
+    add_definitions(-DHIGH_BIT_DEPTH=0 -DX265_DEPTH=8)
 endif(HIGH_BIT_DEPTH)
 
+# this option can only be disabled when linking multiple libx265 libraries
+# together, and some alternate API access method is implemented.
+option(EXPORT_C_API "Implement public C programming interface" ON)
+mark_as_advanced(EXPORT_C_API)
+if(EXPORT_C_API)
+    set(X265_NS x265)
+    add_definitions(-DEXPORT_C_API=1)
+elseif(HIGH_BIT_DEPTH)
+    if(MAIN12)
+        set(X265_NS x265_12bit)
+    else()
+        set(X265_NS x265_10bit)
+    endif()
+    add_definitions(-DEXPORT_C_API=0)
+else()
+    set(X265_NS x265_8bit)
+    add_definitions(-DEXPORT_C_API=0)
+endif()
+add_definitions(-DX265_NS=${X265_NS})
+
 option(WARNINGS_AS_ERRORS "Stop compiles on first warning" OFF)
 if(WARNINGS_AS_ERRORS)
     if(GCC)
@@ -375,6 +423,9 @@
 if(NOT MSVC)
     set_target_properties(x265-static PROPERTIES OUTPUT_NAME x265)
 endif()
+if(EXTRA_LIB)
+    target_link_libraries(x265-static ${EXTRA_LIB})
+endif()
 install(TARGETS x265-static
     LIBRARY DESTINATION ${LIB_INSTALL_DIR}
     ARCHIVE DESTINATION ${LIB_INSTALL_DIR})
@@ -415,7 +466,7 @@
         if(APPLE)
             set_target_properties(x265-shared PROPERTIES MACOSX_RPATH 1)
         else()
-            set_target_properties(x265-shared PROPERTIES LINK_FLAGS "-Wl,-Bsymbolic,-znoexecstack")
+            list(APPEND LINKER_OPTIONS "-Wl,-Bsymbolic,-znoexecstack")
         endif()
     endif()
     set_target_properties(x265-shared PROPERTIES SOVERSION ${X265_BUILD})
@@ -429,6 +480,9 @@
                 ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
                 RUNTIME DESTINATION ${BIN_INSTALL_DIR})
     endif()
+    if(EXTRA_LIB)
+        target_link_libraries(x265-shared ${EXTRA_LIB})
+    endif()
     if(LINKER_OPTIONS)
         # set_target_properties can't do list expansion
         string(REPLACE ";" " " LINKER_OPTION_STR "${LINKER_OPTIONS}")
@@ -468,16 +522,14 @@
 endif()
 
 # Main CLI application
-option(ENABLE_CLI "Build standalone CLI application" ON)
+set(ENABLE_CLI ON CACHE BOOL "Build standalone CLI application")
 if(ENABLE_CLI)
     file(GLOB InputFiles input/input.cpp input/yuv.cpp input/y4m.cpp input/*.h)
     file(GLOB OutputFiles output/output.cpp output/reconplay.cpp output/*.h
                           output/yuv.cpp output/y4m.cpp # recon
                           output/raw.cpp)               # muxers
-    file(GLOB FilterFiles filters/*.cpp filters/*.h)
     source_group(input FILES ${InputFiles})
     source_group(output FILES ${OutputFiles})
-    source_group(filters FILES ${FilterFiles})
 
     check_include_files(getopt.h HAVE_GETOPT_H)
     if(NOT HAVE_GETOPT_H)
@@ -487,13 +539,18 @@
         include_directories(compat/getopt)
         set(GETOPT compat/getopt/getopt.c compat/getopt/getopt.h)
     endif(NOT HAVE_GETOPT_H)
+    if(WIN32)
+        set(ExportDefs "${PROJECT_BINARY_DIR}/x265.def")
+    endif(WIN32)
 
     if(XCODE)
         # Xcode seems unable to link the CLI with libs, so link as one target
-        add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} x265.cpp x265.h x265cli.h
-                           $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
+        add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
+                       x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp
+                       $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
     else()
-        add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} ${X265_RC_FILE} x265.cpp x265.h x265cli.h)
+        add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} ${X265_RC_FILE}
+                       ${ExportDefs} x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp)
         if(WIN32 OR NOT ENABLE_SHARED OR INTEL_CXX)
             # The CLI cannot link to the shared library on Windows, it
             # requires internal APIs not exported from the DLL

x265_1.7.tar.gz/source/cmake/CMakeASM_YASMInformation.cmake -> x265_1.8.tar.gz/source/cmake/CMakeASM_YASMInformation.cmake Changed

x265_1.7.tar.gz/source/cmake/FindYasm.cmake -> x265_1.8.tar.gz/source/cmake/FindYasm.cmake Changed

x265_1.7.tar.gz/source/common/CMakeLists.txt -> x265_1.8.tar.gz/source/common/CMakeLists.txt Changed

@@ -1,7 +1,21 @@
 # vim: syntax=cmake
 
+list(APPEND VFLAGS "-DX265_VERSION=${X265_VERSION}")
+if(EXTRA_LIB)
+    if(LINKED_8BIT)
+        list(APPEND VFLAGS "-DLINKED_8BIT=1")
+    endif(LINKED_8BIT)
+    if(LINKED_10BIT)
+        list(APPEND VFLAGS "-DLINKED_10BIT=1")
+    endif(LINKED_10BIT)
+    if(LINKED_12BIT)
+        list(APPEND VFLAGS "-DLINKED_12BIT=1")
+    endif(LINKED_12BIT)
+endif(EXTRA_LIB)
+
 if(ENABLE_ASSEMBLY)
     set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
+    list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
 
     set(SSE3  vec/dct-sse3.cpp)
     set(SSSE3 vec/dct-ssse3.cpp)
@@ -46,7 +60,7 @@
                mc-a2.asm pixel-util8.asm blockcopy8.asm
                pixeladd8.asm dct8.asm)
     if(HIGH_BIT_DEPTH)
-        set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm)
+        set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm loopfilter.asm)
     else()
         set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm ipfilter8.asm loopfilter.asm)
     endif()
@@ -69,6 +83,10 @@
     source_group(Assembly FILES ${ASM_PRIMITIVES})
 endif(ENABLE_ASSEMBLY)
 
+# set_source_files_properties can't do list expansion
+string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}")
+set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS ${VERSION_FLAGS})
+
 check_symbol_exists(strtok_r "string.h" HAVE_STRTOK_R)
 if(HAVE_STRTOK_R)
     set_source_files_properties(param.cpp PROPERTIES COMPILE_FLAGS -DHAVE_STRTOK_R=1)
@@ -81,11 +99,8 @@
     set(WINXP winxp.h winxp.cpp)
 endif(WIN32)
 
-set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
-
 add_library(common OBJECT
-    ${ASM_PRIMITIVES} ${VEC_PRIMITIVES}
-    ${LIBCOMMON_SRC} ${LIBCOMMON_HDR} ${WINXP}
+    ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${WINXP}
     primitives.cpp primitives.h
     pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
     constants.cpp constants.h

x265_1.7.tar.gz/source/common/bitstream.cpp -> x265_1.8.tar.gz/source/common/bitstream.cpp Changed

x265_1.7.tar.gz/source/common/bitstream.h -> x265_1.8.tar.gz/source/common/bitstream.h Changed

x265_1.7.tar.gz/source/common/common.cpp -> x265_1.8.tar.gz/source/common/common.cpp Changed

x265_1.7.tar.gz/source/common/common.h -> x265_1.8.tar.gz/source/common/common.h Changed

@@ -106,7 +106,7 @@
 /* If compiled with CHECKED_BUILD perform run-time checks and log any that
  * fail, both to stderr and to a file */
 #if CHECKED_BUILD || _DEBUG
-extern int g_checkFailures;
+namespace X265_NS { extern int g_checkFailures; }
 #define X265_CHECK(expr, ...) if (!(expr)) { \
     x265_log(NULL, X265_LOG_ERROR, __VA_ARGS__); \
     FILE *fp = fopen("x265_check_failures.txt", "a"); \
@@ -126,16 +126,20 @@
 typedef uint64_t sum2_t;
 typedef uint64_t pixel4;
 typedef int64_t  ssum2_t;
-#define X265_DEPTH 10          // compile time configurable bit depth
 #else
 typedef uint8_t  pixel;
 typedef uint16_t sum_t;
 typedef uint32_t sum2_t;
 typedef uint32_t pixel4;
-typedef int32_t  ssum2_t;      //Signed sum
-#define X265_DEPTH 8           // compile time configurable bit depth
+typedef int32_t  ssum2_t; // Signed sum
 #endif // if HIGH_BIT_DEPTH
 
+#if X265_DEPTH <= 10
+typedef uint32_t sse_ret_t;
+#else
+typedef uint64_t sse_ret_t;
+#endif
+
 #ifndef NULL
 #define NULL 0
 #endif
@@ -313,7 +317,7 @@
 #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
 #define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8
 
-namespace x265 {
+namespace X265_NS {
 
 enum { SAO_NUM_OFFSET = 4 };
 
@@ -409,9 +413,7 @@
 /* located in pixel.cpp */
 void extendPicBorder(pixel* recon, intptr_t stride, int width, int height, int marginX, int marginY);
 
-}
-
-/* outside x265 namespace, but prefixed. defined in common.cpp */
+/* located in common.cpp */
 int64_t  x265_mdate(void);
 #define  x265_log(param, ...) general_log(param, "x265", __VA_ARGS__)
 void     general_log(const x265_param* param, const char* caller, int level, const char* fmt, ...);
@@ -426,7 +428,10 @@
 void     x265_free(void *ptr);
 char*    x265_slurp_file(const char *filename);
 
-void     x265_setup_primitives(x265_param* param, int cpu); /* primitives.cpp */
+/* located in primitives.cpp */
+void     x265_setup_primitives(x265_param* param);
+void     x265_report_simd(x265_param* param);
+}
 
 #include "constants.h"

x265_1.7.tar.gz/source/common/constants.cpp -> x265_1.8.tar.gz/source/common/constants.cpp Changed

@@ -25,9 +25,50 @@
 #include "constants.h"
 #include "threading.h"
 
-namespace x265 {
+namespace X265_NS {
+
+#if X265_DEPTH == 12
+
+// lambda = pow(2, (double)q / 6 - 2) * (1 << (12 - 8));
+double x265_lambda_tab[QP_MAX_MAX + 1] =
+{
+    4.0000,    4.4898,    5.0397,    5.6569,     6.3496,
+    7.1272,    8.0000,    8.9797,    10.0794,    11.3137,
+    12.6992,   14.2544,   16.0000,   17.9594,    20.1587,
+    22.6274,   25.3984,   28.5088,   32.0000,    35.9188,
+    40.3175,   45.2548,   50.7968,   57.0175,    64.0000,
+    71.8376,   80.6349,   90.5097,   101.5937,   114.0350,
+    128.0000,  143.6751,  161.2699,  181.0193,   203.1873,
+    228.0701,  256.0000,  287.3503,  322.5398,   362.0387,
+    406.3747,  456.1401,  512.0000,  574.7006,   645.0796,
+    724.0773,  812.7493,  912.2803,  1024.0000,  1149.4011,
+    1290.1592, 1448.1547, 1625.4987, 1824.5606,  2048.0000,
+    2298.8023, 2580.3183, 2896.3094, 3250.9974,  3649.1211,
+    4096.0000, 4597.6045, 5160.6366, 5792.6188,  6501.9947,
+    7298.2423, 8192.0000, 9195.2091, 10321.2732, 11585.2375
+};
+
+// lambda2 = pow(lambda, 2) * scale (0.85);
+double x265_lambda2_tab[QP_MAX_MAX + 1] =
+{
+    13.6000,       17.1349,       21.5887,       27.2000,       34.2699,
+    43.1773,       54.4000,       68.5397,       86.3546,       108.8000,
+    137.0794,      172.7092,      217.6000,      274.1588,      345.4185,
+    435.2000,      548.3176,      690.8369,      870.4000,      1096.6353,
+    1381.6739,     1740.8000,     2193.2706,     2763.3478,     3481.6000,
+    4386.5411,     5526.6955,     6963.2000,     8773.0822,     11053.3910,
+    13926.4000,    17546.1645,    22106.7819,    27852.8000,    35092.3291,
+    44213.5641,    55705.6000,    70184.6579,    88427.1282,    111411.2000,
+    140369.3159,   176854.2563,   222822.4000,   280738.6324,   353708.5127,
+    445644.8001,   561477.2648,   707417.0237,   891289.6000,   1122954.5277,
+    1414834.0484,  1782579.2003,  2245909.0566,  2829668.0981,  3565158.4000,
+    4491818.1146,  5659336.1938,  7130316.8013,  8983636.2264,  11318672.3923,
+    14260633.6000, 17967272.4585, 22637344.7751, 28521267.1953, 35934544.9165,
+    45274689.5567, 57042534.4000, 71869089.8338, 90549379.1181, 114085068.8008
+};
+
+#elif X265_DEPTH == 10
 
-#if HIGH_BIT_DEPTH
 // lambda = pow(2, (double)q / 6 - 2) * (1 << (X265_DEPTH - 8));
 double x265_lambda_tab[QP_MAX_MAX + 1] =
 {
@@ -324,11 +365,12 @@
       4,  12, 20, 28,  5, 13, 21, 29,  6, 14, 22, 30,  7, 15, 23, 31, 36, 44, 52, 60, 37, 45, 53, 61, 38, 46, 54, 62, 39, 47, 55, 63 }
 };
 
-ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE][4 * 4]) =
+ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4]) =
 {
     { 0,  4,  1,  8,  5,  2, 12,  9,  6,  3, 13, 10,  7, 14, 11, 15 },
     { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
-    { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 }
+    { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 },
+    { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 }
 };
 
 const uint16_t g_scan16x16[16 * 16] =

x265_1.7.tar.gz/source/common/constants.h -> x265_1.8.tar.gz/source/common/constants.h Changed

x265_1.7.tar.gz/source/common/contexts.h -> x265_1.8.tar.gz/source/common/contexts.h Changed

x265_1.7.tar.gz/source/common/cpu.cpp -> x265_1.8.tar.gz/source/common/cpu.cpp Changed

@@ -57,7 +57,7 @@
 
 #endif // if X265_ARCH_ARM
 
-namespace x265 {
+namespace X265_NS {
 const cpu_name_t cpu_names[] =
 {
 #if X265_ARCH_X86
@@ -107,9 +107,9 @@
 
 extern "C" {
 /* cpu-a.asm */
-int x265_cpu_cpuid_test(void);
-void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
-void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx);
+int PFX(cpu_cpuid_test)(void);
+void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
+void PFX(cpu_xgetbv)(uint32_t op, uint32_t *eax, uint32_t *edx);
 }
 
 #if defined(_MSC_VER)
@@ -125,16 +125,16 @@
     uint32_t max_extended_cap, max_basic_cap;
 
 #if !X86_64
-    if (!x265_cpu_cpuid_test())
+    if (!PFX(cpu_cpuid_test)())
         return 0;
 #endif
 
-    x265_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1);
+    PFX(cpu_cpuid)(0, &eax, vendor + 0, vendor + 2, vendor + 1);
     max_basic_cap = eax;
     if (max_basic_cap == 0)
         return 0;
 
-    x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
+    PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
     if (edx & 0x00800000)
         cpu |= X265_CPU_MMX;
     else
@@ -159,7 +159,7 @@
     if ((ecx & 0x18000000) == 0x18000000)
     {
         /* Check for OS support */
-        x265_cpu_xgetbv(0, &eax, &edx);
+        PFX(cpu_xgetbv)(0, &eax, &edx);
         if ((eax & 0x6) == 0x6)
         {
             cpu |= X265_CPU_AVX;
@@ -170,7 +170,7 @@
 
     if (max_basic_cap >= 7)
     {
-        x265_cpu_cpuid(7, &eax, &ebx, &ecx, &edx);
+        PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
         /* AVX2 requires OS support, but BMI1/2 don't. */
         if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
             cpu |= X265_CPU_AVX2;
@@ -185,12 +185,12 @@
     if (cpu & X265_CPU_SSSE3)
         cpu |= X265_CPU_SSE2_IS_FAST;
 
-    x265_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+    PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
     max_extended_cap = eax;
 
     if (max_extended_cap >= 0x80000001)
     {
-        x265_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+        PFX(cpu_cpuid)(0x80000001, &eax, &ebx, &ecx, &edx);
 
         if (ecx & 0x00000020)
             cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
@@ -233,7 +233,7 @@
 
     if (!strcmp((char*)vendor, "GenuineIntel"))
     {
-        x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
+        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
         int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
         int model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
         if (family == 6)
@@ -264,11 +264,11 @@
     if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
     {
         /* cacheline size is specified in 3 places, any of which may be missing */
-        x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
+        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
         int cache = (ebx & 0xff00) >> 5; // cflush size
         if (!cache && max_extended_cap >= 0x80000006)
         {
-            x265_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+            PFX(cpu_cpuid)(0x80000006, &eax, &ebx, &ecx, &edx);
             cache = ecx & 0xff; // cacheline size
         }
         if (!cache && max_basic_cap >= 2)
@@ -281,7 +281,7 @@
             int max, i = 0;
             do
             {
-                x265_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3);
+                PFX(cpu_cpuid)(2, buf + 0, buf + 1, buf + 2, buf + 3);
                 max = buf[0] & 0xff;
                 buf[0] &= ~0xff;
                 for (int j = 0; j < 4; j++)
@@ -318,8 +318,8 @@
 #elif X265_ARCH_ARM
 
 extern "C" {
-void x265_cpu_neon_test(void);
-int x265_cpu_fast_neon_mrc_test(void);
+void PFX(cpu_neon_test)(void);
+int PFX(cpu_fast_neon_mrc_test)(void);
 }
 
 uint32_t cpu_detect(void)
@@ -340,7 +340,7 @@
     }
 
     canjump = 1;
-    x265_cpu_neon_test();
+    PFX(cpu_neon_test)();
     canjump = 0;
     signal(SIGILL, oldsig);
 #endif // if !HAVE_NEON
@@ -356,7 +356,7 @@
     // which may result in incorrect detection and the counters stuck enabled.
     // right now Apple does not seem to support performance counters for this test
 #ifndef __MACH__
-    flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
+    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
 #endif
     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
 #endif // if HAVE_ARMV6

x265_1.7.tar.gz/source/common/cpu.h -> x265_1.8.tar.gz/source/common/cpu.h Changed

@@ -27,24 +27,29 @@
 
 #include "common.h"
 
+/* All assembly functions are prefixed with X265_NS (macro expanded) */
+#define PFX3(prefix, name) prefix ## _ ## name
+#define PFX2(prefix, name) PFX3(prefix, name)
+#define PFX(name)          PFX2(X265_NS, name)
+
 // from cpu-a.asm, if ASM primitives are compiled, else primitives.cpp
-extern "C" void x265_cpu_emms(void);
-extern "C" void x265_safe_intel_cpu_indicator_init(void);
+extern "C" void PFX(cpu_emms)(void);
+extern "C" void PFX(safe_intel_cpu_indicator_init)(void);
 
 #if _MSC_VER && _WIN64
-#define x265_emms() x265_cpu_emms()
+#define x265_emms() PFX(cpu_emms)()
 #elif _MSC_VER
 #include <mmintrin.h>
 #define x265_emms() _mm_empty()
 #elif __GNUC__
 // Cannot use _mm_empty() directly without compiling all the source with
 // a fixed CPU arch, which we would like to avoid at the moment
-#define x265_emms() x265_cpu_emms()
+#define x265_emms() PFX(cpu_emms)()
 #else
-#define x265_emms() x265_cpu_emms()
+#define x265_emms() PFX(cpu_emms)()
 #endif
 
-namespace x265 {
+namespace X265_NS {
 uint32_t cpu_detect(void);
 
 struct cpu_name_t

x265_1.7.tar.gz/source/common/cudata.cpp -> x265_1.8.tar.gz/source/common/cudata.cpp Changed

@@ -28,33 +28,33 @@
 #include "mv.h"
 #include "cudata.h"
 
-using namespace x265;
-
-namespace {
-// file private namespace
+using namespace X265_NS;
 
 /* for all bcast* and copy* functions, dst and src are aligned to MIN(size, 32) */
 
-void bcast1(uint8_t* dst, uint8_t val)  { dst[0] = val; }
+static void bcast1(uint8_t* dst, uint8_t val)  { dst[0] = val; }
 
-void copy4(uint8_t* dst, uint8_t* src)  { ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; }
-void bcast4(uint8_t* dst, uint8_t val)  { ((uint32_t*)dst)[0] = 0x01010101u * val; }
+static void copy4(uint8_t* dst, uint8_t* src)  { ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; }
+static void bcast4(uint8_t* dst, uint8_t val)  { ((uint32_t*)dst)[0] = 0x01010101u * val; }
 
-void copy16(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; }
-void bcast16(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val; ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; }
+static void copy16(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; }
+static void bcast16(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val; ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; }
 
-void copy64(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; 
-                                          ((uint64_t*)dst)[2] = ((uint64_t*)src)[2]; ((uint64_t*)dst)[3] = ((uint64_t*)src)[3];
-                                          ((uint64_t*)dst)[4] = ((uint64_t*)src)[4]; ((uint64_t*)dst)[5] = ((uint64_t*)src)[5];
-                                          ((uint64_t*)dst)[6] = ((uint64_t*)src)[6]; ((uint64_t*)dst)[7] = ((uint64_t*)src)[7]; }
-void bcast64(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val;
-                                          ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; ((uint64_t*)dst)[2] = bval; ((uint64_t*)dst)[3] = bval;
-                                          ((uint64_t*)dst)[4] = bval; ((uint64_t*)dst)[5] = bval; ((uint64_t*)dst)[6] = bval; ((uint64_t*)dst)[7] = bval; }
+static void copy64(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; 
+                                                 ((uint64_t*)dst)[2] = ((uint64_t*)src)[2]; ((uint64_t*)dst)[3] = ((uint64_t*)src)[3];
+                                                 ((uint64_t*)dst)[4] = ((uint64_t*)src)[4]; ((uint64_t*)dst)[5] = ((uint64_t*)src)[5];
+                                                 ((uint64_t*)dst)[6] = ((uint64_t*)src)[6]; ((uint64_t*)dst)[7] = ((uint64_t*)src)[7]; }
+static void bcast64(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val;
+                                                 ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; ((uint64_t*)dst)[2] = bval; ((uint64_t*)dst)[3] = bval;
+                                                 ((uint64_t*)dst)[4] = bval; ((uint64_t*)dst)[5] = bval; ((uint64_t*)dst)[6] = bval; ((uint64_t*)dst)[7] = bval; }
 
 /* at 256 bytes, memset/memcpy will probably use SIMD more effectively than our uint64_t hack,
  * but hand-written assembly would beat it. */
-void copy256(uint8_t* dst, uint8_t* src) { memcpy(dst, src, 256); }
-void bcast256(uint8_t* dst, uint8_t val) { memset(dst, val, 256); }
+static void copy256(uint8_t* dst, uint8_t* src) { memcpy(dst, src, 256); }
+static void bcast256(uint8_t* dst, uint8_t val) { memset(dst, val, 256); }
+
+namespace {
+// file private namespace
 
 /* Check whether 2 addresses point to the same column */
 inline bool isEqualCol(int addrA, int addrB, int numUnits)
@@ -112,38 +112,6 @@
     return MV((int16_t)mvx, (int16_t)mvy);
 }
 
-// Partition table.
-// First index is partitioning mode. Second index is partition index.
-// Third index is 0 for partition sizes, 1 for partition offsets. The 
-// sizes and offsets are encoded as two packed 4-bit values (X,Y). 
-// X and Y represent 1/4 fractions of the block size.
-const uint32_t partTable[8][4][2] =
-{
-    //        XY
-    { { 0x44, 0x00 }, { 0x00, 0x00 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2Nx2N.
-    { { 0x42, 0x00 }, { 0x42, 0x02 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxN.
-    { { 0x24, 0x00 }, { 0x24, 0x20 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_Nx2N.
-    { { 0x22, 0x00 }, { 0x22, 0x20 }, { 0x22, 0x02 }, { 0x22, 0x22 } }, // SIZE_NxN.
-    { { 0x41, 0x00 }, { 0x43, 0x01 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxnU.
-    { { 0x43, 0x00 }, { 0x41, 0x03 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxnD.
-    { { 0x14, 0x00 }, { 0x34, 0x10 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_nLx2N.
-    { { 0x34, 0x00 }, { 0x14, 0x30 }, { 0x00, 0x00 }, { 0x00, 0x00 } }  // SIZE_nRx2N.
-};
-
-// Partition Address table.
-// First index is partitioning mode. Second index is partition address.
-const uint32_t partAddrTable[8][4] =
-{
-    { 0x00, 0x00, 0x00, 0x00 }, // SIZE_2Nx2N.
-    { 0x00, 0x08, 0x08, 0x08 }, // SIZE_2NxN.
-    { 0x00, 0x04, 0x04, 0x04 }, // SIZE_Nx2N.
-    { 0x00, 0x04, 0x08, 0x0C }, // SIZE_NxN.
-    { 0x00, 0x02, 0x02, 0x02 }, // SIZE_2NxnU.
-    { 0x00, 0x0A, 0x0A, 0x0A }, // SIZE_2NxnD.
-    { 0x00, 0x01, 0x01, 0x01 }, // SIZE_nLx2N.
-    { 0x00, 0x05, 0x05, 0x05 }  // SIZE_nRx2N.
-};
-
 }
 
 cubcast_t CUData::s_partSet[NUM_FULL_DEPTH] = { NULL, NULL, NULL, NULL, NULL };

x265_1.7.tar.gz/source/common/cudata.h -> x265_1.8.tar.gz/source/common/cudata.h Changed

@@ -28,7 +28,7 @@
 #include "slice.h"
 #include "mv.h"
 
-namespace x265 {
+namespace X265_NS {
 // private namespace
 
 class FrameData;
@@ -121,6 +121,38 @@
 // Partition count table, index represents partitioning mode.
 const uint32_t nbPartsTable[8] = { 1, 2, 2, 4, 2, 2, 2, 2 };
 
+// Partition table.
+// First index is partitioning mode. Second index is partition index.
+// Third index is 0 for partition sizes, 1 for partition offsets. The 
+// sizes and offsets are encoded as two packed 4-bit values (X,Y). 
+// X and Y represent 1/4 fractions of the block size.
+const uint32_t partTable[8][4][2] =
+{
+    //        XY
+    { { 0x44, 0x00 }, { 0x00, 0x00 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2Nx2N.
+    { { 0x42, 0x00 }, { 0x42, 0x02 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxN.
+    { { 0x24, 0x00 }, { 0x24, 0x20 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_Nx2N.
+    { { 0x22, 0x00 }, { 0x22, 0x20 }, { 0x22, 0x02 }, { 0x22, 0x22 } }, // SIZE_NxN.
+    { { 0x41, 0x00 }, { 0x43, 0x01 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxnU.
+    { { 0x43, 0x00 }, { 0x41, 0x03 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_2NxnD.
+    { { 0x14, 0x00 }, { 0x34, 0x10 }, { 0x00, 0x00 }, { 0x00, 0x00 } }, // SIZE_nLx2N.
+    { { 0x34, 0x00 }, { 0x14, 0x30 }, { 0x00, 0x00 }, { 0x00, 0x00 } }  // SIZE_nRx2N.
+};
+
+// Partition Address table.
+// First index is partitioning mode. Second index is partition address.
+const uint32_t partAddrTable[8][4] =
+{
+    { 0x00, 0x00, 0x00, 0x00 }, // SIZE_2Nx2N.
+    { 0x00, 0x08, 0x08, 0x08 }, // SIZE_2NxN.
+    { 0x00, 0x04, 0x04, 0x04 }, // SIZE_Nx2N.
+    { 0x00, 0x04, 0x08, 0x0C }, // SIZE_NxN.
+    { 0x00, 0x02, 0x02, 0x02 }, // SIZE_2NxnU.
+    { 0x00, 0x0A, 0x0A, 0x0A }, // SIZE_2NxnD.
+    { 0x00, 0x01, 0x01, 0x01 }, // SIZE_nLx2N.
+    { 0x00, 0x05, 0x05, 0x05 }  // SIZE_nRx2N.
+};
+
 // Holds part data for a CU of a given size, from an 8x8 CU to a CTU
 class CUData
 {
@@ -222,8 +254,11 @@
     void     getNeighbourMV(uint32_t puIdx, uint32_t absPartIdx, InterNeighbourMV* neighbours) const;
     void     getIntraTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const;
     void     getInterTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const;
+    uint32_t getBestRefIdx(uint32_t subPartIdx) const { return ((m_interDir[subPartIdx] & 1) << m_refIdx[0][subPartIdx]) | 
+                                                              (((m_interDir[subPartIdx] >> 1) & 1) << (m_refIdx[1][subPartIdx] + 16)); }
+    uint32_t getPUOffset(uint32_t puIdx, uint32_t absPartIdx) const { return (partAddrTable[(int)m_partSize[absPartIdx]][puIdx] << (g_unitSizeDepth - m_cuDepth[absPartIdx]) * 2) >> 4; }
 
-    uint32_t getNumPartInter() const              { return nbPartsTable[(int)m_partSize[0]]; }
+    uint32_t getNumPartInter(uint32_t absPartIdx) const              { return nbPartsTable[(int)m_partSize[absPartIdx]]; }
     bool     isIntra(uint32_t absPartIdx) const   { return m_predMode[absPartIdx] == MODE_INTRA; }
     bool     isInter(uint32_t absPartIdx) const   { return !!(m_predMode[absPartIdx] & MODE_INTER); }
     bool     isSkipped(uint32_t absPartIdx) const { return m_predMode[absPartIdx] == MODE_SKIP; }

x265_1.7.tar.gz/source/common/dct.cpp -> x265_1.8.tar.gz/source/common/dct.cpp Changed

@@ -29,19 +29,18 @@
 
 #include "common.h"
 #include "primitives.h"
+#include "contexts.h"   // costCoeffNxN_c
+#include "threading.h"  // CLZ
 
-using namespace x265;
+using namespace X265_NS;
 
 #if _MSC_VER
 #pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
 #endif
 
-namespace {
-// anonymous file-static namespace
-
 // Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
 // give identical results
-void fastForwardDst(const int16_t* block, int16_t* coeff, int shift)  // input block, output coeff
+static void fastForwardDst(const int16_t* block, int16_t* coeff, int shift)  // input block, output coeff
 {
     int c[4];
     int rnd_factor = 1 << (shift - 1);
@@ -61,7 +60,7 @@
     }
 }
 
-void inversedst(const int16_t* tmp, int16_t* block, int shift)  // input tmp, output block
+static void inversedst(const int16_t* tmp, int16_t* block, int shift)  // input tmp, output block
 {
     int i, c[4];
     int rnd_factor = 1 << (shift - 1);
@@ -81,7 +80,7 @@
     }
 }
 
-void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
+static void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j, k;
     int E[8], O[8];
@@ -134,7 +133,7 @@
     }
 }
 
-void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
+static void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j, k;
     int E[16], O[16];
@@ -203,7 +202,7 @@
     }
 }
 
-void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
+static void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j, k;
     int E[4], O[4];
@@ -240,7 +239,7 @@
     }
 }
 
-void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
+static void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j;
     int E[2], O[2];
@@ -265,7 +264,7 @@
     }
 }
 
-void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
+static void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j, k;
     int E[4], O[4];
@@ -301,7 +300,7 @@
     }
 }
 
-void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
+static void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j, k;
     int E[8], O[8];
@@ -352,7 +351,7 @@
     }
 }
 
-void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
+static void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j, k;
     int E[16], O[16];
@@ -416,7 +415,7 @@
     }
 }
 
-void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
+static void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
 {
     int j;
     int E[2], O[2];
@@ -440,7 +439,7 @@
     }
 }
 
-void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+static void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 1 + X265_DEPTH - 8;
     const int shift_2nd = 8;
@@ -457,7 +456,7 @@
     fastForwardDst(coef, dst, shift_2nd);
 }
 
-void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+static void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 1 + X265_DEPTH - 8;
     const int shift_2nd = 8;
@@ -474,7 +473,7 @@
     partialButterfly4(coef, dst, shift_2nd, 4);
 }
 
-void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+static void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 2 + X265_DEPTH - 8;
     const int shift_2nd = 9;
@@ -491,7 +490,7 @@
     partialButterfly8(coef, dst, shift_2nd, 8);
 }
 
-void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+static void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 3 + X265_DEPTH - 8;
     const int shift_2nd = 10;
@@ -508,7 +507,7 @@
     partialButterfly16(coef, dst, shift_2nd, 16);
 }
 
-void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+static void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 4 + X265_DEPTH - 8;
     const int shift_2nd = 11;
@@ -525,7 +524,7 @@
     partialButterfly32(coef, dst, shift_2nd, 32);
 }
 
-void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
+static void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -542,7 +541,7 @@
     }
 }
 
-void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
+static void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -559,7 +558,7 @@
     }
 }
 
-void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
+static void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -576,7 +575,7 @@
     }
 }
 
-void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
+static void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -593,7 +592,7 @@
     }
 }
 
-void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
+static void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -610,10 +609,10 @@
     }
 }
 
-void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)

x265_1.7.tar.gz/source/common/deblock.cpp -> x265_1.8.tar.gz/source/common/deblock.cpp Changed

x265_1.7.tar.gz/source/common/deblock.h -> x265_1.8.tar.gz/source/common/deblock.h Changed

x265_1.7.tar.gz/source/common/frame.cpp -> x265_1.8.tar.gz/source/common/frame.cpp Changed

x265_1.7.tar.gz/source/common/frame.h -> x265_1.8.tar.gz/source/common/frame.h Changed

x265_1.7.tar.gz/source/common/framedata.cpp -> x265_1.8.tar.gz/source/common/framedata.cpp Changed

x265_1.7.tar.gz/source/common/framedata.h -> x265_1.8.tar.gz/source/common/framedata.h Changed

@@ -28,12 +28,61 @@
 #include "slice.h"
 #include "cudata.h"
 
-namespace x265 {
+namespace X265_NS {
 // private namespace
 
 class PicYuv;
 class JobProvider;
 
+#define INTER_MODES 4 // 2Nx2N, 2NxN, Nx2N, AMP modes
+#define INTRA_MODES 3 // DC, Planar, Angular modes
+
+/* Current frame stats for 2 pass */
+struct FrameStats
+{
+    int         mvBits;    /* MV bits (MV+Ref+Block Type) */
+    int         coeffBits; /* Texture bits (DCT coefs) */
+    int         miscBits;
+
+    int         intra8x8Cnt;
+    int         inter8x8Cnt;
+    int         skip8x8Cnt;
+
+    /* CU type counts stored as percentage */
+    double      percent8x8Intra;
+    double      percent8x8Inter;
+    double      percent8x8Skip;
+    double      avgLumaDistortion;
+    double      avgChromaDistortion;
+    double      avgPsyEnergy;
+    double      avgLumaLevel;
+    double      lumaLevel;
+    double      percentIntraNxN;
+    double      percentSkipCu[NUM_CU_DEPTH];
+    double      percentMergeCu[NUM_CU_DEPTH];
+    double      percentIntraDistribution[NUM_CU_DEPTH][INTRA_MODES];
+    double      percentInterDistribution[NUM_CU_DEPTH][3];           // 2Nx2N, RECT, AMP modes percentage
+
+    uint64_t    cntIntraNxN;
+    uint64_t    totalCu;
+    uint64_t    totalCtu;
+    uint64_t    lumaDistortion;
+    uint64_t    chromaDistortion;
+    uint64_t    psyEnergy;
+    uint64_t    cntSkipCu[NUM_CU_DEPTH];
+    uint64_t    cntMergeCu[NUM_CU_DEPTH];
+    uint64_t    cntInter[NUM_CU_DEPTH];
+    uint64_t    cntIntra[NUM_CU_DEPTH];
+    uint64_t    cuInterDistribution[NUM_CU_DEPTH][INTER_MODES];
+    uint64_t    cuIntraDistribution[NUM_CU_DEPTH][INTRA_MODES];
+    uint16_t    maxLumaLevel;
+
+    FrameStats()
+    {
+        memset(this, 0, sizeof(FrameStats));
+    }
+};
+
 /* Per-frame data that is used during encodes and referenced while the picture
  * is available for reference. A FrameData instance is attached to a Frame as it
  * comes out of the lookahead. Frames which are not being encoded do not have a
@@ -85,6 +134,7 @@
 
     RCStatCU*      m_cuStat;
     RCStatRow*     m_rowStat;
+    FrameStats     m_frameStats; // stats of current frame for multi-pass encodes
 
     double         m_avgQpRc;    /* avg QP as decided by rate-control */
     double         m_avgQpAq;    /* avg QP as decided by AQ in addition to rate-control */

x265_1.7.tar.gz/source/common/intrapred.cpp -> x265_1.8.tar.gz/source/common/intrapred.cpp Changed

x265_1.7.tar.gz/source/common/ipfilter.cpp -> x265_1.8.tar.gz/source/common/ipfilter.cpp Changed

x265_1.7.tar.gz/source/common/loopfilter.cpp -> x265_1.8.tar.gz/source/common/loopfilter.cpp Changed

@@ -36,13 +36,13 @@
     return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
 }
 
-void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
+static void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
 {
     for (int x = 0; x < endX; x++)
         dst[x] = signOf(src1[x] - src2[x]);
 }
 
-void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride)
+static void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride)
 {
     int x, y;
     int8_t signRight, signLeft0;
@@ -62,7 +62,7 @@
     }
 }
 
-void processSaoCUE1(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
+static void processSaoCUE1(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
 {
     int x;
     int8_t signDown;
@@ -77,7 +77,7 @@
     }
 }
 
-void processSaoCUE1_2Rows(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
+static void processSaoCUE1_2Rows(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
 {
     int x, y;
     int8_t signDown;
@@ -96,7 +96,7 @@
     }
 }
 
-void processSaoCUE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
+static void processSaoCUE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
 {
     int x;
     for (x = 0; x < width; x++)
@@ -108,7 +108,7 @@
     }
 }
 
-void processSaoCUE3(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
+static void processSaoCUE3(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX)
 {
     int8_t signDown;
     int8_t edgeType;
@@ -122,7 +122,7 @@
     }
 }
 
-void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
+static void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
 {
     #define SAO_BO_BITS 5
     const int boShift = X265_DEPTH - SAO_BO_BITS;
@@ -138,7 +138,7 @@
 }
 }
 
-namespace x265 {
+namespace X265_NS {
 void setupLoopFilterPrimitives_c(EncoderPrimitives &p)
 {
     p.saoCuOrgE0 = processSaoCUE0;

x265_1.7.tar.gz/source/common/lowres.cpp -> x265_1.8.tar.gz/source/common/lowres.cpp Changed

@@ -25,7 +25,7 @@
 #include "lowres.h"
 #include "mv.h"
 
-using namespace x265;
+using namespace X265_NS;
 
 bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled)
 {
@@ -36,13 +36,13 @@
     lumaStride = width + 2 * origPic->m_lumaMarginX;
     if (lumaStride & 31)
         lumaStride += 32 - (lumaStride & 31);
-    int cuWidth = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    int cuHeight = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    int cuCount = cuWidth * cuHeight;
+    maxBlocksInRow = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    maxBlocksInCol = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    int cuCount = maxBlocksInRow * maxBlocksInCol;
 
     /* rounding the width to multiple of lowres CU size */
-    width = cuWidth * X265_LOWRES_CU_SIZE;
-    lines = cuHeight * X265_LOWRES_CU_SIZE;
+    width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
+    lines = maxBlocksInCol * X265_LOWRES_CU_SIZE;
 
     size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
     size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
@@ -74,7 +74,7 @@
     {
         for (int j = 0; j < bframes + 2; j++)
         {
-            CHECKED_MALLOC(rowSatds[i][j], int32_t, cuHeight);
+            CHECKED_MALLOC(rowSatds[i][j], int32_t, maxBlocksInCol);
             CHECKED_MALLOC(lowresCosts[i][j], uint16_t, cuCount);
         }
     }
@@ -126,7 +126,7 @@
 void Lowres::init(PicYuv *origPic, int poc)
 {
     bLastMiniGopBFrame = false;
-    bScenecut = true;  // could be a scene-cut, until ruled out by flash detection
+    bScenecut = false;  // could be a scene-cut, until ruled out by flash detection
     bKeyframe = false; // Not a keyframe unless identified by lookahead
     frameNum = poc;
     leadingBframes = 0;

x265_1.7.tar.gz/source/common/lowres.h -> x265_1.8.tar.gz/source/common/lowres.h Changed

x265_1.7.tar.gz/source/common/md5.cpp -> x265_1.8.tar.gz/source/common/md5.cpp Changed

x265_1.7.tar.gz/source/common/md5.h -> x265_1.8.tar.gz/source/common/md5.h Changed

x265_1.7.tar.gz/source/common/mv.h -> x265_1.8.tar.gz/source/common/mv.h Changed

x265_1.7.tar.gz/source/common/param.cpp -> x265_1.8.tar.gz/source/common/param.cpp Changed

@@ -52,7 +52,7 @@
  */
 
 #undef strtok_r
-char* strtok_r(char* str, const char* delim, char** nextp)
+static char* strtok_r(char* str, const char* delim, char** nextp)
 {
     if (!str)
         str = *nextp;
@@ -76,27 +76,35 @@
 
 #endif // if !defined(HAVE_STRTOK_R)
 
-using namespace x265;
+#if EXPORT_C_API
+
+/* these functions are exported as C functions (default) */
+using namespace X265_NS;
+extern "C" {
+
+#else
+
+/* these functions exist within private namespace (multilib) */
+namespace X265_NS {
+
+#endif
 
-extern "C"
 x265_param *x265_param_alloc()
 {
     return (x265_param*)x265_malloc(sizeof(x265_param));
 }
 
-extern "C"
 void x265_param_free(x265_param* p)
 {
     x265_free(p);
 }
 
-extern "C"
 void x265_param_default(x265_param* param)
 {
     memset(param, 0, sizeof(x265_param));
 
     /* Applying default values to all elements in the param structure */
-    param->cpuid = x265::cpu_detect();
+    param->cpuid = X265_NS::cpu_detect();
     param->bEnableWavefront = 1;
     param->frameNumThreads = 0;
 
@@ -111,7 +119,7 @@
     param->bEnableSsim = 0;
 
     /* Source specifications */
-    param->internalBitDepth = x265_max_bit_depth;
+    param->internalBitDepth = X265_DEPTH;
     param->internalCsp = X265_CSP_I420;
 
     param->levelIdc = 0;
@@ -151,6 +159,7 @@
     param->subpelRefine = 2;
     param->searchRange = 57;
     param->maxNumMergeCand = 2;
+    param->limitReferences = 0;
     param->bEnableWeightedPred = 1;
     param->bEnableWeightedBiPred = 0;
     param->bEnableEarlySkip = 0;
@@ -197,6 +206,7 @@
     param->rc.rateControlMode = X265_RC_CRF;
     param->rc.qp = 32;
     param->rc.aqMode = X265_AQ_VARIANCE;
+    param->rc.qgSize = 32;
     param->rc.aqStrength = 1.0;
     param->rc.cuTree = 1;
     param->rc.rfConstantMax = 0;
@@ -210,7 +220,6 @@
     param->rc.zones = NULL;
     param->rc.bEnableSlowFirstPass = 0;
     param->rc.bStrictCbr = 0;
-    param->rc.qgSize = 64; /* Same as maxCUSize */
 
     /* Video Usability Information (VUI) */
     param->vui.aspectRatioIdc = 0;
@@ -234,10 +243,13 @@
     param->vui.defDispWinBottomOffset = 0;
 }
 
-extern "C"
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
 {
-    x265_param_default(param);
+#if EXPORT_C_API
+    ::x265_param_default(param);
+#else
+    X265_NS::x265_param_default(param);
+#endif
 
     if (preset)
     {
@@ -430,8 +442,8 @@
             param->deblockingFilterBetaOffset = -2;
             param->deblockingFilterTCOffset = -2;
             param->bIntraInBFrames = 0;
-            param->rdoqLevel = 1;
-            param->psyRdoq = 30;
+            param->rdoqLevel = 2;
+            param->psyRdoq = 10.0;
             param->psyRd = 0.5;
             param->rc.ipFactor = 1.1;
             param->rc.pbFactor = 1.1;
@@ -459,16 +471,6 @@
     return 0;
 }
 
-static double x265_atof(const char* str, bool& bError)
-{
-    char *end;
-    double v = strtod(str, &end);
-
-    if (end == str || *end != '\0')
-        bError = true;
-    return v;
-}
-
 static int parseName(const char* arg, const char* const* names, bool& bError)
 {
     for (int i = 0; names[i]; i++)
@@ -485,7 +487,6 @@
 #define atof(str) x265_atof(str, bError)
 #define atobool(str) (bNameWasBool = true, x265_atobool(str, bError))
 
-extern "C"
 int x265_param_parse(x265_param* p, const char* name, const char* value)
 {
     bool bError = false;
@@ -581,6 +582,7 @@
         }
     }
     OPT("cu-stats") p->bLogCuStats = atobool(value);
+    OPT("total-frames") p->totalFrames = atoi(value);
     OPT("annexb") p->bAnnexB = atobool(value);
     OPT("repeat-headers") p->bRepeatHeaders = atobool(value);
     OPT("wpp") p->bEnableWavefront = atobool(value);
@@ -641,6 +643,7 @@
         }
     }
     OPT("ref") p->maxNumReferences = atoi(value);
+    OPT("limit-refs") p->limitReferences = atoi(value);
     OPT("weightp") p->bEnableWeightedPred = atobool(value);
     OPT("weightb") p->bEnableWeightedBiPred = atobool(value);
     OPT("cbqpoffs") p->cbQpOffset = atoi(value);
@@ -827,7 +830,7 @@
         p->vui.chromaSampleLocTypeTopField = atoi(value);
         p->vui.chromaSampleLocTypeBottomField = p->vui.chromaSampleLocTypeTopField;
     }
-    OPT("crop-rect")
+    OPT2("display-window", "crop-rect")
     {
         p->vui.bEnableDefaultDisplayWindowFlag = 1;
         bError |= sscanf(value, "%d,%d,%d,%d",
@@ -845,7 +848,6 @@
         p->rc.bStatRead = pass & 2;
     }
     OPT("stats") p->rc.statFileName = strdup(value);
-    OPT("csv") p->csvfn = strdup(value);
     OPT("scaling-list") p->scalingLists = strdup(value);
     OPT2("pools", "numa-pools") p->numaPools = strdup(value);
     OPT("lambda-file") p->rc.lambdaFileName = strdup(value);
@@ -864,7 +866,9 @@
     return bError ? X265_PARAM_BAD_VALUE : 0;
 }
 
-namespace x265 {
+} /* end extern "C" or namespace */
+
+namespace X265_NS {
 // internal encoder functions
 
 int x265_atoi(const char* str, bool& bError)
@@ -877,6 +881,16 @@
     return v;
 }
 
+double x265_atof(const char* str, bool& bError)
+{
+    char *end;
+    double v = strtod(str, &end);
+
+    if (end == str || *end != '\0')
+        bError = true;
+    return v;
+}
+
 /* cpu name can be:
  *   auto || true - x265::cpu_detect()
  *   false || no  - disabled
@@ -893,7 +907,7 @@
     if (isdigit(value[0]))
         cpu = x265_atoi(value, bError);
     else

x265_1.7.tar.gz/source/common/param.h -> x265_1.8.tar.gz/source/common/param.h Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -24,7 +25,8 @@
 #ifndef X265_PARAM_H
 #define X265_PARAM_H
 
-namespace x265 {
+namespace X265_NS {
+
 int   x265_check_params(x265_param *param);
 int   x265_set_globals(x265_param *param);
 void  x265_print_params(x265_param *param);
@@ -32,13 +34,27 @@
 void  x265_param_apply_fastfirstpass(x265_param *p);
 char* x265_param2string(x265_param *param);
 int   x265_atoi(const char *str, bool& bError);
+double x265_atof(const char *str, bool& bError);
 int   parseCpuName(const char *value, bool& bError);
 void  setParamAspectRatio(x265_param *p, int width, int height);
 void  getParamAspectRatio(x265_param *p, int& width, int& height);
 bool  parseLambdaFile(x265_param *param);
 
 /* this table is kept internal to avoid confusion, since log level indices start at -1 */
-static const char * const logLevelNames[] = { "none", "error", "warning", "info", "frame", "debug", "full", 0 };
+static const char * const logLevelNames[] = { "none", "error", "warning", "info", "debug", "full", 0 };
+
+#if EXPORT_C_API
+#define PARAM_NS
+#else
+/* declare param functions within private namespace */
+void x265_param_free(x265_param *);
+x265_param* x265_param_alloc();
+void x265_param_default(x265_param *param);
+int x265_param_default_preset(x265_param *, const char *preset, const char *tune);
+int x265_param_apply_profile(x265_param *, const char *profile);
+int x265_param_parse(x265_param *p, const char *name, const char *value);
+#define PARAM_NS X265_NS
+#endif
 
 #define MAXPARAMSIZE 2000
 }

x265_1.7.tar.gz/source/common/piclist.cpp -> x265_1.8.tar.gz/source/common/piclist.cpp Changed

x265_1.7.tar.gz/source/common/piclist.h -> x265_1.8.tar.gz/source/common/piclist.h Changed

x265_1.7.tar.gz/source/common/picyuv.cpp -> x265_1.8.tar.gz/source/common/picyuv.cpp Changed

@@ -26,7 +26,7 @@
 #include "slice.h"
 #include "primitives.h"
 
-using namespace x265;
+using namespace X265_NS;
 
 PicYuv::PicYuv()
 {
@@ -148,52 +148,62 @@
     padx++;
     pady++;
 
-    if (pic.bitDepth < X265_DEPTH)
-    {
-        pixel *yPixel = m_picOrg[0];
-        pixel *uPixel = m_picOrg[1];
-        pixel *vPixel = m_picOrg[2];
+    X265_CHECK(pic.bitDepth >= 8, "pic.bitDepth check failure");
 
-        uint8_t *yChar = (uint8_t*)pic.planes[0];
-        uint8_t *uChar = (uint8_t*)pic.planes[1];
-        uint8_t *vChar = (uint8_t*)pic.planes[2];
-        int shift = X265_MAX(0, X265_DEPTH - pic.bitDepth);
-
-        primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
-        primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
-        primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
-    }
-    else if (pic.bitDepth == 8)
+    if (pic.bitDepth == 8)
     {
-        pixel *yPixel = m_picOrg[0];
-        pixel *uPixel = m_picOrg[1];
-        pixel *vPixel = m_picOrg[2];
+#if (X265_DEPTH > 8)
+        {
+            pixel *yPixel = m_picOrg[0];
+            pixel *uPixel = m_picOrg[1];
+            pixel *vPixel = m_picOrg[2];
+
+            uint8_t *yChar = (uint8_t*)pic.planes[0];
+            uint8_t *uChar = (uint8_t*)pic.planes[1];
+            uint8_t *vChar = (uint8_t*)pic.planes[2];
+            int shift = (X265_DEPTH - 8);
+
+            primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
+            primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+            primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+        }
+#else /* Case for (X265_DEPTH == 8) */
+        // TODO: Does we need this path? may merge into above in future
+        {
+            pixel *yPixel = m_picOrg[0];
+            pixel *uPixel = m_picOrg[1];
+            pixel *vPixel = m_picOrg[2];
 
-        uint8_t *yChar = (uint8_t*)pic.planes[0];
-        uint8_t *uChar = (uint8_t*)pic.planes[1];
-        uint8_t *vChar = (uint8_t*)pic.planes[2];
+            uint8_t *yChar = (uint8_t*)pic.planes[0];
+            uint8_t *uChar = (uint8_t*)pic.planes[1];
+            uint8_t *vChar = (uint8_t*)pic.planes[2];
 
-        for (int r = 0; r < height; r++)
-        {
-            memcpy(yPixel, yChar, width * sizeof(pixel));
+            for (int r = 0; r < height; r++)
+            {
+                memcpy(yPixel, yChar, width * sizeof(pixel));
 
-            yPixel += m_stride;
-            yChar += pic.stride[0] / sizeof(*yChar);
-        }
+                yPixel += m_stride;
+                yChar += pic.stride[0] / sizeof(*yChar);
+            }
 
-        for (int r = 0; r < height >> m_vChromaShift; r++)
-        {
-            memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
-            memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
+            for (int r = 0; r < height >> m_vChromaShift; r++)
+            {
+                memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
+                memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
 
-            uPixel += m_strideC;
-            vPixel += m_strideC;
-            uChar += pic.stride[1] / sizeof(*uChar);
-            vChar += pic.stride[2] / sizeof(*vChar);
+                uPixel += m_strideC;
+                vPixel += m_strideC;
+                uChar += pic.stride[1] / sizeof(*uChar);
+                vChar += pic.stride[2] / sizeof(*vChar);
+            }
         }
+#endif /* (X265_DEPTH > 8) */
     }
     else /* pic.bitDepth > 8 */
     {
+        /* defensive programming, mask off bits that are supposed to be zero */
+        uint16_t mask = (1 << X265_DEPTH) - 1;
+        int shift = abs(pic.bitDepth - X265_DEPTH);
         pixel *yPixel = m_picOrg[0];
         pixel *uPixel = m_picOrg[1];
         pixel *vPixel = m_picOrg[2];
@@ -202,15 +212,20 @@
         uint16_t *uShort = (uint16_t*)pic.planes[1];
         uint16_t *vShort = (uint16_t*)pic.planes[2];
 
-        /* defensive programming, mask off bits that are supposed to be zero */
-        uint16_t mask = (1 << X265_DEPTH) - 1;
-        int shift = X265_MAX(0, pic.bitDepth - X265_DEPTH);
-
-        /* shift and mask pixels to final size */
-
-        primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
-        primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
-        primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+        if (pic.bitDepth > X265_DEPTH)
+        {
+            /* shift right and mask pixels to final size */
+            primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
+            primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+            primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+        }
+        else /* Case for (pic.bitDepth <= X265_DEPTH) */
+        {
+            /* shift left and mask pixels to final size */
+            primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
+            primitives.planecopy_sp_shl(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+            primitives.planecopy_sp_shl(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+        }
     }
 
     /* extend the right edge if width was not multiple of the minimum CU size */
@@ -259,7 +274,7 @@
     }
 }
 
-namespace x265 {
+namespace X265_NS {
 
 template<uint32_t OUTPUT_BITDEPTH_DIV8>
 static void md5_block(MD5Context& md5, const pixel* plane, uint32_t n)

x265_1.7.tar.gz/source/common/picyuv.h -> x265_1.8.tar.gz/source/common/picyuv.h Changed

x265_1.7.tar.gz/source/common/pixel.cpp -> x265_1.8.tar.gz/source/common/pixel.cpp Changed

@@ -30,7 +30,7 @@
 
 #include <cstdlib> // abs()
 
-using namespace x265;
+using namespace X265_NS;
 
 namespace {
 // place functions in anonymous namespace (file static)
@@ -117,9 +117,9 @@
 }
 
 template<int lx, int ly, class T1, class T2>
-int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
+sse_ret_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
 {
-    int sum = 0;
+    sse_ret_t sum = 0;
     int tmp;
 
     for (int y = 0; y < ly; y++)
@@ -159,7 +159,7 @@
     return (a + s) ^ s;
 }
 
-int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+static int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
 {
     sum2_t tmp[4][2];
     sum2_t a0, a1, a2, a3, b0, b1;
@@ -219,7 +219,7 @@
 }
 
 // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
-int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+static int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
 {
     sum2_t tmp[4][4];
     sum2_t a0, a1, a2, a3;
@@ -308,7 +308,7 @@
     return (int)sum;
 }
 
-int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
+inline int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
 {
     return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
 }
@@ -359,12 +359,12 @@
     return (int)sum;
 }
 
-int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
+static int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
 {
     return (int)((_sa8d_8x8(pix1, i_pix1) + 2) >> 2);
 }
 
-int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
+static int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
 {
     int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
         + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
@@ -516,7 +516,7 @@
             dst[k * blockSize + l] = src[l * stride + k];
 }
 
-void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+static void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
 {
     int x, y;
 
@@ -541,7 +541,7 @@
     }
 }
 
-void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
+static void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
 {
     int x, y;
 
@@ -582,7 +582,7 @@
     }
 }
 
-void scale1D_128to64(pixel *dst, const pixel *src)
+static void scale1D_128to64(pixel *dst, const pixel *src)
 {
     int x;
     const pixel* src1 = src;
@@ -608,7 +608,7 @@
     }
 }
 
-void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
+static void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
 {
     uint32_t x, y;
 
@@ -627,6 +627,7 @@
     }
 }
 
+static
 void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
                             intptr_t src_stride, intptr_t dst_stride, int width, int height)
 {
@@ -653,7 +654,7 @@
 }
 
 /* structural similarity metric */
-void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
+static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
 {
     for (int z = 0; z < 2; z++)
     {
@@ -681,7 +682,7 @@
     }
 }
 
-float ssim_end_1(int s1, int s2, int ss, int s12)
+static float ssim_end_1(int s1, int s2, int ss, int s12)
 {
 /* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
  * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
@@ -689,7 +690,7 @@
 
 #define PIXEL_MAX ((1 << X265_DEPTH) - 1)
 #if HIGH_BIT_DEPTH
-    X265_CHECK(X265_DEPTH == 10, "ssim invalid depth\n");
+    X265_CHECK((X265_DEPTH == 10) || (X265_DEPTH == 12), "ssim invalid depth\n");
 #define type float
     static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64);
     static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63);
@@ -711,7 +712,7 @@
 #undef PIXEL_MAX
 }
 
-float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
+static float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
 {
     float ssim = 0.0;
 
@@ -920,7 +921,7 @@
     }
 }
 
-void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
+static void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
 {
     for (int r = 0; r < height; r++)
     {
@@ -932,7 +933,7 @@
     }
 }
 
-void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+static void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 {
     for (int r = 0; r < height; r++)
     {
@@ -944,9 +945,21 @@
     }
 }
 
+/* Copy a plane of 16-bit input samples into a pixel plane. Each sample is
+ * shifted left by 'shift' bits and then ANDed with 'mask' before being stored.
+ * Both strides are counted in elements of their respective buffers. */
+static void planecopy_sp_shl_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+{
+    const uint16_t* srcRow = src;
+    pixel* dstRow = dst;
+
+    for (int row = 0; row < height; row++, srcRow += srcStride, dstRow += dstStride)
+    {
+        for (int col = 0; col < width; col++)
+            dstRow[col] = (pixel)((srcRow[col] << shift) & mask);
+    }
+}
+
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given CU. */
-void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
+static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
                              const int32_t* invQscales, const double* fpsFactor, int len)
 {
     double fps = *fpsFactor / 256;
@@ -962,7 +975,7 @@
 }
 }  // end anonymous namespace
 
-namespace x265 {
+namespace X265_NS {
 // x265 private namespace
 
 /* Extend the edges of a picture so that it may safely be used for motion
@@ -1244,6 +1257,7 @@
 
     p.planecopy_cp = planecopy_cp_c;
     p.planecopy_sp = planecopy_sp_c;
+    p.planecopy_sp_shl = planecopy_sp_shl_c;
     p.propagateCost = estimateCUPropagateCost;
 }

x265_1.7.tar.gz/source/common/predict.cpp -> x265_1.8.tar.gz/source/common/predict.cpp Changed

@@ -28,7 +28,7 @@
 #include "predict.h"
 #include "primitives.h"
 
-using namespace x265;
+using namespace X265_NS;
 
 #if _MSC_VER
 #pragma warning(disable: 4127) // conditional expression is constant
@@ -776,30 +776,17 @@
         // Fill left & below-left samples
         adiTemp += picStride;
         adi--;
-        pNeighborFlags--;
-        for (int j = 0; j < leftUnits; j++)
+        // NOTE: over copy here, but reduce condition operators
+        for (int j = 0; j < leftUnits * unitHeight; j++)
         {
-            if (*pNeighborFlags)
-                for (int i = 0; i < unitHeight; i++)
-                    adi[-i] = adiTemp[i * picStride];
-
-            adiTemp += unitHeight * picStride;
-            adi -= unitHeight;
-            pNeighborFlags--;
+            adi[-j] = adiTemp[j * picStride];
         }
 
         // Fill above & above-right samples
         adiTemp = adiOrigin - picStride;
         adi = adiLineBuffer + (leftUnits * unitHeight) + unitWidth;
-        pNeighborFlags = bNeighborFlags + leftUnits + 1;
-        for (int j = 0; j < aboveUnits; j++)
-        {
-            if (*pNeighborFlags)
-                memcpy(adi, adiTemp, unitWidth * sizeof(*adiTemp));
-            adiTemp += unitWidth;
-            adi += unitWidth;
-            pNeighborFlags++;
-        }
+        // NOTE: over copy here, but reduce condition operators
+        memcpy(adi, adiTemp, aboveUnits * unitWidth * sizeof(*adiTemp));
 
         // Pad reference samples when necessary
         int curr = 0;

x265_1.7.tar.gz/source/common/predict.h -> x265_1.8.tar.gz/source/common/predict.h Changed

x265_1.7.tar.gz/source/common/primitives.cpp -> x265_1.8.tar.gz/source/common/primitives.cpp Changed

@@ -24,7 +24,7 @@
 #include "common.h"
 #include "primitives.h"
 
-namespace x265 {
+namespace X265_NS {
 // x265 private namespace
 
 extern const uint8_t lumaPartitionMapTable[] =
@@ -56,6 +56,7 @@
 void setupFilterPrimitives_c(EncoderPrimitives &p);
 void setupIntraPrimitives_c(EncoderPrimitives &p);
 void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
+void setupSaoPrimitives_c(EncoderPrimitives &p);
 
 void setupCPrimitives(EncoderPrimitives &p)
 {
@@ -64,6 +65,7 @@
     setupFilterPrimitives_c(p);     // ipfilter.cpp
     setupIntraPrimitives_c(p);      // intrapred.cpp
     setupLoopFilterPrimitives_c(p); // loopfilter.cpp
+    setupSaoPrimitives_c(p);        // sao.cpp
 }
 
 void setupAliasPrimitives(EncoderPrimitives &p)
@@ -72,7 +74,7 @@
     /* at HIGH_BIT_DEPTH, pixel == short so we can alias many primitives */
     for (int i = 0; i < NUM_CU_SIZES; i++)
     {
-        p.cu[i].sse_pp = (pixelcmp_t)p.cu[i].sse_ss;
+        p.cu[i].sse_pp = (pixel_sse_t)p.cu[i].sse_ss;
 
         p.cu[i].copy_ps = (copy_ps_t)p.pu[i].copy_pp;
         p.cu[i].copy_sp = (copy_sp_t)p.pu[i].copy_pp;
@@ -185,62 +187,36 @@
 
     p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sse_pp = NULL;
 }
-}
-using namespace x265;
 
-/* cpuid >= 0 - force CPU type
- * cpuid < 0  - auto-detect if uninitialized */
-void x265_setup_primitives(x265_param *param, int cpuid)
+void x265_report_simd(x265_param* param)
 {
-    if (cpuid < 0)
-        cpuid = x265::cpu_detect();
-
-    // initialize global variables
-    if (!primitives.pu[0].sad)
-    {
-        setupCPrimitives(primitives);
-
-        /* We do not want the encoder to use the un-optimized intra all-angles
-         * C references. It is better to call the individual angle functions
-         * instead. We must check for NULL before using this primitive */
-        for (int i = 0; i < NUM_TR_SIZE; i++)
-            primitives.cu[i].intra_pred_allangs = NULL;
-
-#if ENABLE_ASSEMBLY
-        setupInstrinsicPrimitives(primitives, cpuid);
-        setupAssemblyPrimitives(primitives, cpuid);
-#else
-        x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
-#endif
-
-        setupAliasPrimitives(primitives);
-    }
-
     if (param->logLevel >= X265_LOG_INFO)
     {
+        int cpuid = param->cpuid;
+
         char buf[1000];
         char *p = buf + sprintf(buf, "using cpu capabilities:");
         char *none = p;
-        for (int i = 0; x265::cpu_names[i].flags; i++)
+        for (int i = 0; X265_NS::cpu_names[i].flags; i++)
         {
-            if (!strcmp(x265::cpu_names[i].name, "SSE")
+            if (!strcmp(X265_NS::cpu_names[i].name, "SSE")
                 && (cpuid & X265_CPU_SSE2))
                 continue;
-            if (!strcmp(x265::cpu_names[i].name, "SSE2")
+            if (!strcmp(X265_NS::cpu_names[i].name, "SSE2")
                 && (cpuid & (X265_CPU_SSE2_IS_FAST | X265_CPU_SSE2_IS_SLOW)))
                 continue;
-            if (!strcmp(x265::cpu_names[i].name, "SSE3")
+            if (!strcmp(X265_NS::cpu_names[i].name, "SSE3")
                 && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64)))
                 continue;
-            if (!strcmp(x265::cpu_names[i].name, "SSE4.1")
+            if (!strcmp(X265_NS::cpu_names[i].name, "SSE4.1")
                 && (cpuid & X265_CPU_SSE42))
                 continue;
-            if (!strcmp(x265::cpu_names[i].name, "BMI1")
+            if (!strcmp(X265_NS::cpu_names[i].name, "BMI1")
                 && (cpuid & X265_CPU_BMI2))
                 continue;
-            if ((cpuid & x265::cpu_names[i].flags) == x265::cpu_names[i].flags
-                && (!i || x265::cpu_names[i].flags != x265::cpu_names[i - 1].flags))
-                p += sprintf(p, " %s", x265::cpu_names[i].name);
+            if ((cpuid & X265_NS::cpu_names[i].flags) == X265_NS::cpu_names[i].flags
+                && (!i || X265_NS::cpu_names[i].flags != X265_NS::cpu_names[i - 1].flags))
+                p += sprintf(p, " %s", X265_NS::cpu_names[i].name);
         }
 
         if (p == none)
@@ -249,14 +225,40 @@
     }
 }
 
+/* Fill in the shared 'primitives' table on first use and then report the SIMD
+ * capabilities via x265_report_simd().
+ *
+ * The table is considered uninitialized while primitives.pu[0].sad is NULL, so
+ * later calls skip the setup work and only perform the report. When
+ * ENABLE_ASSEMBLY is set, param->cpuid is passed to the intrinsic and
+ * assembly setup routines. */
+void x265_setup_primitives(x265_param *param)
+{
+    if (!primitives.pu[0].sad)
+    {
+        // install the C implementations first
+        setupCPrimitives(primitives);
+
+        /* We do not want the encoder to use the un-optimized intra all-angles
+         * C references. It is better to call the individual angle functions
+         * instead. We must check for NULL before using this primitive */
+        for (int i = 0; i < NUM_TR_SIZE; i++)
+            primitives.cu[i].intra_pred_allangs = NULL;
+
+#if ENABLE_ASSEMBLY
+        // presumably these replace C entries with optimized versions for param->cpuid -- confirm
+        setupInstrinsicPrimitives(primitives, param->cpuid);
+        setupAssemblyPrimitives(primitives, param->cpuid);
+#endif
+
+        // NOTE(review): looks like aliasing must run last so it sees the final pointers -- confirm
+        setupAliasPrimitives(primitives);
+    }
+
+    // the report itself only emits output when param->logLevel >= X265_LOG_INFO
+    x265_report_simd(param);
+}
+}
+
 #if ENABLE_ASSEMBLY
 /* these functions are implemented in assembly. When assembly is not being
  * compiled, they are unnecessary and can be NOPs */
 #else
 extern "C" {
-int x265_cpu_cpuid_test(void) { return 0; }
-void x265_cpu_emms(void) {}
-void x265_cpu_cpuid(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; }
-void x265_cpu_xgetbv(uint32_t, uint32_t *, uint32_t *) {}
+int PFX(cpu_cpuid_test)(void) { return 0; }
+void PFX(cpu_emms)(void) {}
+void PFX(cpu_cpuid)(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; }
+void PFX(cpu_xgetbv)(uint32_t, uint32_t *, uint32_t *) {}
+void PFX(cpu_neon_test)(void) {}
+int PFX(cpu_fast_neon_mrc_test)(void) { return 0; }
 }
 #endif

x265_1.7.tar.gz/source/common/primitives.h -> x265_1.8.tar.gz/source/common/primitives.h Changed

@@ -33,7 +33,7 @@
 #include "common.h"
 #include "cpu.h"
 
-namespace x265 {
+namespace X265_NS {
 // x265 private namespace
 
 enum LumaPU
@@ -112,6 +112,8 @@
 
 typedef int  (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
 typedef int  (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
+typedef sse_ret_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
+typedef sse_ret_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
 typedef int  (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
 typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
 typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
@@ -173,6 +175,13 @@
 typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
+
+typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
@@ -182,6 +191,10 @@
 typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
 typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
+typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
+typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
+typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
+
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -242,8 +255,9 @@
         copy_pp_t       copy_pp;       // alias to pu[].copy_pp
 
         var_t           var;           // block internal variance
-        pixelcmp_t      sse_pp;        // Sum of Square Error (pixel, pixel) fenc alignment not assumed
-        pixelcmp_ss_t   sse_ss;        // Sum of Square Error (short, short) fenc alignment not assumed
+
+        pixel_sse_t     sse_pp;        // Sum of Square Error (pixel, pixel) fenc alignment not assumed
+        pixel_sse_ss_t  sse_ss;        // Sum of Square Error (short, short) fenc alignment not assumed
         pixelcmp_t      psy_cost_pp;   // difference in AC energy between two pixel blocks
         pixelcmp_ss_t   psy_cost_ss;   // difference in AC energy between two signed residual blocks
         pixel_ssd_s_t   ssd_s;         // Sum of Square Error (residual coeff to self)
@@ -289,12 +303,19 @@
     saoCuOrgE3_t          saoCuOrgE3[2];
     saoCuOrgB0_t          saoCuOrgB0;
 
+    saoCuStatsBO_t        saoCuStatsBO;
+    saoCuStatsE0_t        saoCuStatsE0;
+    saoCuStatsE1_t        saoCuStatsE1;
+    saoCuStatsE2_t        saoCuStatsE2;
+    saoCuStatsE3_t        saoCuStatsE3;
+
     downscale_t           frameInitLowres;
     cutree_propagate_cost propagateCost;
 
     extendCURowBorder_t   extendRowBorder;
     planecopy_cp_t        planecopy_cp;
     planecopy_sp_t        planecopy_sp;
+    planecopy_sp_t        planecopy_sp_shl;
 
     weightp_sp_t          weight_sp;
     weightp_pp_t          weight_pp;
@@ -303,6 +324,11 @@
     scanPosLast_t         scanPosLast;
     findPosFirstLast_t    findPosFirstLast;
 
+    costCoeffNxN_t        costCoeffNxN;
+    costCoeffRemain_t     costCoeffRemain;
+    costC1C2Flag_t        costC1C2Flag;
+
+
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry
      * in this array. However we always fill all entries in the array in case
@@ -335,7 +361,7 @@
         struct CUChroma
         {
             pixelcmp_t     sa8d;    // if chroma CU is not multiple of 8x8, will use satd
-            pixelcmp_t     sse_pp;
+            pixel_sse_t    sse_pp;
             pixel_sub_ps_t sub_ps;
             pixel_add_ps_t add_ps;
 
@@ -377,4 +403,10 @@
 void setupAliasPrimitives(EncoderPrimitives &p);
 }
 
+#if !EXPORT_C_API
+extern const int   PFX(max_bit_depth);
+extern const char* PFX(version_str);
+extern const char* PFX(build_info_str);
+#endif
+
 #endif // ifndef X265_PRIMITIVES_H

x265_1.7.tar.gz/source/common/quant.cpp -> x265_1.8.tar.gz/source/common/quant.cpp Changed

@@ -30,7 +30,7 @@
 #include "cudata.h"
 #include "contexts.h"
 
-using namespace x265;
+using namespace X265_NS;
 
 #define SIGN(x,y) ((x^(y >> 31))-(y >> 31))
 
@@ -204,7 +204,6 @@
     m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
     m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
     m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
-    m_tqBypass = false;
 
     return m_resiDctCoeff && m_fencShortBuf;
 }
@@ -228,9 +227,6 @@
 
 void Quant::setQPforQuant(const CUData& ctu, int qp)
 {
-    m_tqBypass = !!ctu.m_tqBypass[0];
-    if (m_tqBypass)
-        return;
     m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
     m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
     setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
@@ -251,30 +247,63 @@
 }
 
 /* To minimize the distortion only. No rate is considered */
-uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams)
+uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams, uint32_t log2TrSize)
 {
-    const uint32_t log2TrSizeCG = codeParams.log2TrSizeCG;
+    uint32_t trSize = 1 << log2TrSize;
     const uint16_t* scan = codeParams.scan;
-    bool lastCG = true;
 
-    for (int cg = (1 << (log2TrSizeCG * 2)) - 1; cg >= 0; cg--)
+    uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
+    uint16_t coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
+    uint16_t coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
+
+#if CHECKED_BUILD || _DEBUG
+    // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group
+    memset(coeffNum, 0, sizeof(coeffNum));
+    memset(coeffSign, 0, sizeof(coeffNum));
+    memset(coeffFlag, 0, sizeof(coeffNum));
+#endif
+    const int lastScanPos = primitives.scanPosLast(codeParams.scan, coeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize);
+    const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE);
+    unsigned long tmp;
+
+    // first CG need specially processing
+    const uint32_t correctOffset = 0x0F & (lastScanPos ^ 0xF);
+    coeffFlag[cgLastScanPos] <<= correctOffset;
+
+    for (int cg = cgLastScanPos; cg >= 0; cg--)
     {
         int cgStartPos = cg << LOG2_SCAN_SET_SIZE;
         int n;
 
+#if CHECKED_BUILD || _DEBUG
         for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
             if (coeff[scan[n + cgStartPos]])
                 break;
-        if (n < 0)
-            continue;
+        int lastNZPosInCG0 = n;
+#endif
 
-        int lastNZPosInCG = n;
+        if (coeffNum[cg] == 0)
+        {
+            X265_CHECK(lastNZPosInCG0 < 0, "all zero block check failure\n");
+            continue;
+        }
 
+#if CHECKED_BUILD || _DEBUG
         for (n = 0;; n++)
             if (coeff[scan[n + cgStartPos]])
                 break;
 
-        int firstNZPosInCG = n;
+        int firstNZPosInCG0 = n;
+#endif
+
+        CLZ(tmp, coeffFlag[cg]);
+        const int firstNZPosInCG = (15 ^ tmp);
+
+        CTZ(tmp, coeffFlag[cg]);
+        const int lastNZPosInCG = (15 ^ tmp);
+
+        X265_CHECK(firstNZPosInCG0 == firstNZPosInCG, "firstNZPosInCG0 check failure\n");
+        X265_CHECK(lastNZPosInCG0 == lastNZPosInCG, "lastNZPosInCG0 check failure\n");
 
         if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
         {
@@ -287,12 +316,17 @@
             if (signbit != (absSum & 0x1)) // compare signbit with sum_parity
             {
                 int minCostInc = MAX_INT,  minPos = -1, curCost = MAX_INT;
-                int16_t finalChange = 0, curChange = 0;
+                int32_t finalChange = 0, curChange = 0;
+                uint32_t cgFlags = coeffFlag[cg];
+                if (cg == cgLastScanPos)
+                    cgFlags >>= correctOffset;
 
-                for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
+                for (n = (cg == cgLastScanPos ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
                 {
                     uint32_t blkPos = scan[n + cgStartPos];
-                    if (coeff[blkPos])
+                    X265_CHECK(!!coeff[blkPos] == !!(cgFlags & 1), "non zero coeff check failure\n");
+
+                    if (cgFlags & 1)
                     {
                         if (deltaU[blkPos] > 0)
                         {
@@ -301,8 +335,11 @@
                         }
                         else
                         {
-                            if (n == firstNZPosInCG && abs(coeff[blkPos]) == 1)
+                            if ((cgFlags == 1) && (abs(coeff[blkPos]) == 1))
+                            {
+                                X265_CHECK(n == firstNZPosInCG, "firstNZPosInCG position check failure\n");
                                 curCost = MAX_INT;
+                            }
                             else
                             {
                                 curCost = deltaU[blkPos];
@@ -312,8 +349,9 @@
                     }
                     else
                     {
-                        if (n < firstNZPosInCG)
+                        if (cgFlags == 0)
                         {
+                            X265_CHECK(n < firstNZPosInCG, "firstNZPosInCG position check failure\n");
                             uint32_t thisSignBit = m_resiDctCoeff[blkPos] >= 0 ? 0 : 1;
                             if (thisSignBit != signbit)
                                 curCost = MAX_INT;
@@ -336,6 +374,7 @@
                         finalChange = curChange;
                         minPos = blkPos;
                     }
+                    cgFlags>>=1;
                 }
 
                 /* do not allow change to violate coeff clamp */
@@ -347,14 +386,12 @@
                 else if (finalChange == -1 && abs(coeff[minPos]) == 1)
                     numSig--;
 
-                if (m_resiDctCoeff[minPos] >= 0)
-                    coeff[minPos] += finalChange;
-                else
-                    coeff[minPos] -= finalChange;
+                {
+                    const int16_t sigMask = ((int16_t)m_resiDctCoeff[minPos]) >> 15;
+                    coeff[minPos] += ((int16_t)finalChange ^ sigMask) - sigMask;
+                }
             }
         }
-
-        lastCG = false;
     }
 
     return numSig;
@@ -364,7 +401,8 @@
                              coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
 {
     const uint32_t sizeIdx = log2TrSize - 2;
-    if (m_tqBypass)
+
+    if (cu.m_tqBypass[0])
     {
         X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
         return primitives.cu[sizeIdx].copy_cnt(coeff, residual, resiStride);
@@ -437,18 +475,19 @@
         {
             TUEntropyCodingParameters codeParams;
             cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, isLuma);
-            return signBitHidingHDQ(coeff, deltaU, numSig, codeParams);
+            return signBitHidingHDQ(coeff, deltaU, numSig, codeParams, log2TrSize);
         }
         else
             return numSig;
     }
 }
 
-void Quant::invtransformNxN(int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
+void Quant::invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
                             uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
 {
     const uint32_t sizeIdx = log2TrSize - 2;
-    if (m_tqBypass)
+

x265_1.7.tar.gz/source/common/quant.h -> x265_1.8.tar.gz/source/common/quant.h Changed

@@ -28,7 +28,7 @@
 #include "scalinglist.h"
 #include "contexts.h"
 
-namespace x265 {
+namespace X265_NS {
 // private namespace
 
 class CUData;
@@ -41,7 +41,7 @@
     int per;
     int qp;
     int64_t lambda2; /* FIX8 */
-    int32_t lambda;  /* FIX8, dynamic range is 18-bits in 8bpp and 20-bits in 16bpp */
+    int32_t lambda;  /* FIX8, dynamic range is 18-bits in Main and 20-bits in Main10 */
 
     QpParam() : qp(MAX_INT) {}
 
@@ -68,9 +68,9 @@
     /* 0 = luma 4x4,   1 = luma 8x8,   2 = luma 16x16,   3 = luma 32x32
      * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32
      * Intra 0..7 - Inter 8..15 */
-    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
-    uint32_t residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+    ALIGN_VAR_16(uint32_t, residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
     uint32_t count[MAX_NUM_TR_CATEGORIES];
+    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
 };
 
 class Quant
@@ -94,7 +94,6 @@
 
     NoiseReduction*    m_nr;
     NoiseReduction*    m_frameNr; // Array of NR structures, one for each frameEncoder
-    bool               m_tqBypass;
 
     Quant();
     ~Quant();
@@ -109,7 +108,7 @@
     uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
                           uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
 
-    void invtransformNxN(int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
+    void invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
                          uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
 
     /* Pattern decision for context derivation process of significant_coeff_flag */
@@ -126,9 +125,9 @@
         const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (cgBlkPos + 1)); // just need lowest 7-bits valid
 
         // TODO: instruction BT is faster, but _bittest64 still generate instruction 'BT m, r' in VS2012
-        const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & (sigPos & 1);
-        const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 2)) & 2;
-        return sigRight + sigLower;
+        const uint32_t sigRight = ((uint32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
+        const uint32_t sigLower = ((uint32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
+        return sigRight + sigLower * 2;
     }
 
     /* Context derivation process of coeff_abs_significant_flag */
@@ -137,10 +136,10 @@
         X265_CHECK(cgBlkPos < 64, "cgBlkPos is too large\n");
         // NOTE: unsafe shift operator, see NOTE in calcPatternSigCtx
         const uint32_t sigPos = (uint32_t)(cgGroupMask >> (cgBlkPos + 1)); // just need lowest 8-bits valid
-        const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
-        const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
+        const uint32_t sigRight = ((uint32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
+        const uint32_t sigLower = ((uint32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
 
-        return (sigRight | sigLower) & 1;
+        return (sigRight | sigLower);
     }
 
     /* static methods shared with entropy.cpp */
@@ -150,7 +149,7 @@
 
     void setChromaQP(int qpin, TextType ttype, int chFmt);
 
-    uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters);
+    uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters, uint32_t log2TrSize);
 
     uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy);
 };

x265_1.7.tar.gz/source/common/scalinglist.cpp -> x265_1.8.tar.gz/source/common/scalinglist.cpp Changed

x265_1.7.tar.gz/source/common/scalinglist.h -> x265_1.8.tar.gz/source/common/scalinglist.h Changed

x265_1.7.tar.gz/source/common/shortyuv.cpp -> x265_1.8.tar.gz/source/common/shortyuv.cpp Changed

x265_1.7.tar.gz/source/common/shortyuv.h -> x265_1.8.tar.gz/source/common/shortyuv.h Changed

x265_1.7.tar.gz/source/common/slice.cpp -> x265_1.8.tar.gz/source/common/slice.cpp Changed

x265_1.7.tar.gz/source/common/slice.h -> x265_1.8.tar.gz/source/common/slice.h Changed

x265_1.7.tar.gz/source/common/threading.cpp -> x265_1.8.tar.gz/source/common/threading.cpp Changed

@@ -21,21 +21,73 @@
  * For more information, contact us at license @ x265.com
  *****************************************************************************/
 
+#include "common.h"
 #include "threading.h"
+#include "cpu.h"
 
-namespace x265 {
+namespace X265_NS {
 // x265 private namespace
 
 #if X265_ARCH_X86 && !defined(X86_64) && ENABLE_ASSEMBLY && defined(__GNUC__)
-extern "C" intptr_t x265_stack_align(void (*func)(), ...);
-#define x265_stack_align(func, ...) x265_stack_align((void (*)())func, __VA_ARGS__)
+extern "C" intptr_t PFX(stack_align)(void (*func)(), ...);
+#define STACK_ALIGN(func, ...) PFX(stack_align)((void (*)())func, __VA_ARGS__)
 #else
-#define x265_stack_align(func, ...) func(__VA_ARGS__)
+#define STACK_ALIGN(func, ...) func(__VA_ARGS__)
+#endif
+
+#if NO_ATOMICS
+pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+int no_atomic_or(int* ptr, int mask)
+{ 
+    pthread_mutex_lock(&g_mutex);
+    int ret = *ptr;
+    *ptr |= mask;
+    pthread_mutex_unlock(&g_mutex);
+    return ret;
+}
+
+int no_atomic_and(int* ptr, int mask)
+{
+    pthread_mutex_lock(&g_mutex);
+    int ret = *ptr;
+    *ptr &= mask;
+    pthread_mutex_unlock(&g_mutex);
+    return ret;
+}
+
+int no_atomic_inc(int* ptr)
+{
+    pthread_mutex_lock(&g_mutex);
+    *ptr += 1;
+    int ret = *ptr;
+    pthread_mutex_unlock(&g_mutex);
+    return ret;
+}
+
+int no_atomic_dec(int* ptr)
+{
+    pthread_mutex_lock(&g_mutex);
+    *ptr -= 1;
+    int ret = *ptr;
+    pthread_mutex_unlock(&g_mutex);
+    return ret;
+}
+
+int no_atomic_add(int* ptr, int val)
+{
+    pthread_mutex_lock(&g_mutex);
+    *ptr += val;
+    int ret = *ptr;
+    pthread_mutex_unlock(&g_mutex);
+    return ret;
+}
 #endif
 
 /* C shim for forced stack alignment */
 static void stackAlignMain(Thread *instance)
 {
+    // defer processing to the virtual function implemented in the derived class
     instance->threadMain();
 }
 
@@ -43,8 +95,7 @@
 
 static DWORD WINAPI ThreadShim(Thread *instance)
 {
-    // defer processing to the virtual function implemented in the derived class
-    x265_stack_align(stackAlignMain, instance);
+    STACK_ALIGN(stackAlignMain, instance);
 
     return 0;
 }
@@ -77,7 +128,7 @@
     // defer processing to the virtual function implemented in the derived class
     Thread *instance = reinterpret_cast<Thread *>(opaque);
 
-    x265_stack_align(stackAlignMain, instance);
+    STACK_ALIGN(stackAlignMain, instance);
 
     return NULL;
 }

x265_1.7.tar.gz/source/common/threading.h -> x265_1.8.tar.gz/source/common/threading.h Changed

@@ -42,7 +42,30 @@
 #include <sys/sysctl.h>
 #endif
 
-#ifdef __GNUC__               /* GCCs builtin atomics */
+#if NO_ATOMICS
+
+#include <sys/time.h>
+#include <unistd.h>
+
+namespace X265_NS {
+// x265 private namespace
+int no_atomic_or(int* ptr, int mask);
+int no_atomic_and(int* ptr, int mask);
+int no_atomic_inc(int* ptr);
+int no_atomic_dec(int* ptr);
+int no_atomic_add(int* ptr, int val);
+}
+
+#define CLZ(id, x)            id = (unsigned long)__builtin_clz(x) ^ 31
+#define CTZ(id, x)            id = (unsigned long)__builtin_ctz(x)
+#define ATOMIC_OR(ptr, mask)  no_atomic_or((int*)ptr, mask)
+#define ATOMIC_AND(ptr, mask) no_atomic_and((int*)ptr, mask)
+#define ATOMIC_INC(ptr)       no_atomic_inc((int*)ptr)
+#define ATOMIC_DEC(ptr)       no_atomic_dec((int*)ptr)
+#define ATOMIC_ADD(ptr, val)  no_atomic_add((int*)ptr, val)
+#define GIVE_UP_TIME()        usleep(0)
+
+#elif __GNUC__               /* GCCs builtin atomics */
 
 #include <sys/time.h>
 #include <unistd.h>
@@ -71,7 +94,7 @@
 
 #endif // ifdef __GNUC__
 
-namespace x265 {
+namespace X265_NS {
 // x265 private namespace
 
 #ifdef _WIN32
@@ -463,6 +486,6 @@
 
     void stop();
 };
-} // end namespace x265
+} // end namespace X265_NS
 
 #endif // ifndef X265_THREADING_H

x265_1.7.tar.gz/source/common/threadpool.cpp -> x265_1.8.tar.gz/source/common/threadpool.cpp Changed

x265_1.7.tar.gz/source/common/threadpool.h -> x265_1.8.tar.gz/source/common/threadpool.h Changed

x265_1.7.tar.gz/source/common/vec/dct-sse3.cpp -> x265_1.8.tar.gz/source/common/vec/dct-sse3.cpp Changed

@@ -33,19 +33,13 @@
 #include <xmmintrin.h> // SSE
 #include <pmmintrin.h> // SSE3
 
-using namespace x265;
+using namespace X265_NS;
 
-namespace {
 #define SHIFT1  7
 #define ADD1    64
 
-#if HIGH_BIT_DEPTH
-#define SHIFT2  10
-#define ADD2    512
-#else
-#define SHIFT2  12
-#define ADD2    2048
-#endif
+#define SHIFT2  (12 - (X265_DEPTH - 8))
+#define ADD2    (1 << ((SHIFT2) - 1))
 
 ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
 {
@@ -62,7 +56,8 @@
     {  83,  36,  83,  36, 83,  36, 83,  36 },
     {  36, -83,  36, -83, 36, -83, 36, -83 }
 };
-void idct8(const int16_t* src, int16_t* dst, intptr_t stride)
+
+static void idct8(const int16_t* src, int16_t* dst, intptr_t stride)
 {
     __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
     __m128i T00, T01, T02, T03, T04, T05, T06, T07;
@@ -299,7 +294,7 @@
     _mm_storeh_pi((__m64*)&dst[7 * stride +  4], _mm_castsi128_ps(T11));
 }
 
-void idct16(const int16_t *src, int16_t *dst, intptr_t stride)
+static void idct16(const int16_t *src, int16_t *dst, intptr_t stride)
 {
 #define READ_UNPACKHILO(offset)\
     const __m128i T_00_00A = _mm_unpacklo_epi16(*(__m128i*)&src[1 * 16 + offset], *(__m128i*)&src[3 * 16 + offset]);\
@@ -677,7 +672,7 @@
 #undef UNPACKHILO
 #undef READ_UNPACKHILO
 
-void idct32(const int16_t *src, int16_t *dst, intptr_t stride)
+static void idct32(const int16_t *src, int16_t *dst, intptr_t stride)
 {
     //Odd
     const __m128i c16_p90_p90   = _mm_set1_epi32(0x005A005A); //column 0
@@ -1418,9 +1413,7 @@
     }
 }
 
-}
-
-namespace x265 {
+namespace X265_NS {
 void setupIntrinsicDCT_sse3(EncoderPrimitives &p)
 {
     /* Note: We have AVX2 assembly for these functions, but since AVX2 is still

x265_1.7.tar.gz/source/common/vec/dct-sse41.cpp -> x265_1.8.tar.gz/source/common/vec/dct-sse41.cpp Changed

x265_1.7.tar.gz/source/common/vec/dct-ssse3.cpp -> x265_1.8.tar.gz/source/common/vec/dct-ssse3.cpp Changed

@@ -34,9 +34,20 @@
 #include <pmmintrin.h> // SSE3
 #include <tmmintrin.h> // SSSE3
 
-using namespace x265;
+#define DCT16_SHIFT1  (3 + X265_DEPTH - 8)
+#define DCT16_ADD1    (1 << ((DCT16_SHIFT1) - 1))
+
+#define DCT16_SHIFT2  10
+#define DCT16_ADD2    (1 << ((DCT16_SHIFT2) - 1))
+
+#define DCT32_SHIFT1  (DCT16_SHIFT1 + 1)
+#define DCT32_ADD1    (1 << ((DCT32_SHIFT1) - 1))
+
+#define DCT32_SHIFT2  (DCT16_SHIFT2 + 1)
+#define DCT32_ADD2    (1 << ((DCT32_SHIFT2) - 1))
+
+using namespace X265_NS;
 
-namespace {
 ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
 {
     { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A },
@@ -99,22 +110,11 @@
 #undef MAKE_COEF
 };
 
-void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
+static void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
 {
-#if HIGH_BIT_DEPTH
-#define SHIFT1  5
-#define ADD1    16
-#else
-#define SHIFT1  3
-#define ADD1    4
-#endif
-
-#define SHIFT2  10
-#define ADD2    512
-
     // Const
-    __m128i c_4     = _mm_set1_epi32(ADD1);
-    __m128i c_512   = _mm_set1_epi32(ADD2);
+    __m128i c_4     = _mm_set1_epi32(DCT16_ADD1);
+    __m128i c_512   = _mm_set1_epi32(DCT16_ADD2);
 
     int i;
 
@@ -202,29 +202,29 @@
 
         T60  = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
         T61  = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
         T70  = _mm_packs_epi32(T60, T61);
         _mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);
 
         T60  = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
         T61  = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
         T70  = _mm_packs_epi32(T60, T61);
         _mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);
 
         T60  = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
         T61  = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
         T70  = _mm_packs_epi32(T60, T61);
         _mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);
 
         T60  = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
         T61  = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
         T70  = _mm_packs_epi32(T60, T61);
         _mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);
 
@@ -234,8 +234,8 @@
         T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
         T60  = _mm_hadd_epi32(T60, T61);
         T61  = _mm_hadd_epi32(T62, T63);
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
         T70  = _mm_packs_epi32(T60, T61);
         _mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);
 
@@ -245,8 +245,8 @@
         T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
         T60  = _mm_hadd_epi32(T60, T61);
         T61  = _mm_hadd_epi32(T62, T63);
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
         T70  = _mm_packs_epi32(T60, T61);
         _mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);
 
@@ -256,8 +256,8 @@
         T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
         T60  = _mm_hadd_epi32(T60, T61);
         T61  = _mm_hadd_epi32(T62, T63);
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
         T70  = _mm_packs_epi32(T60, T61);
         _mm_store_si128((__m128i*)&tmp[10 * 16 + i], T70);
 
@@ -267,8 +267,8 @@
         T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8]));
         T60  = _mm_hadd_epi32(T60, T61);
         T61  = _mm_hadd_epi32(T62, T63);
-        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
-        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
         T70  = _mm_packs_epi32(T60, T61);
         _mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70);
 
@@ -287,8 +287,8 @@
     T63  = _mm_hadd_epi32(T66, T67); \
     T60  = _mm_hadd_epi32(T60, T61); \
     T61  = _mm_hadd_epi32(T62, T63); \
-    T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1); \
-    T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1); \
+    T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1); \
+    T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1); \
     T70  = _mm_packs_epi32(T60, T61); \
     _mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);
 
@@ -352,8 +352,8 @@
 
         T40  = _mm_hadd_epi32(T30, T31);
         T41  = _mm_hsub_epi32(T30, T31);
-        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
-        T41  = _mm_srai_epi32(_mm_add_epi32(T41, c_512), SHIFT2);
+        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
+        T41  = _mm_srai_epi32(_mm_add_epi32(T41, c_512), DCT16_SHIFT2);
         T40  = _mm_packs_epi32(T40, T40);
         T41  = _mm_packs_epi32(T41, T41);
         _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
@@ -377,7 +377,7 @@
         T31  = _mm_hadd_epi32(T32, T33);
 
         T40  = _mm_hadd_epi32(T30, T31);
-        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
         T40  = _mm_packs_epi32(T40, T40);
         _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);
 
@@ -399,7 +399,7 @@
         T31  = _mm_hadd_epi32(T32, T33);
 
         T40  = _mm_hadd_epi32(T30, T31);
-        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
         T40  = _mm_packs_epi32(T40, T40);
         _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);
 
@@ -421,7 +421,7 @@
         T31  = _mm_hadd_epi32(T32, T33);
 
         T40  = _mm_hadd_epi32(T30, T31);
-        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
         T40  = _mm_packs_epi32(T40, T40);
         _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);
 
@@ -443,7 +443,7 @@
         T31  = _mm_hadd_epi32(T32, T33);
 
         T40  = _mm_hadd_epi32(T30, T31);
-        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
         T40  = _mm_packs_epi32(T40, T40);
         _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);
 
@@ -465,7 +465,7 @@
         T31  = _mm_hadd_epi32(T32, T33);
 
         T40  = _mm_hadd_epi32(T30, T31);
-        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
         T40  = _mm_packs_epi32(T40, T40);
         _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);
 
@@ -487,7 +487,7 @@
         T31  = _mm_hadd_epi32(T32, T33);

x265_1.7.tar.gz/source/common/vec/vec-primitives.cpp -> x265_1.8.tar.gz/source/common/vec/vec-primitives.cpp Changed

x265_1.7.tar.gz/source/common/version.cpp -> x265_1.8.tar.gz/source/common/version.cpp Changed

@@ -23,71 +23,109 @@
 
 #include "x265.h"
 #include "common.h"
+#include "primitives.h"
 
 #define XSTR(x) STR(x)
 #define STR(x) #x
 
 #if defined(__clang__)
-#define NVM_COMPILEDBY  "[clang " XSTR(__clang_major__) "." XSTR(__clang_minor__) "." XSTR(__clang_patchlevel__) "]"
+#define COMPILEDBY  "[clang " XSTR(__clang_major__) "." XSTR(__clang_minor__) "." XSTR(__clang_patchlevel__) "]"
 #ifdef __IA64__
-#define NVM_ONARCH    "[on 64-bit] "
+#define ONARCH    "[on 64-bit] "
 #else
-#define NVM_ONARCH    "[on 32-bit] "
+#define ONARCH    "[on 32-bit] "
 #endif
 #endif
 
 #if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
-#define NVM_COMPILEDBY  "[GCC " XSTR(__GNUC__) "." XSTR(__GNUC_MINOR__) "." XSTR(__GNUC_PATCHLEVEL__) "]"
+#define COMPILEDBY  "[GCC " XSTR(__GNUC__) "." XSTR(__GNUC_MINOR__) "." XSTR(__GNUC_PATCHLEVEL__) "]"
 #ifdef __IA64__
-#define NVM_ONARCH    "[on 64-bit] "
+#define ONARCH    "[on 64-bit] "
 #else
-#define NVM_ONARCH    "[on 32-bit] "
+#define ONARCH    "[on 32-bit] "
 #endif
 #endif
 
 #ifdef __INTEL_COMPILER
-#define NVM_COMPILEDBY  "[ICC " XSTR(__INTEL_COMPILER) "]"
+#define COMPILEDBY "[ICC " XSTR(__INTEL_COMPILER) "]"
 #elif  _MSC_VER
-#define NVM_COMPILEDBY  "[MSVC " XSTR(_MSC_VER) "]"
+#define COMPILEDBY "[MSVC " XSTR(_MSC_VER) "]"
 #endif
 
-#ifndef NVM_COMPILEDBY
-#define NVM_COMPILEDBY "[Unk-CXX]"
+#ifndef COMPILEDBY
+#define COMPILEDBY "[Unk-CXX]"
 #endif
 
 #ifdef _WIN32
-#define NVM_ONOS        "[Windows]"
+#define ONOS    "[Windows]"
 #elif  __linux
-#define NVM_ONOS        "[Linux]"
+#define ONOS    "[Linux]"
 #elif __OpenBSD__
-#define NVM_ONOS        "[OpenBSD]"
+#define ONOS    "[OpenBSD]"
 #elif  __CYGWIN__
-#define NVM_ONOS        "[Cygwin]"
+#define ONOS    "[Cygwin]"
 #elif __APPLE__
-#define NVM_ONOS        "[Mac OS X]"
+#define ONOS    "[Mac OS X]"
 #else
-#define NVM_ONOS "[Unk-OS]"
+#define ONOS    "[Unk-OS]"
 #endif
 
 #if X86_64
-#define NVM_BITS        "[64 bit]"
+#define BITS    "[64 bit]"
 #else
-#define NVM_BITS        "[32 bit]"
+#define BITS    "[32 bit]"
+#endif
+
+#if defined(ENABLE_ASSEMBLY)
+#define ASM     ""
+#else
+#define ASM     "[noasm]"
+#endif
+ 
+#if NO_ATOMICS
+#define ATOMICS "[no-atomics]"
+#else
+#define ATOMICS ""
 #endif
 
 #if CHECKED_BUILD
-#define CHECKED         "[CHECKED] "
+#define CHECKED "[CHECKED] "
 #else
-#define CHECKED         " "
+#define CHECKED " "
 #endif
 
-#if HIGH_BIT_DEPTH
-#define BITDEPTH "16bpp"
-const int x265_max_bit_depth = 10;
+#if X265_DEPTH == 12
+
+#define BITDEPTH "12bit"
+const int PFX(max_bit_depth) = 12;
+
+#elif X265_DEPTH == 10
+
+#define BITDEPTH "10bit"
+const int PFX(max_bit_depth) = 10;
+
+#elif X265_DEPTH == 8
+
+#define BITDEPTH "8bit"
+const int PFX(max_bit_depth) = 8;
+
+#endif
+
+#if LINKED_8BIT
+#define ADD8 "+8bit"
+#else
+#define ADD8 ""
+#endif
+#if LINKED_10BIT
+#define ADD10 "+10bit"
+#else
+#define ADD10 ""
+#endif
+#if LINKED_12BIT
+#define ADD12 "+12bit"
 #else
-#define BITDEPTH "8bpp"
-const int x265_max_bit_depth = 8;
+#define ADD12 ""
 #endif
 
-const char *x265_version_str = XSTR(X265_VERSION);
-const char *x265_build_info_str = NVM_ONOS NVM_COMPILEDBY NVM_BITS CHECKED BITDEPTH;
+const char* PFX(version_str) = XSTR(X265_VERSION);
+const char* PFX(build_info_str) = ONOS COMPILEDBY BITS ASM ATOMICS CHECKED BITDEPTH ADD8 ADD10 ADD12;

x265_1.7.tar.gz/source/common/wavefront.cpp -> x265_1.8.tar.gz/source/common/wavefront.cpp Changed

x265_1.7.tar.gz/source/common/wavefront.h -> x265_1.8.tar.gz/source/common/wavefront.h Changed

x265_1.7.tar.gz/source/common/winxp.cpp -> x265_1.8.tar.gz/source/common/winxp.cpp Changed

x265_1.7.tar.gz/source/common/winxp.h -> x265_1.8.tar.gz/source/common/winxp.h Changed

@@ -30,7 +30,7 @@
 #include <intrin.h> // _InterlockedCompareExchange64
 #endif
 
-namespace x265 {
+namespace X265_NS {
 /* non-native condition variable */
 typedef struct
 {
@@ -49,14 +49,14 @@
 void cond_destroy(ConditionVariable *cond);
 
 /* map missing API symbols to our structure and functions */
-#define CONDITION_VARIABLE          x265::ConditionVariable
-#define InitializeConditionVariable x265::cond_init
-#define SleepConditionVariableCS    x265::cond_wait
-#define WakeConditionVariable       x265::cond_signal
-#define WakeAllConditionVariable    x265::cond_broadcast
-#define XP_CONDITION_VAR_FREE       x265::cond_destroy
+#define CONDITION_VARIABLE          X265_NS::ConditionVariable
+#define InitializeConditionVariable X265_NS::cond_init
+#define SleepConditionVariableCS    X265_NS::cond_wait
+#define WakeConditionVariable       X265_NS::cond_signal
+#define WakeAllConditionVariable    X265_NS::cond_broadcast
+#define XP_CONDITION_VAR_FREE       X265_NS::cond_destroy
 
-} // namespace x265
+} // namespace X265_NS
 
 #else // if defined(_WIN32) && (_WIN32_WINNT < 0x0600)

x265_1.7.tar.gz/source/common/x86/asm-primitives.cpp -> x265_1.8.tar.gz/source/common/x86/asm-primitives.cpp Changed

@@ -28,6 +28,83 @@
 #include "x265.h"
 #include "cpu.h"
 
+#define FUNCDEF_TU(ret, name, cpu, ...) \
+    ret PFX(name ## _4x4_ ## cpu(__VA_ARGS__)); \
+    ret PFX(name ## _8x8_ ## cpu(__VA_ARGS__)); \
+    ret PFX(name ## _16x16_ ## cpu(__VA_ARGS__)); \
+    ret PFX(name ## _32x32_ ## cpu(__VA_ARGS__)); \
+    ret PFX(name ## _64x64_ ## cpu(__VA_ARGS__))
+
+#define FUNCDEF_TU_S(ret, name, cpu, ...) \
+    ret PFX(name ## _4_ ## cpu(__VA_ARGS__)); \
+    ret PFX(name ## _8_ ## cpu(__VA_ARGS__)); \
+    ret PFX(name ## _16_ ## cpu(__VA_ARGS__)); \
+    ret PFX(name ## _32_ ## cpu(__VA_ARGS__)); \
+    ret PFX(name ## _64_ ## cpu(__VA_ARGS__))
+
+#define FUNCDEF_TU_S2(ret, name, cpu, ...) \
+    ret PFX(name ## 4_ ## cpu(__VA_ARGS__)); \
+    ret PFX(name ## 8_ ## cpu(__VA_ARGS__)); \
+    ret PFX(name ## 16_ ## cpu(__VA_ARGS__)); \
+    ret PFX(name ## 32_ ## cpu(__VA_ARGS__)); \
+    ret PFX(name ## 64_ ## cpu(__VA_ARGS__))
+
+#define FUNCDEF_PU(ret, name, cpu, ...) \
+    ret PFX(name ## _4x4_   ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x8_   ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x4_   ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _4x8_   ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x8_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x16_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _12x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x4_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _4x16_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _24x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x8_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x32_  ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
+
+#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
+    FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
+    ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _2x4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x2_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _2x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x6_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _6x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x12_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _12x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _6x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x6_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _2x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x2_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _4x12_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _12x4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x12_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _12x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x4_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _4x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _32x48_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _48x32_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _16x24_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _24x16_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _8x64_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x8_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _64x24_ ## cpu)(__VA_ARGS__); \
+    ret PFX(name ## _24x64_ ## cpu)(__VA_ARGS__);
+
 extern "C" {
 #include "pixel.h"
 #include "pixel-util.h"
@@ -40,31 +117,31 @@
 }
 
 #define ALL_LUMA_CU_TYPED(prim, fncdef, fname, cpu) \
-    p.cu[BLOCK_8x8].prim   = fncdef x265_ ## fname ## _8x8_ ## cpu; \
-    p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
-    p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
-    p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu)
 #define ALL_LUMA_CU_TYPED_S(prim, fncdef, fname, cpu) \
-    p.cu[BLOCK_8x8].prim   = fncdef x265_ ## fname ## 8_ ## cpu; \
-    p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## 16_ ## cpu; \
-    p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## 32_ ## cpu; \
-    p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## 64_ ## cpu
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## 8_ ## cpu); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## 16_ ## cpu); \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## 32_ ## cpu); \
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## 64_ ## cpu)
 #define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
-    p.cu[BLOCK_4x4].prim   = fncdef x265_ ## fname ## _4x4_ ## cpu; \
-    p.cu[BLOCK_8x8].prim   = fncdef x265_ ## fname ## _8x8_ ## cpu; \
-    p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
-    p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu)
 #define ALL_LUMA_TU_TYPED_S(prim, fncdef, fname, cpu) \
-    p.cu[BLOCK_4x4].prim   = fncdef x265_ ## fname ## 4_ ## cpu; \
-    p.cu[BLOCK_8x8].prim   = fncdef x265_ ## fname ## 8_ ## cpu; \
-    p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## 16_ ## cpu; \
-    p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## 32_ ## cpu
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## 4_ ## cpu); \
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## 8_ ## cpu); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## 16_ ## cpu); \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## 32_ ## cpu)
 #define ALL_LUMA_BLOCKS_TYPED(prim, fncdef, fname, cpu) \
-    p.cu[BLOCK_4x4].prim   = fncdef x265_ ## fname ## _4x4_ ## cpu; \
-    p.cu[BLOCK_8x8].prim   = fncdef x265_ ## fname ## _8x8_ ## cpu; \
-    p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
-    p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
-    p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu;
+    p.cu[BLOCK_4x4].prim   = fncdef PFX(fname ## _4x4_ ## cpu); \
+    p.cu[BLOCK_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu);
 #define ALL_LUMA_CU(prim, fname, cpu)      ALL_LUMA_CU_TYPED(prim, , fname, cpu)
 #define ALL_LUMA_CU_S(prim, fname, cpu)    ALL_LUMA_CU_TYPED_S(prim, , fname, cpu)
 #define ALL_LUMA_TU(prim, fname, cpu)      ALL_LUMA_TU_TYPED(prim, , fname, cpu)
@@ -72,30 +149,30 @@
 #define ALL_LUMA_TU_S(prim, fname, cpu)    ALL_LUMA_TU_TYPED_S(prim, , fname, cpu)
 
 #define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
-    p.pu[LUMA_8x8].prim   = fncdef x265_ ## fname ## _8x8_ ## cpu; \
-    p.pu[LUMA_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
-    p.pu[LUMA_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
-    p.pu[LUMA_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu; \
-    p.pu[LUMA_8x4].prim   = fncdef x265_ ## fname ## _8x4_ ## cpu; \
-    p.pu[LUMA_4x8].prim   = fncdef x265_ ## fname ## _4x8_ ## cpu; \
-    p.pu[LUMA_16x8].prim  = fncdef x265_ ## fname ## _16x8_ ## cpu; \
-    p.pu[LUMA_8x16].prim  = fncdef x265_ ## fname ## _8x16_ ## cpu; \
-    p.pu[LUMA_16x32].prim = fncdef x265_ ## fname ## _16x32_ ## cpu; \
-    p.pu[LUMA_32x16].prim = fncdef x265_ ## fname ## _32x16_ ## cpu; \
-    p.pu[LUMA_64x32].prim = fncdef x265_ ## fname ## _64x32_ ## cpu; \
-    p.pu[LUMA_32x64].prim = fncdef x265_ ## fname ## _32x64_ ## cpu; \
-    p.pu[LUMA_16x12].prim = fncdef x265_ ## fname ## _16x12_ ## cpu; \
-    p.pu[LUMA_12x16].prim = fncdef x265_ ## fname ## _12x16_ ## cpu; \
-    p.pu[LUMA_16x4].prim  = fncdef x265_ ## fname ## _16x4_ ## cpu; \
-    p.pu[LUMA_4x16].prim  = fncdef x265_ ## fname ## _4x16_ ## cpu; \
-    p.pu[LUMA_32x24].prim = fncdef x265_ ## fname ## _32x24_ ## cpu; \
-    p.pu[LUMA_24x32].prim = fncdef x265_ ## fname ## _24x32_ ## cpu; \
-    p.pu[LUMA_32x8].prim  = fncdef x265_ ## fname ## _32x8_ ## cpu; \
-    p.pu[LUMA_8x32].prim  = fncdef x265_ ## fname ## _8x32_ ## cpu; \
-    p.pu[LUMA_64x48].prim = fncdef x265_ ## fname ## _64x48_ ## cpu; \
-    p.pu[LUMA_48x64].prim = fncdef x265_ ## fname ## _48x64_ ## cpu; \
-    p.pu[LUMA_64x16].prim = fncdef x265_ ## fname ## _64x16_ ## cpu; \
-    p.pu[LUMA_16x64].prim = fncdef x265_ ## fname ## _16x64_ ## cpu
+    p.pu[LUMA_8x8].prim   = fncdef PFX(fname ## _8x8_ ## cpu); \
+    p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+    p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+    p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
+    p.pu[LUMA_8x4].prim   = fncdef PFX(fname ## _8x4_ ## cpu); \
+    p.pu[LUMA_4x8].prim   = fncdef PFX(fname ## _4x8_ ## cpu); \
+    p.pu[LUMA_16x8].prim  = fncdef PFX(fname ## _16x8_ ## cpu); \
+    p.pu[LUMA_8x16].prim  = fncdef PFX(fname ## _8x16_ ## cpu); \
+    p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+    p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+    p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
+    p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
+    p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
+    p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
+    p.pu[LUMA_16x4].prim  = fncdef PFX(fname ## _16x4_ ## cpu); \
+    p.pu[LUMA_4x16].prim  = fncdef PFX(fname ## _4x16_ ## cpu); \
+    p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+    p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
+    p.pu[LUMA_32x8].prim  = fncdef PFX(fname ## _32x8_ ## cpu); \
+    p.pu[LUMA_8x32].prim  = fncdef PFX(fname ## _8x32_ ## cpu); \
+    p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
+    p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
+    p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
+    p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu)
 #define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu)
 
 #define ALL_LUMA_PU_T(prim, fname) \
@@ -125,237 +202,237 @@
     p.pu[LUMA_16x64].prim = fname<LUMA_16x64>
 
 #define ALL_CHROMA_420_CU_TYPED(prim, fncdef, fname, cpu) \
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].prim   = fncdef x265_ ## fname ## _4x4_ ## cpu; \
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].prim   = fncdef x265_ ## fname ## _8x8_ ## cpu; \
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu

x265_1.7.tar.gz/source/common/x86/blockcopy8.asm -> x265_1.8.tar.gz/source/common/x86/blockcopy8.asm Changed

@@ -3043,43 +3043,31 @@
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W32_H4_avx2 2
 INIT_YMM avx2
-cglobal blockcopy_ps_%1x%2, 4, 7, 3
+cglobal blockcopy_ps_%1x%2, 4, 7, 2
     add     r1, r1
     mov     r4d, %2/4
     lea     r5, [3 * r3]
     lea     r6, [3 * r1]
-    pxor    m0, m0
-
 .loop:
-    movu          m1, [r2]
-    punpcklbw     m2, m1, m0
-    punpckhbw     m1, m1, m0
-    vperm2i128    m3, m2, m1, 00100000b
-    vperm2i128    m2, m2, m1, 00110001b
-    movu          [r0], m3
-    movu          [r0 + 32], m2
-    movu          m1, [r2 + r3]
-    punpcklbw     m2, m1, m0
-    punpckhbw     m1, m1, m0
-    vperm2i128    m3, m2, m1, 00100000b
-    vperm2i128    m2, m2, m1, 00110001b
-    movu          [r0 + r1], m3
-    movu          [r0 + r1 + 32], m2
-    movu          m1, [r2 + 2 * r3]
-    punpcklbw     m2, m1, m0
-    punpckhbw     m1, m1, m0
-    vperm2i128    m3, m2, m1, 00100000b
-    vperm2i128    m2, m2, m1, 00110001b
-    movu          [r0 + 2 * r1], m3
-    movu          [r0 + 2 * r1 + 32], m2
-    movu          m1, [r2 + r5]
-    punpcklbw     m2, m1, m0
-    punpckhbw     m1, m1, m0
-    vperm2i128    m3, m2, m1, 00100000b
-    vperm2i128    m2, m2, m1, 00110001b
-    movu          [r0 + r6], m3
-    movu          [r0 + r6 + 32], m2
-
+    pmovzxbw      m0, [r2 +  0]
+    pmovzxbw      m1, [r2 + 16]
+    movu          [r0 +  0], m0
+    movu          [r0 + 32], m1
+
+    pmovzxbw      m0, [r2 + r3 +  0]
+    pmovzxbw      m1, [r2 + r3 + 16]
+    movu          [r0 + r1 +  0], m0
+    movu          [r0 + r1 + 32], m1
+
+    pmovzxbw      m0, [r2 + r3 * 2 +  0]
+    pmovzxbw      m1, [r2 + r3 * 2 + 16]
+    movu          [r0 + r1 * 2 +  0], m0
+    movu          [r0 + r1 * 2 + 32], m1
+
+    pmovzxbw      m0, [r2 + r5 +  0]
+    pmovzxbw      m1, [r2 + r5 + 16]
+    movu          [r0 + r6 +  0], m0
+    movu          [r0 + r6 + 32], m1
     lea           r0, [r0 + 4 * r1]
     lea           r2, [r2 + 4 * r3]
     dec           r4d
@@ -3228,71 +3216,49 @@
 INIT_YMM avx2
 cglobal blockcopy_ps_64x64, 4, 7, 4
     add     r1, r1
-    mov     r4d, 64/4
+    mov     r4d, 64/8
     lea     r5, [3 * r3]
     lea     r6, [3 * r1]
-    pxor    m0, m0
-
 .loop:
-    movu          m1, [r2]
-    punpcklbw     m2, m1, m0
-    punpckhbw     m1, m1, m0
-    vperm2i128    m3, m2, m1, 00100000b
-    vperm2i128    m2, m2, m1, 00110001b
-    movu          [r0], m3
-    movu          [r0 + 32], m2
-    movu          m1, [r2 + 32]
-    punpcklbw     m2, m1, m0
-    punpckhbw     m1, m1, m0
-    vperm2i128    m3, m2, m1, 00100000b
-    vperm2i128    m2, m2, m1, 00110001b
-    movu          [r0 + 64], m3
-    movu          [r0 + 96], m2
-    movu          m1, [r2 + r3]
-    punpcklbw     m2, m1, m0
-    punpckhbw     m1, m1, m0
-    vperm2i128    m3, m2, m1, 00100000b
-    vperm2i128    m2, m2, m1, 00110001b
-    movu          [r0 + r1], m3
-    movu          [r0 + r1 + 32], m2
-    movu          m1, [r2 + r3 + 32]
-    punpcklbw     m2, m1, m0
-    punpckhbw     m1, m1, m0
-    vperm2i128    m3, m2, m1, 00100000b
-    vperm2i128    m2, m2, m1, 00110001b
-    movu          [r0 + r1 + 64], m3
-    movu          [r0 + r1 + 96], m2
-    movu          m1, [r2 + 2 * r3]
-    punpcklbw     m2, m1, m0
-    punpckhbw     m1, m1, m0
-    vperm2i128    m3, m2, m1, 00100000b
-    vperm2i128    m2, m2, m1, 00110001b
-    movu          [r0 + 2 * r1], m3
-    movu          [r0 + 2 * r1 + 32], m2
-    movu          m1, [r2 + 2 * r3 + 32]
-    punpcklbw     m2, m1, m0
-    punpckhbw     m1, m1, m0
-    vperm2i128    m3, m2, m1, 00100000b
-    vperm2i128    m2, m2, m1, 00110001b
-    movu          [r0 + 2 * r1 + 64], m3
-    movu          [r0 + 2 * r1 + 96], m2
-    movu          m1, [r2 + r5]
-    punpcklbw     m2, m1, m0
-    punpckhbw     m1, m1, m0
-    vperm2i128    m3, m2, m1, 00100000b
-    vperm2i128    m2, m2, m1, 00110001b
-    movu          [r0 + r6], m3
-    movu          [r0 + r6 + 32], m2
-    movu          m1, [r2 + r5 + 32]
-    punpcklbw     m2, m1, m0
-    punpckhbw     m1, m1, m0
-    vperm2i128    m3, m2, m1, 00100000b
-    vperm2i128    m2, m2, m1, 00110001b
-    movu          [r0 + r6 + 64], m3
-    movu          [r0 + r6 + 96], m2
-
+%rep 2
+    pmovzxbw      m0, [r2 +  0]
+    pmovzxbw      m1, [r2 + 16]
+    pmovzxbw      m2, [r2 + 32]
+    pmovzxbw      m3, [r2 + 48]
+    movu          [r0 +  0], m0
+    movu          [r0 + 32], m1
+    movu          [r0 + 64], m2
+    movu          [r0 + 96], m3
+
+    pmovzxbw      m0, [r2 + r3 +  0]
+    pmovzxbw      m1, [r2 + r3 + 16]
+    pmovzxbw      m2, [r2 + r3 + 32]
+    pmovzxbw      m3, [r2 + r3 + 48]
+    movu          [r0 + r1 +  0], m0
+    movu          [r0 + r1 + 32], m1
+    movu          [r0 + r1 + 64], m2
+    movu          [r0 + r1 + 96], m3
+
+    pmovzxbw      m0, [r2 + r3 * 2 +  0]
+    pmovzxbw      m1, [r2 + r3 * 2 + 16]
+    pmovzxbw      m2, [r2 + r3 * 2 + 32]
+    pmovzxbw      m3, [r2 + r3 * 2 + 48]
+    movu          [r0 + r1 * 2 +  0], m0
+    movu          [r0 + r1 * 2 + 32], m1
+    movu          [r0 + r1 * 2 + 64], m2
+    movu          [r0 + r1 * 2 + 96], m3
+
+    pmovzxbw      m0, [r2 + r5 +  0]
+    pmovzxbw      m1, [r2 + r5 + 16]
+    pmovzxbw      m2, [r2 + r5 + 32]
+    pmovzxbw      m3, [r2 + r5 + 48]
+    movu          [r0 + r6 +  0], m0
+    movu          [r0 + r6 + 32], m1
+    movu          [r0 + r6 + 64], m2
+    movu          [r0 + r6 + 96], m3
     lea           r0, [r0 + 4 * r1]
     lea           r2, [r2 + 4 * r3]
+%endrep
     dec           r4d
     jnz           .loop
     RET

x265_1.7.tar.gz/source/common/x86/blockcopy8.h -> x265_1.8.tar.gz/source/common/x86/blockcopy8.h Changed

@@ -24,240 +24,40 @@
 #ifndef X265_BLOCKCOPY8_H
 #define X265_BLOCKCOPY8_H
 
-void x265_cpy2Dto1D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy2Dto1D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy2Dto1D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy2Dto1D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy2Dto1D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy2Dto1D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy1Dto2D_shl_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy1Dto2D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy1Dto2D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
-void x265_cpy1Dto2D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
-void x265_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
-void x265_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
-void x265_cpy1Dto2D_shr_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy1Dto2D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
-void x265_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
-void x265_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
-void x265_cpy1Dto2D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
-void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
-void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
-void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
-void x265_cpy2Dto1D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy2Dto1D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy2Dto1D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy2Dto1D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy2Dto1D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-void x265_cpy2Dto1D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
-uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
-uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
-uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
-uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
-uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
-uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
-uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 
-#define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
-    void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
-    void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); \
-    void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 
-#define SETUP_BLOCKCOPY_PS(W, H, cpu) \
-    void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 
-#define SETUP_BLOCKCOPY_SP(W, H, cpu) \
-    void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 
-#define SETUP_BLOCKCOPY_SS_PP(W, H, cpu) \
-    void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
-    void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+FUNCDEF_TU_S(uint32_t, copy_cnt, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride);
+FUNCDEF_TU_S(uint32_t, copy_cnt, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride);
+FUNCDEF_TU_S(uint32_t, copy_cnt, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride);
 
-#define BLOCKCOPY_COMMON(cpu) \
-    SETUP_BLOCKCOPY_FUNC(4, 4, cpu); \
-    SETUP_BLOCKCOPY_FUNC(4, 2, cpu); \
-    SETUP_BLOCKCOPY_FUNC(8, 8, cpu); \
-    SETUP_BLOCKCOPY_FUNC(8, 4, cpu); \
-    SETUP_BLOCKCOPY_FUNC(4, 8, cpu); \
-    SETUP_BLOCKCOPY_FUNC(8, 6, cpu); \
-    SETUP_BLOCKCOPY_FUNC(8, 2, cpu); \
-    SETUP_BLOCKCOPY_FUNC(16, 16, cpu); \
-    SETUP_BLOCKCOPY_FUNC(16, 8, cpu); \
-    SETUP_BLOCKCOPY_FUNC(8, 16, cpu); \
-    SETUP_BLOCKCOPY_FUNC(16, 12, cpu); \
-    SETUP_BLOCKCOPY_FUNC(12, 16, cpu); \
-    SETUP_BLOCKCOPY_FUNC(16, 4, cpu); \
-    SETUP_BLOCKCOPY_FUNC(4, 16, cpu); \
-    SETUP_BLOCKCOPY_FUNC(32, 32, cpu); \
-    SETUP_BLOCKCOPY_FUNC(32, 16, cpu); \
-    SETUP_BLOCKCOPY_FUNC(16, 32, cpu); \
-    SETUP_BLOCKCOPY_FUNC(32, 24, cpu); \
-    SETUP_BLOCKCOPY_FUNC(24, 32, cpu); \
-    SETUP_BLOCKCOPY_FUNC(32, 8, cpu); \
-    SETUP_BLOCKCOPY_FUNC(8, 32, cpu); \
-    SETUP_BLOCKCOPY_FUNC(64, 64, cpu); \
-    SETUP_BLOCKCOPY_FUNC(64, 32, cpu); \
-    SETUP_BLOCKCOPY_FUNC(32, 64, cpu); \
-    SETUP_BLOCKCOPY_FUNC(64, 48, cpu); \
-    SETUP_BLOCKCOPY_FUNC(48, 64, cpu); \
-    SETUP_BLOCKCOPY_FUNC(64, 16, cpu); \
-    SETUP_BLOCKCOPY_FUNC(16, 64, cpu);
+FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
+FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
 
-#define BLOCKCOPY_SP(cpu) \
-    SETUP_BLOCKCOPY_SP(2, 4, cpu); \
-    SETUP_BLOCKCOPY_SP(2, 8, cpu); \
-    SETUP_BLOCKCOPY_SP(6, 8, cpu); \
-    \
-    SETUP_BLOCKCOPY_SP(2, 16, cpu); \
-    SETUP_BLOCKCOPY_SP(4, 32, cpu); \
-    SETUP_BLOCKCOPY_SP(6, 16, cpu); \
-    SETUP_BLOCKCOPY_SP(8, 12, cpu); \
-    SETUP_BLOCKCOPY_SP(8, 64, cpu); \
-    SETUP_BLOCKCOPY_SP(12, 32, cpu); \
-    SETUP_BLOCKCOPY_SP(16, 24, cpu); \
-    SETUP_BLOCKCOPY_SP(24, 64, cpu); \
-    SETUP_BLOCKCOPY_SP(32, 48, cpu);
+FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 
-#define BLOCKCOPY_SS_PP(cpu) \
-    SETUP_BLOCKCOPY_SS_PP(2, 4, cpu); \
-    SETUP_BLOCKCOPY_SS_PP(2, 8, cpu); \
-    SETUP_BLOCKCOPY_SS_PP(6, 8, cpu); \
-    \
-    SETUP_BLOCKCOPY_SS_PP(2, 16, cpu); \
-    SETUP_BLOCKCOPY_SS_PP(4, 32, cpu); \
-    SETUP_BLOCKCOPY_SS_PP(6, 16, cpu); \
-    SETUP_BLOCKCOPY_SS_PP(8, 12, cpu); \
-    SETUP_BLOCKCOPY_SS_PP(8, 64, cpu); \
-    SETUP_BLOCKCOPY_SS_PP(12, 32, cpu); \
-    SETUP_BLOCKCOPY_SS_PP(16, 24, cpu); \
-    SETUP_BLOCKCOPY_SS_PP(24, 64, cpu); \
-    SETUP_BLOCKCOPY_SS_PP(32, 48, cpu);
-    
+FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 
-#define BLOCKCOPY_PS(cpu) \
-    SETUP_BLOCKCOPY_PS(2, 4, cpu); \
-    SETUP_BLOCKCOPY_PS(2, 8, cpu); \
-    SETUP_BLOCKCOPY_PS(4, 2, cpu); \
-    SETUP_BLOCKCOPY_PS(4, 4, cpu); \
-    SETUP_BLOCKCOPY_PS(4, 8, cpu); \
-    SETUP_BLOCKCOPY_PS(4, 16, cpu); \
-    SETUP_BLOCKCOPY_PS(6, 8, cpu); \
-    SETUP_BLOCKCOPY_PS(8, 2, cpu); \
-    SETUP_BLOCKCOPY_PS(8, 4, cpu); \
-    SETUP_BLOCKCOPY_PS(8, 6, cpu); \
-    SETUP_BLOCKCOPY_PS(8, 8, cpu); \
-    SETUP_BLOCKCOPY_PS(8, 16, cpu); \
-    SETUP_BLOCKCOPY_PS(8, 32, cpu); \
-    SETUP_BLOCKCOPY_PS(12, 16, cpu); \
-    SETUP_BLOCKCOPY_PS(16, 4, cpu); \
-    SETUP_BLOCKCOPY_PS(16, 8, cpu); \
-    SETUP_BLOCKCOPY_PS(16, 12, cpu); \
-    SETUP_BLOCKCOPY_PS(16, 16, cpu); \
-    SETUP_BLOCKCOPY_PS(16, 32, cpu); \
-    SETUP_BLOCKCOPY_PS(24, 32, cpu); \
-    SETUP_BLOCKCOPY_PS(32,  8, cpu); \
-    SETUP_BLOCKCOPY_PS(32, 16, cpu); \
-    SETUP_BLOCKCOPY_PS(32, 24, cpu); \
-    SETUP_BLOCKCOPY_PS(32, 32, cpu); \
-    SETUP_BLOCKCOPY_PS(16, 64, cpu); \
-    SETUP_BLOCKCOPY_PS(32, 64, cpu); \
-    SETUP_BLOCKCOPY_PS(48, 64, cpu); \
-    SETUP_BLOCKCOPY_PS(64, 16, cpu); \
-    SETUP_BLOCKCOPY_PS(64, 32, cpu); \
-    SETUP_BLOCKCOPY_PS(64, 48, cpu); \
-    SETUP_BLOCKCOPY_PS(64, 64, cpu); \
-    \
-    SETUP_BLOCKCOPY_PS(2, 16, cpu); \
-    SETUP_BLOCKCOPY_PS(4, 32, cpu); \
-    SETUP_BLOCKCOPY_PS(6, 16, cpu); \
-    SETUP_BLOCKCOPY_PS(8, 12, cpu); \
-    SETUP_BLOCKCOPY_PS(8, 64, cpu); \
-    SETUP_BLOCKCOPY_PS(12, 32, cpu); \
-    SETUP_BLOCKCOPY_PS(16, 24, cpu); \
-    SETUP_BLOCKCOPY_PS(24, 64, cpu); \
-    SETUP_BLOCKCOPY_PS(32, 48, cpu);
-
-BLOCKCOPY_COMMON(_sse2);
-BLOCKCOPY_SS_PP(_sse2);
-BLOCKCOPY_SP(_sse4);
-BLOCKCOPY_PS(_sse4);
-
-BLOCKCOPY_SP(_sse2);
-
-void x265_blockfill_s_4x4_sse2(int16_t* dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_8x8_sse2(int16_t* dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val);
-void x265_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val);
-void x265_blockcopy_ss_16x4_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
-void x265_blockcopy_ss_16x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
-void x265_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
-void x265_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
-void x265_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
-void x265_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);

x265_1.7.tar.gz/source/common/x86/const-a.asm -> x265_1.8.tar.gz/source/common/x86/const-a.asm Changed

@@ -41,7 +41,7 @@
 const pb_16,                times 32 db 16
 const pb_32,                times 32 db 32
 const pb_64,                times 32 db 64
-const pb_128,               times 16 db 128
+const pb_128,               times 32 db 128
 const pb_a1,                times 16 db 0xa1
 
 const pb_01,                times  8 db   0,   1
@@ -62,7 +62,9 @@
 ;; 16-bit constants
 
 const pw_1,                 times 16 dw 1
-const pw_2,                 times  8 dw 2
+const pw_2,                 times 16 dw 2
+const pw_3,                 times 16 dw 3
+const pw_7,                 times 16 dw 7
 const pw_m2,                times  8 dw -2
 const pw_4,                 times  8 dw 4
 const pw_8,                 times  8 dw 8
@@ -75,9 +77,11 @@
 const pw_256,               times 16 dw 256
 const pw_257,               times 16 dw 257
 const pw_512,               times 16 dw 512
-const pw_1023,              times  8 dw 1023
+const pw_1023,              times 16 dw 1023
 const pw_1024,              times 16 dw 1024
+const pw_2048,              times 16 dw 2048
 const pw_4096,              times 16 dw 4096
+const pw_8192,              times  8 dw 8192
 const pw_00ff,              times 16 dw 0x00ff
 const pw_ff00,              times  8 dw 0xff00
 const pw_2000,              times 16 dw 0x2000
@@ -90,7 +94,7 @@
 const pw_0_15,              times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
 const pw_ppppmmmm,          times  1 dw   1,   1,   1,   1,  -1,  -1,  -1,  -1
 const pw_ppmmppmm,          times  1 dw   1,   1,  -1,  -1,   1,   1,  -1,  -1
-const pw_pmpmpmpm,          times  1 dw   1,  -1,   1,  -1,   1,  -1,   1,  -1
+const pw_pmpmpmpm,          times 16 dw   1,  -1,   1,  -1,   1,  -1,   1,  -1
 const pw_pmmpzzzz,          times  1 dw   1,  -1,  -1,   1,   0,   0,   0,   0
 const multi_2Row,           times  1 dw   1,   2,   3,   4,   1,   2,   3,   4
 const multiH,               times  1 dw   9,  10,  11,  12,  13,  14,  15,  16
@@ -100,7 +104,9 @@
 const pw_planar16_mul,      times  1 dw  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   5,   4,   3,   2,   1,   0
 const pw_planar32_mul,      times  1 dw  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,  21,  20,  19,  18,  17,  16
 const pw_FFFFFFFFFFFFFFF0,           dw 0x00
-                            times 7  dw 0xff
+                            times  7 dw 0xff
+const hmul_16p,             times 16 db   1
+                            times  8 db   1,  -1
 
 
 ;; 32-bit constants
@@ -109,8 +115,9 @@
 const pd_2,                 times  8 dd 2
 const pd_4,                 times  4 dd 4
 const pd_8,                 times  4 dd 8
-const pd_16,                times  4 dd 16
-const pd_32,                times  4 dd 32
+const pd_16,                times  8 dd 16
+const pd_31,                times  4 dd 31
+const pd_32,                times  8 dd 32
 const pd_64,                times  4 dd 64
 const pd_128,               times  4 dd 128
 const pd_256,               times  4 dd 256
@@ -119,10 +126,11 @@
 const pd_2048,              times  4 dd 2048
 const pd_ffff,              times  4 dd 0xffff
 const pd_32767,             times  4 dd 32767
-const pd_n32768,            times  4 dd 0xffff8000
+const pd_524416,            times  4 dd 524416
+const pd_n32768,            times  8 dd 0xffff8000
+const pd_n131072,           times  4 dd 0xfffe0000
 
 const trans8_shuf,          times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
-const deinterleave_shufd,   times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
 
 const popcnt_table
 %assign x 0
@@ -131,5 +139,3 @@
 db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
 %assign x x+1
 %endrep
-
-const sw_64,       dd 64

x265_1.7.tar.gz/source/common/x86/dct8.asm -> x265_1.8.tar.gz/source/common/x86/dct8.asm Changed

@@ -157,7 +157,7 @@
 
 idct8_shuf1:    dd 0, 2, 4, 6, 1, 3, 5, 7
 
-idct8_shuf2:    times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
+const idct8_shuf2,    times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
 
 idct8_shuf3:    times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
 
@@ -332,20 +332,48 @@
 cextern pd_2048
 cextern pw_ppppmmmm
 cextern trans8_shuf
+
+
+%if BIT_DEPTH == 12
+    %define     DCT4_SHIFT          5
+    %define     DCT4_ROUND          16
+    %define    IDCT_SHIFT           8
+    %define    IDCT_ROUND           128
+    %define     DST4_SHIFT          5
+    %define     DST4_ROUND          16
+    %define     DCT8_SHIFT1         6
+    %define     DCT8_ROUND1         32
+%elif BIT_DEPTH == 10
+    %define     DCT4_SHIFT          3
+    %define     DCT4_ROUND          4
+    %define    IDCT_SHIFT           10
+    %define    IDCT_ROUND           512
+    %define     DST4_SHIFT          3
+    %define     DST4_ROUND          4
+    %define     DCT8_SHIFT1         4
+    %define     DCT8_ROUND1         8
+%elif BIT_DEPTH == 8
+    %define     DCT4_SHIFT          1
+    %define     DCT4_ROUND          1
+    %define    IDCT_SHIFT           12
+    %define    IDCT_ROUND           2048
+    %define     DST4_SHIFT          1
+    %define     DST4_ROUND          1
+    %define     DCT8_SHIFT1         2
+    %define     DCT8_ROUND1         2
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+%define         DCT8_ROUND2         256
+%define         DCT8_SHIFT2         9
+
 ;------------------------------------------------------
 ;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
 ;------------------------------------------------------
 INIT_XMM sse2
 cglobal dct4, 3, 4, 8
-%if BIT_DEPTH == 10
-  %define       DCT_SHIFT 3
-  mova          m7, [pd_4]
-%elif BIT_DEPTH == 8
-  %define       DCT_SHIFT 1
-  mova          m7, [pd_1]
-%else
-  %error Unsupported BIT_DEPTH!
-%endif
+    mova        m7, [pd_ %+ DCT4_ROUND]
     add         r2d, r2d
     lea         r3, [tab_dct4]
 
@@ -372,19 +400,19 @@
     psubw       m2, m0
     pmaddwd     m0, m1, m4
     paddd       m0, m7
-    psrad       m0, DCT_SHIFT
+    psrad       m0, DCT4_SHIFT
     pmaddwd     m3, m2, m5
     paddd       m3, m7
-    psrad       m3, DCT_SHIFT
+    psrad       m3, DCT4_SHIFT
     packssdw    m0, m3
     pshufd      m0, m0, 0xD8
     pshufhw     m0, m0, 0xB1
     pmaddwd     m1, m6
     paddd       m1, m7
-    psrad       m1, DCT_SHIFT
+    psrad       m1, DCT4_SHIFT
     pmaddwd     m2, [r3 + 3 * 16]
     paddd       m2, m7
-    psrad       m2, DCT_SHIFT
+    psrad       m2, DCT4_SHIFT
     packssdw    m1, m2
     pshufd      m1, m1, 0xD8
     pshufhw     m1, m1, 0xB1
@@ -431,15 +459,7 @@
 ; - r2:     source stride
 INIT_YMM avx2
 cglobal dct4, 3, 4, 8, src, dst, srcStride
-%if BIT_DEPTH == 10
-    %define DCT_SHIFT 3
-    vbroadcasti128 m7, [pd_4]
-%elif BIT_DEPTH == 8
-    %define DCT_SHIFT 1
-    vbroadcasti128 m7, [pd_1]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
+    vbroadcasti128  m7, [pd_ %+ DCT4_ROUND]
     add             r2d, r2d
     lea             r3, [avx2_dct4]
 
@@ -461,11 +481,11 @@
 
     pmaddwd         m2, m5
     paddd           m2, m7
-    psrad           m2, DCT_SHIFT
+    psrad           m2, DCT4_SHIFT
 
     pmaddwd         m0, m6
     paddd           m0, m7
-    psrad           m0, DCT_SHIFT
+    psrad           m0, DCT4_SHIFT
 
     packssdw        m2, m0
     pshufb          m2, m4
@@ -493,30 +513,19 @@
 ;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 INIT_XMM sse2
-cglobal idct4, 3, 4, 7
-%if BIT_DEPTH == 8
-  %define IDCT4_OFFSET  [pd_2048]
-  %define IDCT4_SHIFT   12
-%elif BIT_DEPTH == 10
-  %define IDCT4_OFFSET  [pd_512]
-  %define IDCT4_SHIFT   10
-%else
-  %error Unsupported BIT_DEPTH!
-%endif
+cglobal idct4, 3, 4, 6
     add         r2d, r2d
     lea         r3, [tab_dct4]
 
-    mova        m6, [pd_64]
-
     movu        m0, [r0 + 0 * 16]
     movu        m1, [r0 + 1 * 16]
 
     punpcklwd   m2, m0, m1
     pmaddwd     m3, m2, [r3 + 0 * 16]       ; m3 = E1
-    paddd       m3, m6
+    paddd       m3, [pd_64]
 
     pmaddwd     m2, [r3 + 2 * 16]           ; m2 = E2
-    paddd       m2, m6
+    paddd       m2, [pd_64]
 
     punpckhwd   m0, m1
     pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
@@ -540,29 +549,27 @@
     punpcklwd   m0, m1, m4                  ; m0 = m128iA
     punpckhwd   m1, m4                      ; m1 = m128iD
 
-    mova        m6, IDCT4_OFFSET
-
     punpcklwd   m2, m0, m1
     pmaddwd     m3, m2, [r3 + 0 * 16]
-    paddd       m3, m6                      ; m3 = E1
+    paddd       m3, [pd_ %+ IDCT_ROUND]     ; m3 = E1
 
     pmaddwd     m2, [r3 + 2 * 16]
-    paddd       m2, m6                      ; m2 = E2
+    paddd       m2, [pd_ %+ IDCT_ROUND]     ; m2 = E2
 
     punpckhwd   m0, m1
     pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
     pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2
 
     paddd       m4, m3, m1
-    psrad       m4, IDCT4_SHIFT             ; m4 = m128iA
+    psrad       m4, IDCT_SHIFT              ; m4 = m128iA
     paddd       m5, m2, m0
-    psrad       m5, IDCT4_SHIFT
+    psrad       m5, IDCT_SHIFT
     packssdw    m4, m5                      ; m4 = m128iA
 
     psubd       m2, m0
-    psrad       m2, IDCT4_SHIFT
+    psrad       m2, IDCT_SHIFT
     psubd       m3, m1
-    psrad       m3, IDCT4_SHIFT
+    psrad       m3, IDCT_SHIFT
     packssdw    m2, m3                      ; m2 = m128iD
 
     punpcklwd   m1, m4, m2
@@ -576,7 +583,139 @@
     movlps      [r1 + 2 * r2], m1
     lea         r1, [r1 + 2 * r2]
     movhps      [r1 + r2], m1
+    RET
+
+;------------------------------------------------------
+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)

x265_1.7.tar.gz/source/common/x86/dct8.h -> x265_1.8.tar.gz/source/common/x86/dct8.h Changed

@@ -23,27 +23,23 @@
 
 #ifndef X265_DCT8_H
 #define X265_DCT8_H
-void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
-void x265_dct8_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
-void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
-void x265_dst4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
-void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
-void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
-void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
-void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
-void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
 
-void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
-void x265_idst4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
-void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
-void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
-void x265_idct8_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
-void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t dstStride);
-void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
-void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
-void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+FUNCDEF_TU_S2(void, dct, sse2, const int16_t* src, int16_t* dst, intptr_t srcStride);
+FUNCDEF_TU_S2(void, dct, ssse3, const int16_t* src, int16_t* dst, intptr_t srcStride);
+FUNCDEF_TU_S2(void, dct, sse4, const int16_t* src, int16_t* dst, intptr_t srcStride);
+FUNCDEF_TU_S2(void, dct, avx2, const int16_t* src, int16_t* dst, intptr_t srcStride);
 
-void x265_denoise_dct_sse4(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
-void x265_denoise_dct_avx2(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
+FUNCDEF_TU_S2(void, idct, sse2, const int16_t* src, int16_t* dst, intptr_t dstStride);
+FUNCDEF_TU_S2(void, idct, ssse3, const int16_t* src, int16_t* dst, intptr_t dstStride);
+FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride);
+FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
+
+void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void PFX(idst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void PFX(dst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
+void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
 
 #endif // ifndef X265_DCT8_H

x265_1.7.tar.gz/source/common/x86/intrapred.h -> x265_1.8.tar.gz/source/common/x86/intrapred.h Changed

@@ -26,262 +26,68 @@
 #ifndef X265_INTRAPRED_H
 #define X265_INTRAPRED_H
 
-void x265_intra_pred_dc4_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
-void x265_intra_pred_dc8_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
-void x265_intra_pred_dc16_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
-void x265_intra_pred_dc32_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
-void x265_intra_pred_dc4_sse4(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
-void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
-void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
-void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
-void x265_intra_pred_dc32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
-
-void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
-void x265_intra_pred_planar8_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
-void x265_intra_pred_planar16_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
-void x265_intra_pred_planar32_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
-void x265_intra_pred_planar4_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
-void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
-void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
-void x265_intra_pred_planar32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
-void x265_intra_pred_planar16_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
-void x265_intra_pred_planar32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
-
 #define DECL_ANG(bsize, mode, cpu) \
-    void x265_intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+    void PFX(intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu)(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+
+#define DECL_ANGS(bsize, cpu) \
+    DECL_ANG(bsize, 2, cpu); \
+    DECL_ANG(bsize, 3, cpu); \
+    DECL_ANG(bsize, 4, cpu); \
+    DECL_ANG(bsize, 5, cpu); \
+    DECL_ANG(bsize, 6, cpu); \
+    DECL_ANG(bsize, 7, cpu); \
+    DECL_ANG(bsize, 8, cpu); \
+    DECL_ANG(bsize, 9, cpu); \
+    DECL_ANG(bsize, 10, cpu); \
+    DECL_ANG(bsize, 11, cpu); \
+    DECL_ANG(bsize, 12, cpu); \
+    DECL_ANG(bsize, 13, cpu); \
+    DECL_ANG(bsize, 14, cpu); \
+    DECL_ANG(bsize, 15, cpu); \
+    DECL_ANG(bsize, 16, cpu); \
+    DECL_ANG(bsize, 17, cpu); \
+    DECL_ANG(bsize, 18, cpu); \
+    DECL_ANG(bsize, 19, cpu); \
+    DECL_ANG(bsize, 20, cpu); \
+    DECL_ANG(bsize, 21, cpu); \
+    DECL_ANG(bsize, 22, cpu); \
+    DECL_ANG(bsize, 23, cpu); \
+    DECL_ANG(bsize, 24, cpu); \
+    DECL_ANG(bsize, 25, cpu); \
+    DECL_ANG(bsize, 26, cpu); \
+    DECL_ANG(bsize, 27, cpu); \
+    DECL_ANG(bsize, 28, cpu); \
+    DECL_ANG(bsize, 29, cpu); \
+    DECL_ANG(bsize, 30, cpu); \
+    DECL_ANG(bsize, 31, cpu); \
+    DECL_ANG(bsize, 32, cpu); \
+    DECL_ANG(bsize, 33, cpu); \
+    DECL_ANG(bsize, 34, cpu)
 
-DECL_ANG(4, 2, sse2);
-DECL_ANG(4, 3, sse2);
-DECL_ANG(4, 4, sse2);
-DECL_ANG(4, 5, sse2);
-DECL_ANG(4, 6, sse2);
-DECL_ANG(4, 7, sse2);
-DECL_ANG(4, 8, sse2);
-DECL_ANG(4, 9, sse2);
-DECL_ANG(4, 10, sse2);
-DECL_ANG(4, 11, sse2);
-DECL_ANG(4, 12, sse2);
-DECL_ANG(4, 13, sse2);
-DECL_ANG(4, 14, sse2);
-DECL_ANG(4, 15, sse2);
-DECL_ANG(4, 16, sse2);
-DECL_ANG(4, 17, sse2);
-DECL_ANG(4, 18, sse2);
-DECL_ANG(4, 26, sse2);
+#define DECL_ALL(cpu) \
+    FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \
+    FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \
+    DECL_ANGS(4, cpu); \
+    DECL_ANGS(8, cpu); \
+    DECL_ANGS(16, cpu); \
+    DECL_ANGS(32, cpu)
 
-DECL_ANG(4, 2, ssse3);
-DECL_ANG(4, 3, sse4);
-DECL_ANG(4, 4, sse4);
-DECL_ANG(4, 5, sse4);
-DECL_ANG(4, 6, sse4);
-DECL_ANG(4, 7, sse4);
-DECL_ANG(4, 8, sse4);
-DECL_ANG(4, 9, sse4);
-DECL_ANG(4, 10, sse4);
-DECL_ANG(4, 11, sse4);
-DECL_ANG(4, 12, sse4);
-DECL_ANG(4, 13, sse4);
-DECL_ANG(4, 14, sse4);
-DECL_ANG(4, 15, sse4);
-DECL_ANG(4, 16, sse4);
-DECL_ANG(4, 17, sse4);
-DECL_ANG(4, 18, sse4);
-DECL_ANG(4, 26, sse4);
-DECL_ANG(8, 2, ssse3);
-DECL_ANG(8, 3, sse4);
-DECL_ANG(8, 4, sse4);
-DECL_ANG(8, 5, sse4);
-DECL_ANG(8, 6, sse4);
-DECL_ANG(8, 7, sse4);
-DECL_ANG(8, 8, sse4);
-DECL_ANG(8, 9, sse4);
-DECL_ANG(8, 10, sse4);
-DECL_ANG(8, 11, sse4);
-DECL_ANG(8, 12, sse4);
-DECL_ANG(8, 13, sse4);
-DECL_ANG(8, 14, sse4);
-DECL_ANG(8, 15, sse4);
-DECL_ANG(8, 16, sse4);
-DECL_ANG(8, 17, sse4);
-DECL_ANG(8, 18, sse4);
-DECL_ANG(8, 19, sse4);
-DECL_ANG(8, 20, sse4);
-DECL_ANG(8, 21, sse4);
-DECL_ANG(8, 22, sse4);
-DECL_ANG(8, 23, sse4);
-DECL_ANG(8, 24, sse4);
-DECL_ANG(8, 25, sse4);
-DECL_ANG(8, 26, sse4);
-DECL_ANG(8, 27, sse4);
-DECL_ANG(8, 28, sse4);
-DECL_ANG(8, 29, sse4);
-DECL_ANG(8, 30, sse4);
-DECL_ANG(8, 31, sse4);
-DECL_ANG(8, 32, sse4);
-DECL_ANG(8, 33, sse4);
+FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
+FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
+FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 
-DECL_ANG(16, 2, ssse3);
-DECL_ANG(16, 3, sse4);
-DECL_ANG(16, 4, sse4);
-DECL_ANG(16, 5, sse4);
-DECL_ANG(16, 6, sse4);
-DECL_ANG(16, 7, sse4);
-DECL_ANG(16, 8, sse4);
-DECL_ANG(16, 9, sse4);
-DECL_ANG(16, 10, sse4);
-DECL_ANG(16, 11, sse4);
-DECL_ANG(16, 12, sse4);
-DECL_ANG(16, 13, sse4);
-DECL_ANG(16, 14, sse4);
-DECL_ANG(16, 15, sse4);
-DECL_ANG(16, 16, sse4);
-DECL_ANG(16, 17, sse4);
-DECL_ANG(16, 18, sse4);
-DECL_ANG(16, 19, sse4);
-DECL_ANG(16, 20, sse4);
-DECL_ANG(16, 21, sse4);
-DECL_ANG(16, 22, sse4);
-DECL_ANG(16, 23, sse4);
-DECL_ANG(16, 24, sse4);
-DECL_ANG(16, 25, sse4);
-DECL_ANG(16, 26, sse4);
-DECL_ANG(16, 27, sse4);
-DECL_ANG(16, 28, sse4);
-DECL_ANG(16, 29, sse4);
-DECL_ANG(16, 30, sse4);
-DECL_ANG(16, 31, sse4);
-DECL_ANG(16, 32, sse4);
-DECL_ANG(16, 33, sse4);
+FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
+FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
+FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 
-DECL_ANG(32, 2, ssse3);
-DECL_ANG(32, 3, sse4);
-DECL_ANG(32, 4, sse4);
-DECL_ANG(32, 5, sse4);
-DECL_ANG(32, 6, sse4);
-DECL_ANG(32, 7, sse4);
-DECL_ANG(32, 8, sse4);
-DECL_ANG(32, 9, sse4);
-DECL_ANG(32, 10, sse4);
-DECL_ANG(32, 11, sse4);
-DECL_ANG(32, 12, sse4);
-DECL_ANG(32, 13, sse4);
-DECL_ANG(32, 14, sse4);
-DECL_ANG(32, 15, sse4);
-DECL_ANG(32, 16, sse4);
-DECL_ANG(32, 17, sse4);
-DECL_ANG(32, 18, sse4);
-DECL_ANG(32, 19, sse4);
-DECL_ANG(32, 20, sse4);
-DECL_ANG(32, 21, sse4);

x265_1.7.tar.gz/source/common/x86/intrapred16.asm -> x265_1.8.tar.gz/source/common/x86/intrapred16.asm Changed

@@ -35,39 +35,52 @@
 %assign x x+1
 %endrep
 
-const shuf_mode_13_23,      db  0,  0, 14, 15,  6,  7,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
-const shuf_mode_14_22,      db 14, 15, 10, 11,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
-const shuf_mode_15_21,      db 12, 13,  8,  9,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
-const shuf_mode_16_20,      db  2,  3,  0,  1, 14, 15, 12, 13,  8,  9,  6,  7,  2,  3,  0,  1
-const shuf_mode_17_19,      db  0,  1, 14, 15, 12, 13, 10, 11,  6,  7,  4,  5,  2,  3,  0,  1
-const shuf_mode32_18,       db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1
-const pw_punpcklwd,         db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
-const c_mode32_10_0,        db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1
-
-const pw_unpackwdq, times 8 db 0,1
-const pw_ang8_12,   db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 1
-const pw_ang8_13,   db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 8, 9, 0, 1
-const pw_ang8_14,   db 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 10, 11, 4, 5, 0, 1
-const pw_ang8_15,   db 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 8, 9, 4, 5, 0, 1
-const pw_ang8_16,   db 0, 0, 0, 0, 0, 0, 12, 13, 10, 11, 6, 7, 4, 5, 0, 1
-const pw_ang8_17,   db 0, 0, 14, 15, 12, 13, 10, 11, 8, 9, 4, 5, 2, 3, 0, 1
-const pw_swap16,    db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+const ang_table_avx2
+%assign x 0
+%rep 32
+    times 8 dw (32-x), x
+%assign x x+1
+%endrep
 
-const pw_ang16_13,   db 14, 15, 8, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-const pw_ang16_16,   db 0, 0, 0, 0, 0, 0, 10, 11, 8, 9, 6, 7, 2, 3, 0, 1
+const pw_ang16_12_24,               db  0,  0,  0,  0,  0,  0,  0,  0, 14, 15, 14, 15,  0,  1,  0,  1
+const pw_ang16_13_23,               db  2,  3,  2,  3, 14, 15, 14, 15,  6,  7,  6,  7,  0,  1,  0,  1
+const pw_ang16_14_22,               db  2,  3,  2,  3, 10, 11, 10, 11,  6,  7,  6,  7,  0,  1,  0,  1
+const pw_ang16_15_21,               db 12, 13, 12, 13,  8,  9,  8,  9,  4,  5,  4,  5,  0,  1,  0,  1
+const pw_ang16_16_20,               db  8,  9,  8,  9,  6,  7,  6,  7,  2,  3,  2,  3,  0,  1,  0,  1
+
+const pw_ang32_12_24,               db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
+const pw_ang32_13_23,               db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  6,  7,  0,  1
+const pw_ang32_14_22,               db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 10, 11,  6,  7,  0,  1
+const pw_ang32_15_21,               db  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  8,  9,  4,  5,  0,  1
+const pw_ang32_16_20,               db  0,  0,  0,  0,  0,  0,  0,  0,  8,  9,  6,  7,  2,  3,  0,  1
+const pw_ang32_17_19_0,             db  0,  0,  0,  0, 12, 13, 10, 11,  8,  9,  6,  7,  2,  3,  0,  1
+
+const shuf_mode_13_23,              db  0,  0, 14, 15,  6,  7,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
+const shuf_mode_14_22,              db 14, 15, 10, 11,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
+const shuf_mode_15_21,              db 12, 13,  8,  9,  4,  5,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0
+const shuf_mode_16_20,              db  2,  3,  0,  1, 14, 15, 12, 13,  8,  9,  6,  7,  2,  3,  0,  1
+const shuf_mode_17_19,              db  0,  1, 14, 15, 12, 13, 10, 11,  6,  7,  4,  5,  2,  3,  0,  1
+const shuf_mode32_18,               db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1
+const pw_punpcklwd,                 db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
+const c_mode32_10_0,                db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1
+
+const pw_ang8_12,                   db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  0,  1
+const pw_ang8_13,                   db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  8,  9,  0,  1
+const pw_ang8_14,                   db  0,  0,  0,  0,  0,  0,  0,  0, 14, 15, 10, 11,  4,  5,  0,  1
+const pw_ang8_15,                   db  0,  0,  0,  0,  0,  0,  0,  0, 12, 13,  8,  9,  4,  5,  0,  1
+const pw_ang8_16,                   db  0,  0,  0,  0,  0,  0, 12, 13, 10, 11,  6,  7,  4,  5,  0,  1
+const pw_ang8_17,                   db  0,  0, 14, 15, 12, 13, 10, 11,  8,  9,  4,  5,  2,  3,  0,  1
+const pw_swap16,            times 2 db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1
+
+const pw_ang16_13,                  db 14, 15,  8,  9,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+const pw_ang16_16,                  db  0,  0,  0,  0,  0,  0, 10, 11,  8,  9,  6,  7,  2,  3,  0,  1
+
+intra_filter4_shuf0:                db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10 ,11, 12, 13
+intra_filter4_shuf1:                db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10 ,11, 12, 13
+intra_filter4_shuf2:        times 2 db  4,  5,  0,  1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
 
 ;; (blkSize - 1 - x)
-pw_planar4_0:         dw 3,  2,  1,  0,  3,  2,  1,  0
-pw_planar4_1:         dw 3,  3,  3,  3,  3,  3,  3,  3
-pw_planar8_0:         dw 7,  6,  5,  4,  3,  2,  1,  0
-pw_planar8_1:         dw 7,  7,  7,  7,  7,  7,  7,  7
-pw_planar16_0:        dw 15, 14, 13, 12, 11, 10,  9, 8
-pw_planar16_1:        dw 15, 15, 15, 15, 15, 15, 15, 15
-pd_planar32_1:        dd 31, 31, 31, 31
-
-pw_planar32_1:        dw 31, 31, 31, 31, 31, 31, 31, 31
-pw_planar32_L:        dw 31, 30, 29, 28, 27, 26, 25, 24
-pw_planar32_H:        dw 23, 22, 21, 20, 19, 18, 17, 16
+pw_planar4_0:                       dw  3,  2,  1,  0,  3,  2,  1,  0
 
 const planar32_table
 %assign x 31
@@ -85,16 +98,22 @@
 
 SECTION .text
 
+cextern pb_01
 cextern pw_1
 cextern pw_2
+cextern pw_3
+cextern pw_7
 cextern pw_4
 cextern pw_8
+cextern pw_15
 cextern pw_16
+cextern pw_31
 cextern pw_32
-cextern pw_1023
 cextern pd_16
+cextern pd_31
 cextern pd_32
 cextern pw_4096
+cextern pw_pixel_max
 cextern multiL
 cextern multiH
 cextern multiH2
@@ -103,6 +122,8 @@
 cextern pw_swap
 cextern pb_unpackwq1
 cextern pb_unpackwq2
+cextern pw_planar16_mul
+cextern pw_planar32_mul
 
 ;-----------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
@@ -121,7 +142,7 @@
     test        r4d,            r4d
 
     paddw       m0,             [pw_4]
-    psraw       m0,             3
+    psrlw       m0,             3
 
     ; store DC 4x4
     movh        [r0],           m0
@@ -140,7 +161,7 @@
     ; filter top
     movh        m1,             [r2 + 2]
     paddw       m1,             m0
-    psraw       m1,             2
+    psrlw       m1,             2
     movh        [r0],           m1             ; overwrite top-left pixel, we will update it later
 
     ; filter top-left
@@ -155,7 +176,7 @@
     ; filter left
     movu        m1,             [r2 + 20]
     paddw       m1,             m0
-    psraw       m1,             2
+    psrlw       m1,             2
     movd        r3d,            m1
     mov         [r0 + r1 * 2],  r3w
     shr         r3d,            16
@@ -181,7 +202,7 @@
     pmaddwd         m0,            [pw_1]
 
     paddw           m0,            [pw_8]
-    psraw           m0,            4              ; sum = sum / 16
+    psrlw           m0,            4              ; sum = sum / 16
     pshuflw         m0,            m0, 0
     pshufd          m0,            m0, 0          ; m0 = word [dc_val ...]
 
@@ -214,7 +235,7 @@
     ; filter top
     movu            m0,            [r2 + 2]
     paddw           m0,            m1
-    psraw           m0,            2
+    psrlw           m0,            2
     movu            [r0],          m0
 
     ; filter top-left
@@ -229,7 +250,7 @@
     ; filter left
     movu            m0,            [r2 + 36]
     paddw           m0,            m1
-    psraw           m0,            2
+    psrlw           m0,            2
     movh            r3,            m0
     mov             [r0 + r1 * 2], r3w
     shr             r3,            16
@@ -263,14 +284,10 @@
     paddw           m0,                  m1
     paddw           m2,                  m3
     paddw           m0,                  m2
-    movhlps         m1,                  m0
-    paddw           m0,                  m1
-    pshuflw         m1,                  m0, 0x6E
-    paddw           m0,                  m1
-    pmaddwd         m0,                  [pw_1]
+    HADDUW          m0,                  m1
+    paddd           m0,                  [pd_16]
+    psrld           m0,                  5
 
-    paddw           m0,                  [pw_16]
-    psraw           m0,                  5
     movd            r5d,                 m0
     pshuflw         m0,                  m0, 0 ; m0 = word [dc_val ...]
     pshufd          m0,                  m0, 0
@@ -326,11 +343,11 @@
     ; filter top
     movu            m2,                  [r2 + 2]
     paddw           m2,                  m1
-    psraw           m2,                  2
+    psrlw           m2,                  2
     movu            [r0],                m2
     movu            m3,                  [r2 + 18]
     paddw           m3,                  m1
-    psraw           m3,                  2
+    psrlw           m3,                  2
     movu            [r0 + 16],           m3

x265_1.7.tar.gz/source/common/x86/intrapred8.asm -> x265_1.8.tar.gz/source/common/x86/intrapred8.asm Changed

@@ -30,6 +30,10 @@
 intra_pred_shuff_0_8:    times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
 intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
+intra_filter4_shuf0:  times 2 db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
+intra_filter4_shuf1:  times 2 db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
+intra_filter4_shuf2:  times 2 db  4,  5,  0,  1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+
 pb_0_8        times 8 db  0,  8
 pb_unpackbw1  times 2 db  1,  8,  2,  8,  3,  8,  4,  8
 pb_swap8:     times 2 db  7,  6,  5,  4,  3,  2,  1,  0
@@ -191,16 +195,6 @@
 intra_pred_shuff_0_15: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15
 
 ALIGN 32
-c_ang16_mode_8:       db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                      db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                      db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                      db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
-                      db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                      db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
-                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-
-ALIGN 32
 c_ang16_mode_29:     db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9,  14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
                      db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
                      db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
@@ -212,16 +206,6 @@
                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
 ALIGN 32
-c_ang16_mode_7:      db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                     db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
-                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                     db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                     db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
-                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-
-ALIGN 32
 c_ang16_mode_30:      db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                       db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                       db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
@@ -232,18 +216,6 @@
                       db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
                       db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
-
-
-ALIGN 32
-c_ang16_mode_6:       db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                      db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
-                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                      db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                      db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
-                      db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                      db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
-                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-
 ALIGN 32
 c_ang16_mode_31:      db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
                       db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
@@ -255,66 +227,6 @@
                       db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
                       db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
-
-ALIGN 32
-c_ang16_mode_5:       db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                      db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                      db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                      db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                      db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-
-ALIGN 32
-c_ang16_mode_32:      db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                      db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                      db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                      db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                      db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                      db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                      db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-
-ALIGN 32
-c_ang16_mode_4:       db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                      db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
-                      db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                      db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                      db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                      db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-
-ALIGN 32
-c_ang16_mode_33:     db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                     db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                     db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                     db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                     db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                     db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                     db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                     db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                     db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                     db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                     db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
-c_ang16_mode_3:      db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                     db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                     db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                     db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                     db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                     db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
 ALIGN 32
 c_ang16_mode_24:     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
                      db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
@@ -476,38 +388,6 @@
                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
 
-
-ALIGN 32
-c_ang32_mode_33:   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                   db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                   db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-
 ALIGN 32
 c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
@@ -526,8 +406,6 @@
                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
 
-
-
 ALIGN 32
 c_ang32_mode_24:   db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
                    db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
@@ -664,15 +542,6 @@
 ALIGN 32
 ;; (blkSize - 1 - x)
 pw_planar4_0:         dw 3,  2,  1,  0,  3,  2,  1,  0
-pw_planar4_1:         dw 3,  3,  3,  3,  3,  3,  3,  3
-pw_planar8_0:         dw 7,  6,  5,  4,  3,  2,  1,  0
-pw_planar8_1:         dw 7,  7,  7,  7,  7,  7,  7,  7
-pw_planar16_0:        dw 15, 14, 13, 12, 11, 10, 9,  8
-pw_planar16_1:        dw 15, 15, 15, 15, 15, 15, 15, 15
-pw_planar32_1:        dw 31, 31, 31, 31, 31, 31, 31, 31
-pw_planar32_L:        dw 31, 30, 29, 28, 27, 26, 25, 24
-pw_planar32_H:        dw 23, 22, 21, 20, 19, 18, 17, 16
-
 ALIGN 32
 c_ang8_mode_13:       db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                       db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
@@ -704,6 +573,13 @@
 %assign x x+1
 %endrep
 
+const ang_table_avx2

x265_1.7.tar.gz/source/common/x86/ipfilter16.asm -> x265_1.8.tar.gz/source/common/x86/ipfilter16.asm Changed

@@ -3,6 +3,7 @@
 ;*
 ;* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
 ;*          Murugan Vairavel <murugan@multicorewareinc.com>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -25,10 +26,28 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
+
+%define INTERP_OFFSET_PP        pd_32
+%define INTERP_SHIFT_PP         6
+
+%if BIT_DEPTH == 10
+    %define INTERP_SHIFT_PS         2
+    %define INTERP_OFFSET_PS        pd_n32768
+    %define INTERP_SHIFT_SP         10
+    %define INTERP_OFFSET_SP        pd_524800
+%elif BIT_DEPTH == 12
+    %define INTERP_SHIFT_PS         4
+    %define INTERP_OFFSET_PS        pd_n131072
+    %define INTERP_SHIFT_SP         8
+    %define INTERP_OFFSET_SP        pd_524416
+%else
+    %error Unsupport bit depth!
+%endif
+
+
 SECTION_RODATA 32
 
-tab_c_32:         times 4 dd 32
-tab_c_n32768:     times 4 dd -32768
+tab_c_32:         times 8 dd 32
 tab_c_524800:     times 4 dd 524800
 tab_c_n8192:      times 8 dw -8192
 pd_524800:        times 8 dd 524800
@@ -44,29 +63,53 @@
                   dw -2, 16, 54, -4
                   dw -2, 10, 58, -2
 
-tab_ChromaCoeffV: times 4 dw 0, 64
-                  times 4 dw 0, 0
+const tab_ChromaCoeffV,  times 8 dw 0, 64
+                         times 8 dw 0, 0
+
+                         times 8 dw -2, 58
+                         times 8 dw 10, -2
+
+                         times 8 dw -4, 54
+                         times 8 dw 16, -2
+
+                         times 8 dw -6, 46
+                         times 8 dw 28, -4
+
+                         times 8 dw -4, 36
+                         times 8 dw 36, -4
 
-                  times 4 dw -2, 58
-                  times 4 dw 10, -2
+                         times 8 dw -4, 28
+                         times 8 dw 46, -6
 
-                  times 4 dw -4, 54
-                  times 4 dw 16, -2
+                         times 8 dw -2, 16
+                         times 8 dw 54, -4
 
-                  times 4 dw -6, 46 
-                  times 4 dw 28, -4
+                         times 8 dw -2, 10
+                         times 8 dw 58, -2
 
-                  times 4 dw -4, 36
-                  times 4 dw 36, -4
+tab_ChromaCoeffVer: times 8 dw 0, 64
+                    times 8 dw 0, 0
 
-                  times 4 dw -4, 28
-                  times 4 dw 46, -6
+                    times 8 dw -2, 58
+                    times 8 dw 10, -2
 
-                  times 4 dw -2, 16
-                  times 4 dw 54, -4
+                    times 8 dw -4, 54
+                    times 8 dw 16, -2
 
-                  times 4 dw -2, 10
-                  times 4 dw 58, -2
+                    times 8 dw -6, 46
+                    times 8 dw 28, -4
+
+                    times 8 dw -4, 36
+                    times 8 dw 36, -4
+
+                    times 8 dw -4, 28
+                    times 8 dw 46, -6
+
+                    times 8 dw -2, 16
+                    times 8 dw 54, -4
+
+                    times 8 dw -2, 10
+                    times 8 dw 58, -2
 
 tab_LumaCoeff:    dw   0, 0,  0,  64,  0,   0,  0,  0
                   dw  -1, 4, -10, 58,  17, -5,  1,  0
@@ -115,11 +158,1024 @@
 
 const interp8_hps_shuf,     dd 0, 4, 1, 5, 2, 6, 3, 7
 
+const interp8_hpp_shuf,     db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+                            db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
+
+const pb_shuf,  db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+                db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
+
+
 SECTION .text
+cextern pd_8
 cextern pd_32
 cextern pw_pixel_max
+cextern pd_524416
 cextern pd_n32768
+cextern pd_n131072
 cextern pw_2000
+cextern idct8_shuf2
+
+%macro FILTER_LUMA_HOR_4_sse2 1         ; 8-term horizontal dot product for 4 consecutive outputs; %1 = byte offset added to r0; result (4 dwords) left in m4; clobbers m2, m3, m5
+    movu        m4,     [r0 + %1]       ; m4 = src[0-7]
+    movu        m5,     [r0 + %1 + 2]   ; m5 = src[1-8]
+    pmaddwd     m4,     m0              ; word products with m0, adjacent pairs summed -> 4 dwords (m0 presumably holds the 8 filter taps, loaded by the caller -- confirm)
+    pmaddwd     m5,     m0
+    pshufd      m2,     m4,     q2301   ; swap adjacent dwords
+    paddd       m4,     m2              ; dwords 0/1 = sum over terms 0-3, dwords 2/3 = sum over terms 4-7
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m4,     m4,     q3120   ; move the two distinct half-sums into the low qword
+    pshufd      m5,     m5,     q3120
+    punpcklqdq  m4,     m5              ; m4 = half-sums of output 0 (low qword) and output 1 (high qword)
+
+    movu        m5,     [r0 + %1 + 4]   ; m5 = src[2-9]
+    movu        m3,     [r0 + %1 + 6]   ; m3 = src[3-10]
+    pmaddwd     m5,     m0
+    pmaddwd     m3,     m0
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m2,     m3,     q2301
+    paddd       m3,     m2
+    pshufd      m5,     m5,     q3120
+    pshufd      m3,     m3,     q3120
+    punpcklqdq  m5,     m3              ; m5 = half-sums of output 2 (low qword) and output 3 (high qword)
+
+    pshufd      m2,     m4,     q2301   ; second reduction round: add the two half-sums of every output
+    paddd       m4,     m2
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m4,     m4,     q3120
+    pshufd      m5,     m5,     q3120
+    punpcklqdq  m4,     m5              ; m4 = complete sums for outputs 0, 1, 2, 3
+    paddd       m4,     m1              ; add m1 to every sum (presumably the rounding offset set up by the caller -- confirm)
+%endmacro
+
+%macro FILTER_LUMA_HOR_8_sse2 1
+    movu        m4,     [r0 + %1]       ; m4 = src[0-7]
+    movu        m5,     [r0 + %1 + 2]   ; m5 = src[1-8]
+    pmaddwd     m4,     m0
+    pmaddwd     m5,     m0
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m4,     m4,     q3120
+    pshufd      m5,     m5,     q3120
+    punpcklqdq  m4,     m5
+
+    movu        m5,     [r0 + %1 + 4]   ; m5 = src[2-9]
+    movu        m3,     [r0 + %1 + 6]   ; m3 = src[3-10]
+    pmaddwd     m5,     m0
+    pmaddwd     m3,     m0
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m2,     m3,     q2301
+    paddd       m3,     m2
+    pshufd      m5,     m5,     q3120
+    pshufd      m3,     m3,     q3120
+    punpcklqdq  m5,     m3
+
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m4,     m4,     q3120
+    pshufd      m5,     m5,     q3120
+    punpcklqdq  m4,     m5
+    paddd       m4,     m1
+
+    movu        m5,     [r0 + %1 + 8]   ; m5 = src[4-11]

x265_1.7.tar.gz/source/common/x86/ipfilter8.asm -> x265_1.8.tar.gz/source/common/x86/ipfilter8.asm Changed

@@ -301,6 +301,7 @@
 cextern pw_32
 cextern pw_512
 cextern pw_2000
+cextern pw_8192
 
 %macro FILTER_H4_w2_2_sse2 0
     pxor        m3, m3
@@ -330,80 +331,38 @@
 %endmacro
 
 ;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_2x4, 4, 6, 6, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m5,         [pw_32]
-
-%ifdef PIC
-    lea         r5,          [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-    FILTER_H4_w2_2_sse2
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-    FILTER_H4_w2_2_sse2
-
-    RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_4tap_horiz_pp_2xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
+%macro FILTER_H4_W2xN_sse3 1
 INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m5,         [pw_32]
+cglobal interp_4tap_horiz_pp_2x%1, 4, 6, 6, src, srcstride, dst, dststride
+    mov         r4d,    r4m
+    mova        m5,     [pw_32]
 
 %ifdef PIC
-    lea         r5,          [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
+    lea         r5,     [tabw_ChromaCoeff]
+    movddup     m4,     [r5 + r4 * 8]
 %else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
+    movddup     m4,     [tabw_ChromaCoeff + r4 * 8]
 %endif
 
 %assign x 1
-%rep 4
+%rep %1/2
     FILTER_H4_w2_2_sse2
-%if x < 4
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
+%if x < %1/2
+    lea         srcq,   [srcq + srcstrideq * 2]
+    lea         dstq,   [dstq + dststrideq * 2]
 %endif
 %assign x x+1
 %endrep
 
     RET
 
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_2x16, 4, 6, 6, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m5,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 8
-    FILTER_H4_w2_2_sse2
-%if x < 8
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-%endif
-%assign x x+1
-%endrep
+%endmacro
 
-    RET
+    FILTER_H4_W2xN_sse3 4
+    FILTER_H4_W2xN_sse3 8
+    FILTER_H4_W2xN_sse3 16
 
 %macro FILTER_H4_w4_2_sse2 0
     pxor        m5, m5
@@ -447,143 +406,41 @@
 %endmacro
 
 ;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x2, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-    FILTER_H4_w4_2_sse2
-
-    RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x4, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-    FILTER_H4_w4_2_sse2
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-    FILTER_H4_w4_2_sse2
-
-    RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x8, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 4
-    FILTER_H4_w4_2_sse2
-%if x < 4
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-%endif
-%assign x x+1
-%endrep
-
-    RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x16, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 8
-    FILTER_H4_w4_2_sse2
-%if x < 8
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-%endif
-%assign x x+1
-%endrep

x265_1.7.tar.gz/source/common/x86/ipfilter8.h -> x265_1.8.tar.gz/source/common/x86/ipfilter8.h Changed

@@ -24,912 +24,26 @@
 #ifndef X265_IPFILTER8_H
 #define X265_IPFILTER8_H
 
-#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
-    void x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
-    void x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
-
-#define LUMA_FILTERS(cpu) \
-    SETUP_LUMA_FUNC_DEF(4,   4, cpu); \
-    SETUP_LUMA_FUNC_DEF(8,   8, cpu); \
-    SETUP_LUMA_FUNC_DEF(8,   4, cpu); \
-    SETUP_LUMA_FUNC_DEF(4,   8, cpu); \
-    SETUP_LUMA_FUNC_DEF(16, 16, cpu); \
-    SETUP_LUMA_FUNC_DEF(16,  8, cpu); \
-    SETUP_LUMA_FUNC_DEF(8,  16, cpu); \
-    SETUP_LUMA_FUNC_DEF(16, 12, cpu); \
-    SETUP_LUMA_FUNC_DEF(12, 16, cpu); \
-    SETUP_LUMA_FUNC_DEF(16,  4, cpu); \
-    SETUP_LUMA_FUNC_DEF(4,  16, cpu); \
-    SETUP_LUMA_FUNC_DEF(32, 32, cpu); \
-    SETUP_LUMA_FUNC_DEF(32, 16, cpu); \
-    SETUP_LUMA_FUNC_DEF(16, 32, cpu); \
-    SETUP_LUMA_FUNC_DEF(32, 24, cpu); \
-    SETUP_LUMA_FUNC_DEF(24, 32, cpu); \
-    SETUP_LUMA_FUNC_DEF(32,  8, cpu); \
-    SETUP_LUMA_FUNC_DEF(8,  32, cpu); \
-    SETUP_LUMA_FUNC_DEF(64, 64, cpu); \
-    SETUP_LUMA_FUNC_DEF(64, 32, cpu); \
-    SETUP_LUMA_FUNC_DEF(32, 64, cpu); \
-    SETUP_LUMA_FUNC_DEF(64, 48, cpu); \
-    SETUP_LUMA_FUNC_DEF(48, 64, cpu); \
-    SETUP_LUMA_FUNC_DEF(64, 16, cpu); \
-    SETUP_LUMA_FUNC_DEF(16, 64, cpu)
-
-#define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
-    void x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
-
-#define LUMA_SP_FILTERS(cpu) \
-    SETUP_LUMA_SP_FUNC_DEF(4,   4, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(8,   8, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(8,   4, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(4,   8, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(16, 16, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(16,  8, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(8,  16, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(16, 12, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(12, 16, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(16,  4, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(4,  16, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(32, 32, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(32, 16, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(16, 32, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(32, 24, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(24, 32, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(32,  8, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(8,  32, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(64, 64, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(64, 32, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(32, 64, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(64, 48, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(48, 64, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(64, 16, cpu); \
-    SETUP_LUMA_SP_FUNC_DEF(16, 64, cpu);
-
-#define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \
-    void x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
-
-#define LUMA_SS_FILTERS(cpu) \
-    SETUP_LUMA_SS_FUNC_DEF(4,   4, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(8,   8, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(8,   4, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(4,   8, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(16, 16, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(16,  8, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(8,  16, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(16, 12, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(12, 16, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(16,  4, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(4,  16, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(32, 32, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(32, 16, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(16, 32, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(32, 24, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(24, 32, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(32,  8, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(8,  32, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(64, 64, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(64, 32, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(32, 64, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(64, 48, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(48, 64, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(64, 16, cpu); \
-    SETUP_LUMA_SS_FUNC_DEF(16, 64, cpu);
-
-#if HIGH_BIT_DEPTH
-
-#define SETUP_CHROMA_420_VERT_FUNC_DEF(W, H, cpu) \
-    void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
-
-#define CHROMA_420_VERT_FILTERS(cpu) \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 4, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 4, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 6, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 2, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 12, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(12, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 4, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 24, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(24, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 32, cpu)
-
-#define CHROMA_420_VERT_FILTERS_SSE4(cpu) \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(2, 4, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(2, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 2, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(6, 8, cpu);
-
-#define CHROMA_422_VERT_FILTERS(cpu) \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 12, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 4, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 24, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(12, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 64, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 64, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 48, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(24, 64, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 64, cpu);
-
-#define CHROMA_422_VERT_FILTERS_SSE4(cpu) \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(2, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(2, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 4, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(6, 16, cpu);
-
-#define CHROMA_444_VERT_FILTERS(cpu) \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 4, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 12, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(12, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 4, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(4, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 24, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(24, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 8, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(8, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(64, 64, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(64, 32, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(32, 64, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(64, 48, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(48, 64, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(64, 16, cpu); \
-    SETUP_CHROMA_420_VERT_FUNC_DEF(16, 64, cpu)
-
-#define SETUP_CHROMA_420_HORIZ_FUNC_DEF(W, H, cpu) \
-    void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
-    void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
-
-#define CHROMA_420_HORIZ_FILTERS(cpu) \
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 4, cpu); \
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 2, cpu); \
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(2, 4, cpu); \
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 8, cpu); \
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 4, cpu); \
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(4, 8, cpu); \
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(8, 6, cpu); \
-    SETUP_CHROMA_420_HORIZ_FUNC_DEF(6, 8, cpu); \

x265_1.7.tar.gz/source/common/x86/loopfilter.asm -> x265_1.8.tar.gz/source/common/x86/loopfilter.asm Changed

@@ -29,6 +29,7 @@
 
 SECTION_RODATA 32
 pb_31:      times 32 db 31
+pb_124:     times 32 db 124
 pb_15:      times 32 db 15
 pb_movemask_32:  times 32 db 0x00
                  times 32 db 0xFF
@@ -38,13 +39,118 @@
 cextern pb_128
 cextern pb_2
 cextern pw_2
+cextern pw_pixel_max
 cextern pb_movemask
+cextern pw_1
+cextern hmul_16p
+cextern pb_4
 
 
 ;============================================================================================================
 ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
 ;============================================================================================================
 INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+cglobal saoCuOrgE0, 4,5,9                   ; 16-bit-pixel SSE4 path: filters the row at rec, then the row at rec + stride, 16 pixels per iteration
+    mov         r4d, r4m                    ; r4 = 5th argument (stride in the prototype comment above)
+    movh        m6,  [r1]                   ; m6 = low 8 bytes read from offsetEo (pshufb lookup table)
+    movzx       r1d, byte [r3]              ; r1 = signLeft[0]
+    pxor        m5, m5                      ; m5 = 0 (zero fill for palignr and lower clamp bound)
+    neg         r1b                         ; negate: the carried value is stored negated, psignb below flips it back
+    movd        m0, r1d                     ; m0 byte 0 = -signLeft[0], all other bytes 0
+    lea         r1, [r0 + r4 * 2]           ; r1 = rec + stride * 2 bytes, i.e. the second row (2 bytes per pixel)
+    mov         r4d, r2d                    ; keep lcuWidth as the counter for .loopH (r2d is consumed by .loop)
+
+.loop:
+    movu        m7, [r0]                    ; m7 = pixels 0-7 (words)
+    movu        m8, [r0 + 16]               ; m8 = pixels 8-15
+    movu        m2, [r0 + 2]                ; m2 = pixels 1-8  (right neighbours of m7)
+    movu        m1, [r0 + 18]               ; m1 = pixels 9-16 (right neighbours of m8)
+
+    pcmpgtw     m3, m7, m2                  ; m3 = mask(cur > right)
+    pcmpgtw     m2, m7                      ; m2 = mask(right > cur)
+    pcmpgtw     m4, m8, m1
+    pcmpgtw     m1, m8                      ; same two masks for pixels 8-15
+
+    packsswb    m3, m4                      ; narrow word masks to 16 byte masks
+    packsswb    m2, m1
+
+    pand        m3, [pb_1]                  ; +1 where cur > right
+    por         m3, m2                      ; 0xFF (-1) where right > cur -> m3 = sign(cur - right) per byte
+
+    palignr     m2, m3, m5, 15              ; m2 = m3 moved up one byte, byte 0 = 0 (from m5)
+    por         m2, m0                      ; byte 0 = value carried in from the left (negated left sign)
+
+    mova        m4, [pw_pixel_max]          ; m4 = upper clamp bound
+    psignb      m2, [pb_128]                ; m2 = signLeft (bytes negated, assuming pb_128 bytes are 0x80 -- confirm)
+    pxor        m0, m0
+    palignr     m0, m3, 15                  ; m0 byte 0 = m3 byte 15: carry for the next 16 pixels, rest 0
+    paddb       m3, m2                      ; signRight + signLeft
+    paddb       m3, [pb_2]                  ; m3 = uiEdgeType
+    pshufb      m2, m6, m3                  ; m2 = table bytes from m6 selected by edge type
+    pmovsxbw    m3, m2                      ; offsetEo for pixels 0-7, sign-extended to words
+    punpckhbw   m2, m2                      ; duplicate high 8 bytes into both halves of each word
+    psraw       m2, 8                       ; arithmetic shift = sign-extended offsets for pixels 8-15
+    paddw       m7, m3                      ; add offsets
+    paddw       m8, m2
+    pmaxsw      m7, m5                      ; clamp to >= 0
+    pmaxsw      m8, m5
+    pminsw      m7, m4                      ; clamp to <= pw_pixel_max
+    pminsw      m8, m4
+    movu        [r0], m7                    ; store back in place
+    movu        [r0 + 16], m8
+
+    add         r0q, 32                     ; advance 16 pixels (32 bytes)
+    sub         r2d, 16
+    jnz        .loop                        ; exits only when the counter reaches exactly 0 (lcuWidth must be a positive multiple of 16)
+
+    movzx       r3d, byte [r3 + 1]          ; r3 = signLeft[1], the left sign for the second row
+    neg         r3b
+    movd        m0, r3d                     ; m0 byte 0 = -signLeft[1]
+.loopH:
+    movu        m7, [r1]                    ; second row: same sequence as .loop, addressed through r1
+    movu        m8, [r1 + 16]
+    movu        m2, [r1 + 2]
+    movu        m1, [r1 + 18]
+
+    pcmpgtw     m3, m7, m2
+    pcmpgtw     m2, m7
+    pcmpgtw     m4, m8, m1
+    pcmpgtw     m1, m8 
+
+    packsswb    m3, m4
+    packsswb    m2, m1
+
+    pand        m3, [pb_1]
+    por         m3, m2                      ; m3 = sign(cur - right) per byte
+
+    palignr     m2, m3, m5, 15
+    por         m2, m0
+
+    mova        m4, [pw_pixel_max]
+    psignb      m2, [pb_128]                ; m2 = signLeft
+    pxor        m0, m0
+    palignr     m0, m3, 15                  ; carry for the next 16 pixels of this row
+    paddb       m3, m2
+    paddb       m3, [pb_2]                  ; m3 = uiEdgeType
+    pshufb      m2, m6, m3
+    pmovsxbw    m3, m2                      ; offsetEo
+    punpckhbw   m2, m2
+    psraw       m2, 8
+    paddw       m7, m3
+    paddw       m8, m2
+    pmaxsw      m7, m5
+    pmaxsw      m8, m5
+    pminsw      m7, m4
+    pminsw      m8, m4
+    movu        [r1], m7
+    movu        [r1 + 16], m8
+
+    add         r1q, 32
+    sub         r4d, 16                     ; r4d = copy of lcuWidth saved before .loop
+    jnz        .loopH
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
 
     mov         r4d, r4m
@@ -130,8 +236,70 @@
     sub         r4d, 16
     jnz        .loopH
     RET
+%endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+cglobal saoCuOrgE0, 4,4,9                       ; HIGH_BIT_DEPTH AVX2 variant: each iteration handles 16 pixels at r0 and 16 at r0 + r1
+    vbroadcasti128  m6, [r1]                    ; m6 = 16 bytes at r1 (offsetEo per the prototype in loopfilter.h) copied to both lanes
+    movzx           r1d, byte [r3]              ; first byte at r3 (signLeft per the prototype)
+    neg             r1b                         ; stored negated; psignb in the loop presumably flips it back (TODO confirm pb_128 is all-negative bytes)
+    movd            xm0, r1d
+    movzx           r1d, byte [r3 + 1]          ; second byte at r3, same treatment
+    neg             r1b
+    movd            xm1, r1d
+    vinserti128     m0, m0, xm1, 1              ; m0 byte 0 of low / high lane = negated first / second byte at r3
+    mova            m5, [pw_pixel_max]          ; upper clamp bound
+    mov             r1d, r4m                    ; r1 = 5th argument (stride per the prototype)
+    add             r1d, r1d                    ; doubled, since pixels are accessed as 16-bit words below
+    shr             r2d, 4                      ; loop count = r2 / 16
+
+.loop:
+    movu            m7, [r0]                    ; 16 pixels at r0
+    movu            m8, [r0 + r1]               ; 16 pixels one stride further
+    movu            m2, [r0 + 2]                ; the same pixels shifted by one (right-hand neighbours)
+    movu            m1, [r0 + r1 + 2]
+
+    pcmpgtw         m3, m7, m2                  ; m3 = mask where pixel > right neighbour
+    pcmpgtw         m2, m7                      ; m2 = mask where right neighbour > pixel
+    pcmpgtw         m4, m8, m1
+    pcmpgtw         m1, m8
+
+    packsswb        m3, m4                      ; narrow word masks to bytes (per 128-bit lane, so rows come out interleaved)
+    packsswb        m2, m1
+    vpermq          m3, m3, 11011000b           ; reorder qwords 0,2,1,3: low lane = first row, high lane = second row
+    vpermq          m2, m2, 11011000b
+
+    pand            m3, [pb_1]
+    por             m3, m2                      ; combine both comparison results into one byte per pixel
+
+    pslldq          m2, m3, 1                   ; each lane moved up one byte: byte i now holds the value for pixel i-1
+    por             m2, m0                      ; byte 0 of each lane comes from the value carried in m0
+
+    psignb          m2, [pb_128]                ; m2 = signLeft
+    pxor            m0, m0
+    palignr         m0, m3, 15                  ; m0 byte 0 of each lane = m3 byte 15 of that lane, kept for the next iteration
+    paddb           m3, m2
+    paddb           m3, [pb_2]                  ; m3 = uiEdgeType
+    pshufb          m2, m6, m3                  ; pick bytes of m6 using the m3 bytes as indices
+    pmovsxbw        m3, xm2                     ; offsetEo
+    vextracti128    xm2, m2, 1                  ; high lane = offsets for the row at r0 + r1
+    pmovsxbw        m2, xm2
+    pxor            m4, m4                      ; zero = lower clamp bound
+    paddw           m7, m3
+    paddw           m8, m2
+    pmaxsw          m7, m4                      ; clamp results to [0, pw_pixel_max]
+    pmaxsw          m8, m4
+    pminsw          m7, m5
+    pminsw          m8, m5
+    movu            [r0], m7
+    movu            [r0 + r1], m8
+
+    add             r0q, 32                     ; advance 32 bytes (16 word-sized pixels)
+    dec             r2d
+    jnz             .loop
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
 
     mov                 r4d,        r4m
@@ -184,11 +352,68 @@
     sub                 r2d,        16

x265_1.7.tar.gz/source/common/x86/loopfilter.h -> x265_1.8.tar.gz/source/common/x86/loopfilter.h Changed

@@ -25,21 +25,24 @@
 #ifndef X265_LOOPFILTER_H
 #define X265_LOOPFILTER_H
 
-void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
-void x265_saoCuOrgE0_avx2(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
-void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
-void x265_saoCuOrgE1_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
-void x265_saoCuOrgE1_2Rows_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
-void x265_saoCuOrgE1_2Rows_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
-void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
-void x265_saoCuOrgE2_avx2(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
-void x265_saoCuOrgE2_32_avx2(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
-void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
-void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
-void x265_saoCuOrgE3_32_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
-void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
-void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
-void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
-void x265_calSign_avx2(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
+#define DECL_SAO(cpu) /* declares the SAO assembly prototypes for one CPU suffix; PFX() is defined elsewhere */ \
+    void PFX(saoCuOrgE0_ ## cpu)(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride); \
+    void PFX(saoCuOrgE1_ ## cpu)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width); \
+    void PFX(saoCuOrgE1_2Rows_ ## cpu)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width); \
+    void PFX(saoCuOrgE2_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); \
+    void PFX(saoCuOrgE2_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); /* NOTE(review): identical to the declaration on the previous line; redundant */ \
+    void PFX(saoCuOrgE2_32_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); \
+    void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
+    void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
+    void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
+    void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); /* the saoCuStats* prototypes all end with (stats, count) output pointers */ \
+    void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
+
+DECL_SAO(sse4);
+DECL_SAO(avx2);
 
 #endif // ifndef X265_LOOPFILTER_H

x265_1.7.tar.gz/source/common/x86/mc-a.asm -> x265_1.8.tar.gz/source/common/x86/mc-a.asm Changed

@@ -32,6 +32,19 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
+%if BIT_DEPTH==8                    ; per bit depth: ADDAVG_FACTOR = 1 << BIT_DEPTH, ADDAVG_ROUND = ADDAVG_FACTOR / 2
+    %define ADDAVG_FACTOR       256
+    %define ADDAVG_ROUND        128
+%elif BIT_DEPTH==10                 ; the addAvg kernels paste these values onto the pw_ prefix to pick their constants
+    %define ADDAVG_FACTOR       1024
+    %define ADDAVG_ROUND        512
+%elif BIT_DEPTH==12
+    %define ADDAVG_FACTOR       4096
+    %define ADDAVG_ROUND        2048
+%else                               ; any other BIT_DEPTH is rejected while assembling
+    %error Unsupport bit depth!
+%endif                              ; BIT_DEPTH
+
 SECTION_RODATA 32
 
 ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
@@ -54,11 +67,12 @@
 cextern pw_512
 cextern pw_1023
 cextern pw_1024
+cextern pw_2048
+cextern pw_4096
 cextern pw_00ff
 cextern pw_pixel_max
-cextern sw_64
 cextern pd_32
-cextern deinterleave_shufd
+cextern pd_64
 
 ;====================================================================================================================
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
@@ -93,23 +107,24 @@
     punpcklqdq    m1,          m2
     punpcklqdq    m3,          m5
     paddw         m1,          m3
-    pmulhrsw      m1,          [pw_1024]
-    paddw         m1,          [pw_512]
+    pmulhrsw      m1,          [pw_ %+ ADDAVG_FACTOR]
+    paddw         m1,          [pw_ %+ ADDAVG_ROUND]
 
     pxor          m0,          m0
     pmaxsw        m1,          m0
-    pminsw        m1,          [pw_1023]
+    pminsw        m1,          [pw_pixel_max]
     movd          [r2],        m1
     pextrd        [r2 + r5],   m1, 1
     lea           r2,          [r2 + 2 * r5]
     pextrd        [r2],        m1, 2
     pextrd        [r2 + r5],   m1, 3
-
     RET
+
+
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova          m0,          [pw_512]
+    mova          m0,          [pw_ %+ ADDAVG_ROUND]
     pxor          m7,          m7
     add           r3,          r3
     add           r4,          r4
@@ -137,11 +152,11 @@
     punpcklqdq    m1,          m2
     punpcklqdq    m3,          m5
     paddw         m1,          m3
-    pmulhrsw      m1,          [pw_1024]
+    pmulhrsw      m1,          [pw_ %+ ADDAVG_FACTOR]
     paddw         m1,          m0
 
     pmaxsw        m1,          m7
-    pminsw        m1,          [pw_1023]
+    pminsw        m1,          [pw_pixel_max]
     movd          [r2],        m1
     pextrd        [r2 + r5],   m1, 1
     lea           r2,          [r2 + 2 * r5]
@@ -157,8 +172,8 @@
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m6,         [pw_1023]
-    mova        m7,         [pw_1024]
+    mova        m6,         [pw_pixel_max]
+    mova        m7,         [pw_ %+ ADDAVG_FACTOR]
     mov         r6d,        16/4
     add         r3,         r3
     add         r4,         r4
@@ -184,7 +199,7 @@
     punpcklqdq  m3,         m5
     paddw       m1,         m3
     pmulhrsw    m1,         m7
-    paddw       m1,         [pw_512]
+    paddw       m1,         [pw_ %+ ADDAVG_ROUND]
     pxor        m0,         m0
     pmaxsw      m1,         m0
     pminsw      m1,         m6
@@ -214,21 +229,21 @@
     punpcklqdq     m0,          m1
     punpcklqdq     m2,          m3
     paddw          m0,          m2
-    pmulhrsw       m0,          [pw_1024]
-    paddw          m0,          [pw_512]
+    pmulhrsw       m0,          [pw_ %+ ADDAVG_FACTOR]
+    paddw          m0,          [pw_ %+ ADDAVG_ROUND]
 
     pxor           m6,          m6
     pmaxsw         m0,          m6
-    pminsw         m0,          [pw_1023]
+    pminsw         m0,          [pw_pixel_max]
     movh           [r2],        m0
     movhps         [r2 + r5],   m0
     RET
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,             [pw_512]
-    mova        m5,             [pw_1023]
-    mova        m7,             [pw_1024]
+    mova        m4,             [pw_ %+ ADDAVG_ROUND]
+    mova        m5,             [pw_pixel_max]
+    mova        m7,             [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,             m6
     add         r3,             r3
     add         r4,             r4
@@ -265,9 +280,9 @@
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,             [pw_512]
-    mova        m5,             [pw_1023]
-    mova        m7,             [pw_1024]
+    mova        m4,             [pw_ %+ ADDAVG_ROUND]
+    mova        m5,             [pw_pixel_max]
+    mova        m7,             [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,             m6
     mov         r6d,            16/2
     add         r3,             r3
@@ -301,9 +316,9 @@
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,          [pw_512]
-    mova        m5,          [pw_1023]
-    mova        m7,          [pw_1024]
+    mova        m4,          [pw_ %+ ADDAVG_ROUND]
+    mova        m5,          [pw_pixel_max]
+    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,          m6
     add         r3,          r3
     add         r4,          r4
@@ -332,9 +347,9 @@
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,          [pw_512]
-    mova        m5,          [pw_1023]
-    mova        m7,          [pw_1024]
+    mova        m4,          [pw_ %+ ADDAVG_ROUND]
+    mova        m5,          [pw_pixel_max]
+    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,          m6
     add         r3,          r3
     add         r4,          r4
@@ -371,9 +386,9 @@
 %macro ADDAVG_W4_H4 1
 INIT_XMM sse4
 cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova           m4,          [pw_512]
-    mova           m5,          [pw_1023]
-    mova           m7,          [pw_1024]
+    mova           m4,          [pw_ %+ ADDAVG_ROUND]
+    mova           m5,          [pw_pixel_max]
+    mova           m7,          [pw_ %+ ADDAVG_FACTOR]
     pxor           m6,          m6
     add            r3,          r3
     add            r4,          r4
@@ -421,9 +436,9 @@
 %macro ADDAVG_W8_H4 1
 INIT_XMM sse4
 cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,          [pw_512]
-    mova        m5,          [pw_1023]
-    mova        m7,          [pw_1024]
+    mova        m4,          [pw_ %+ ADDAVG_ROUND]
+    mova        m5,          [pw_pixel_max]
+    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,          m6
     add         r3,          r3
     add         r4,          r4
@@ -471,9 +486,9 @@
 %macro ADDAVG_W12_H4 1
 INIT_XMM sse4
 cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova           m4,             [pw_512]
-    mova           m5,             [pw_1023]
-    mova           m7,             [pw_1024]
+    mova           m4,             [pw_ %+ ADDAVG_ROUND]

x265_1.7.tar.gz/source/common/x86/mc-a2.asm -> x265_1.8.tar.gz/source/common/x86/mc-a2.asm Changed

@@ -692,7 +692,7 @@
 %endmacro
 
 %macro FILT32x4U 4
-    mova      m1, [r0+r5]
+    movu      m1, [r0+r5]
     pavgb     m0, m1, [r0]
     movu      m3, [r0+r5+1]
     pavgb     m2, m3, [r0+1]
@@ -701,7 +701,7 @@
     pavgb     m0, m2
     pavgb     m1, m3
 
-    mova      m3, [r0+r5+mmsize]
+    movu      m3, [r0+r5+mmsize]
     pavgb     m2, m3, [r0+mmsize]
     movu      m5, [r0+r5+1+mmsize]
     pavgb     m4, m5, [r0+1+mmsize]
@@ -722,10 +722,10 @@
     vpermq    m1, m4, q3120
     vpermq    m2, m2, q3120
     vpermq    m3, m5, q3120
-    mova    [%1], m0
-    mova    [%2], m1
-    mova    [%3], m2
-    mova    [%4], m3
+    movu    [%1], m0
+    movu    [%2], m1
+    movu    [%3], m2
+    movu    [%4], m3
 %endmacro
 
 %macro FILT16x2 4
@@ -796,8 +796,8 @@
 %endmacro
 
 %macro FILT8xA 4
-    mova      m3, [r0+%4+mmsize]
-    mova      m2, [r0+%4]
+    movu      m3, [r0+%4+mmsize]
+    movu      m2, [r0+%4]
     pavgw     m3, [r0+%4+r5+mmsize]
     pavgw     m2, [r0+%4+r5]
     PALIGNR   %1, m3, 2, m6
@@ -815,9 +815,13 @@
     packssdw  m3, %1
     packssdw  m5, m4
 %endif
-    mova    [%2], m3
-    mova    [%3], m5
-    mova      %1, m2
+%if cpuflag(avx2)
+    vpermq     m3, m3, q3120
+    vpermq     m5, m5, q3120
+%endif
+    movu    [%2], m3
+    movu    [%3], m5
+    movu      %1, m2
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -871,8 +875,8 @@
 .vloop:
     mov      r6d, r7m
 %ifnidn cpuname, mmx2
-    mova      m0, [r0]
-    mova      m1, [r0+r5]
+    movu      m0, [r0]
+    movu      m1, [r0+r5]
     pavgw     m0, m1
     pavgw     m1, [r0+r5*2]
 %endif
@@ -977,7 +981,7 @@
 FRAME_INIT_LOWRES
 INIT_XMM xop
 FRAME_INIT_LOWRES
-%if HIGH_BIT_DEPTH==0
+%if ARCH_X86_64 == 1
 INIT_YMM avx2
 FRAME_INIT_LOWRES
 %endif

x265_1.7.tar.gz/source/common/x86/mc.h -> x265_1.8.tar.gz/source/common/x86/mc.h Changed

@@ -25,45 +25,15 @@
 #define X265_MC_H
 
 #define LOWRES(cpu) \
-    void x265_frame_init_lowres_core_ ## cpu(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, \
+    void PFX(frame_init_lowres_core_ ## cpu)(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, \
                                              intptr_t src_stride, intptr_t dst_stride, int width, int height);
 LOWRES(mmx2)
 LOWRES(sse2)
 LOWRES(ssse3)
 LOWRES(avx)
+LOWRES(avx2)
 LOWRES(xop)
 
-#define DECL_SUF(func, args) \
-    void func ## _mmx2 args; \
-    void func ## _sse2 args; \
-    void func ## _ssse3 args;
-DECL_SUF(x265_pixel_avg_64x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_64x48, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_64x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_64x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_48x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x24, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_32x8,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_24x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x64, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x32, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x12, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x8,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_16x4,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_12x16, (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x32,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x16,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x8,   (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_8x4,   (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_4x16,  (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_4x8,   (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-DECL_SUF(x265_pixel_avg_4x4,   (pixel*, intptr_t, const pixel*, intptr_t, const pixel*, intptr_t, int))
-
 #undef LOWRES
-#undef DECL_SUF
 
 #endif // ifndef X265_MC_H

x265_1.7.tar.gz/source/common/x86/pixel-a.asm -> x265_1.8.tar.gz/source/common/x86/pixel-a.asm Changed

@@ -9,6 +9,7 @@
 ;*          Alex Izvorski <aizvorksi@gmail.com>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Oskar Arvidsson <oskar@irock.se>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -32,8 +33,6 @@
 %include "x86util.asm"
 
 SECTION_RODATA 32
-hmul_16p:  times 16 db 1
-           times 8 db 1, -1
 hmul_8p:   times 8 db 1
            times 4 db 1, -1
            times 8 db 1
@@ -45,8 +44,7 @@
            times 2 dw 1, -1
            times 4 dw 1
            times 2 dw 1, -1
-ALIGN 32
-hmul_w:    times 2 dw 1, -1, 1, -1, 1, -1, 1, -1
+
 ALIGN 32
 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
@@ -54,8 +52,6 @@
 sw_f0:     dq 0xfff0, 0
 pd_f0:     times 4 dd 0xffff0000
 
-pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
-
 SECTION .text
 
 cextern pb_0
@@ -72,6 +68,9 @@
 cextern pd_1
 cextern popcnt_table
 cextern pd_2
+cextern hmul_16p
+cextern pb_movemask
+cextern pw_pixel_max
 
 ;=============================================================================
 ; SATD
@@ -242,6 +241,12 @@
 %endif
     HADAMARD4_2D 4, 5, 6, 7, 3, %%n
     paddw m4, m6
+;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
+;    pxor m5, m5
+;    punpcklwd m6, m4, m5
+;    punpckhwd m4, m5
+;    paddd m4, m6
+;%endif
     SWAP %%n, 4
 %endmacro
 
@@ -257,15 +262,45 @@
     HADAMARD 1, max, %2, %4, %6, %7
 %endif
 %ifnidn %9, swap
+  %if (BIT_DEPTH == 12)
+    pxor m%6, m%6
+    punpcklwd m%7, m%2, m%6
+    punpckhwd m%2, m%6
+    paddd m%8, m%7
+    paddd m%8, m%2
+  %else
     paddw m%8, m%2
+  %endif
 %else
     SWAP %8, %2
+  %if (BIT_DEPTH == 12)
+    pxor m%6, m%6
+    punpcklwd m%7, m%8, m%6
+    punpckhwd m%8, m%6
+    paddd m%8, m%7
+  %endif
 %endif
 %if %1
+  %if (BIT_DEPTH == 12)
+    pxor m%6, m%6
+    punpcklwd m%7, m%4, m%6
+    punpckhwd m%4, m%6
+    paddd m%8, m%7
+    paddd m%8, m%4
+  %else
     paddw m%8, m%4
+  %endif
 %else
     HADAMARD 1, max, %3, %5, %6, %7
+  %if (BIT_DEPTH == 12)
+    pxor m%6, m%6
+    punpcklwd m%7, m%3, m%6
+    punpckhwd m%3, m%6
+    paddd m%8, m%7
+    paddd m%8, m%3
+  %else
     paddw m%8, m%3
+  %endif
 %endif
 %endmacro
 
@@ -281,29 +316,23 @@
 %endif
 
     pxor m%10, m%10
-    mova m%9, m%2
-    punpcklwd m%9, m%10
+    punpcklwd m%9, m%2, m%10
     paddd m%8, m%9
-    mova m%9, m%2
-    punpckhwd m%9, m%10
+    punpckhwd m%9, m%2, m%10
     paddd m%8, m%9
 
 %if %1
     pxor m%10, m%10
-    mova m%9, m%4
-    punpcklwd m%9, m%10
+    punpcklwd m%9, m%4, m%10
     paddd m%8, m%9
-    mova m%9, m%4
-    punpckhwd m%9, m%10
+    punpckhwd m%9, m%4, m%10
     paddd m%8, m%9
 %else
     HADAMARD 1, max, %3, %5, %6, %7
     pxor m%10, m%10
-    mova m%9, m%3
-    punpcklwd m%9, m%10
+    punpcklwd m%9, m%3, m%10
     paddd m%8, m%9
-    mova m%9, m%3
-    punpckhwd m%9, m%10
+    punpckhwd m%9, m%3, m%10
     paddd m%8, m%9
 %endif
 %endmacro
@@ -326,6 +355,7 @@
     movd       eax, m0
     and        eax, 0xffff
 %endif ; HIGH_BIT_DEPTH
+    EMMS
     RET
 %endmacro
 
@@ -336,136 +366,10 @@
 ; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 INIT_MMX mmx2
-cglobal pixel_satd_16x4_internal
-    SATD_4x4_MMX m2,  0, 0
-    SATD_4x4_MMX m1,  4, 0
-    paddw        m0, m2
-    SATD_4x4_MMX m2,  8, 0
-    paddw        m0, m1
-    SATD_4x4_MMX m1, 12, 0
-    paddw        m0, m2
-    paddw        m0, m1
-    ret
-
-cglobal pixel_satd_8x8_internal
-    SATD_4x4_MMX m2,  0, 0
-    SATD_4x4_MMX m1,  4, 1
-    paddw        m0, m2
-    paddw        m0, m1
-pixel_satd_8x4_internal_mmx2:
-    SATD_4x4_MMX m2,  0, 0
-    SATD_4x4_MMX m1,  4, 0
-    paddw        m0, m2
-    paddw        m0, m1
-    ret
-
-%if HIGH_BIT_DEPTH
-%macro SATD_MxN_MMX 3
-cglobal pixel_satd_%1x%2, 4,7
-    SATD_START_MMX
-    pxor   m0, m0
-    call pixel_satd_%1x%3_internal_mmx2
-    HADDUW m0, m1
-    movd  r6d, m0
-%rep %2/%3-1
-    pxor   m0, m0
-    lea    r0, [r0+4*r1]
-    lea    r2, [r2+4*r3]
-    call pixel_satd_%1x%3_internal_mmx2
-    movd   m2, r4
-    HADDUW m0, m1
-    movd   r4, m0
-    add    r6, r4
-    movd   r4, m2
-%endrep
-    movifnidn eax, r6d
-    RET
-%endmacro
-
-SATD_MxN_MMX 16, 16, 4

x265_1.7.tar.gz/source/common/x86/pixel-util.h -> x265_1.8.tar.gz/source/common/x86/pixel-util.h Changed

@@ -24,117 +24,36 @@
 #ifndef X265_PIXEL_UTIL_H
 #define X265_PIXEL_UTIL_H
 
-void x265_getResidual4_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
-void x265_getResidual8_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
-void x265_getResidual16_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
-void x265_getResidual16_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
-void x265_getResidual32_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
-void x265_getResidual32_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
-void x265_getResidual16_avx2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
-void x265_getResidual32_avx2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
-
-void x265_transpose4_sse2(pixel* dest, const pixel* src, intptr_t stride);
-void x265_transpose8_sse2(pixel* dest, const pixel* src, intptr_t stride);
-void x265_transpose16_sse2(pixel* dest, const pixel* src, intptr_t stride);
-void x265_transpose32_sse2(pixel* dest, const pixel* src, intptr_t stride);
-void x265_transpose64_sse2(pixel* dest, const pixel* src, intptr_t stride);
-
-void x265_transpose8_avx2(pixel* dest, const pixel* src, intptr_t stride);
-void x265_transpose16_avx2(pixel* dest, const pixel* src, intptr_t stride);
-void x265_transpose32_avx2(pixel* dest, const pixel* src, intptr_t stride);
-void x265_transpose64_avx2(pixel* dest, const pixel* src, intptr_t stride);
-
-uint32_t x265_quant_sse4(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
-uint32_t x265_quant_avx2(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
-uint32_t x265_nquant_sse4(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
-uint32_t x265_nquant_avx2(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
-void x265_dequant_normal_sse4(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
-void x265_dequant_normal_avx2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
-
-int x265_count_nonzero_4x4_ssse3(const int16_t* quantCoeff);
-int x265_count_nonzero_8x8_ssse3(const int16_t* quantCoeff);
-int x265_count_nonzero_16x16_ssse3(const int16_t* quantCoeff);
-int x265_count_nonzero_32x32_ssse3(const int16_t* quantCoeff);
-int x265_count_nonzero_4x4_avx2(const int16_t* quantCoeff);
-int x265_count_nonzero_8x8_avx2(const int16_t* quantCoeff);
-int x265_count_nonzero_16x16_avx2(const int16_t* quantCoeff);
-int x265_count_nonzero_32x32_avx2(const int16_t* quantCoeff);
-
-void x265_weight_pp_sse4(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
-void x265_weight_pp_avx2(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
-void x265_weight_sp_sse4(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
-
-void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t* pix1, intptr_t stride1,
-                                     const uint8_t* pix2, intptr_t stride2, int sums[2][4]);
-void x265_pixel_ssim_4x4x2_core_sse2(const pixel* pix1, intptr_t stride1,
-                                     const pixel* pix2, intptr_t stride2, int sums[2][4]);
-void x265_pixel_ssim_4x4x2_core_avx(const pixel* pix1, intptr_t stride1,
-                                    const pixel* pix2, intptr_t stride2, int sums[2][4]);
-float x265_pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width);
-float x265_pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width);
-
-void x265_scale1D_128to64_ssse3(pixel*, const pixel*);
-void x265_scale1D_128to64_avx2(pixel*, const pixel*);
-void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
-void x265_scale2D_64to32_avx2(pixel*, const pixel*, intptr_t);
-
-int x265_scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
-int x265_scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
-uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
-
-#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
-    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t*  dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
-    void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t*  src1, intptr_t srcStride0, intptr_t srcStride1);
-
-#define CHROMA_420_PIXELSUB_DEF(cpu) \
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 8, cpu); \
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 32, cpu);
-
-#define CHROMA_422_PIXELSUB_DEF(cpu) \
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 8, cpu); \
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 16, cpu); \
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 32, cpu); \
-    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 64, cpu);
-
-#define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \
-    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t*  dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
-    void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t*  src1, intptr_t srcStride0, intptr_t srcStride1);
-
-#define LUMA_PIXELSUB_DEF(cpu) \
-    SETUP_LUMA_PIXELSUB_PS_FUNC(8,   8, cpu); \
-    SETUP_LUMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
-    SETUP_LUMA_PIXELSUB_PS_FUNC(32, 32, cpu); \
-    SETUP_LUMA_PIXELSUB_PS_FUNC(64, 64, cpu);
-
-LUMA_PIXELSUB_DEF(_sse2);
-CHROMA_420_PIXELSUB_DEF(_sse2);
-CHROMA_422_PIXELSUB_DEF(_sse2);
-
-LUMA_PIXELSUB_DEF(_sse4);
-CHROMA_420_PIXELSUB_DEF(_sse4);
-CHROMA_422_PIXELSUB_DEF(_sse4);
-
-#define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \
-    uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(const pixel* pix, intptr_t pixstride);
-
-#define LUMA_PIXELVAR_DEF(cpu) \
-    SETUP_LUMA_PIXELVAR_FUNC(8,   8, cpu); \
-    SETUP_LUMA_PIXELVAR_FUNC(16, 16, cpu); \
-    SETUP_LUMA_PIXELVAR_FUNC(32, 32, cpu); \
-    SETUP_LUMA_PIXELVAR_FUNC(64, 64, cpu);
-
-LUMA_PIXELVAR_DEF(_sse2);
-LUMA_PIXELVAR_DEF(_xop);
-LUMA_PIXELVAR_DEF(_avx);
-
-#undef CHROMA_420_PIXELSUB_DEF
-#undef CHROMA_422_PIXELSUB_DEF
-#undef LUMA_PIXELSUB_DEF
-#undef LUMA_PIXELVAR_DEF
-#undef SETUP_CHROMA_PIXELSUB_PS_FUNC
-#undef SETUP_LUMA_PIXELSUB_PS_FUNC
-#undef SETUP_LUMA_PIXELVAR_FUNC
+#define DEFINE_UTILS(cpu) /* declares the pixel-utility assembly prototypes for one CPU suffix */ \
+    FUNCDEF_TU_S2(void, getResidual, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); /* FUNCDEF_TU_S2 / FUNCDEF_TU are defined elsewhere; presumably one prototype per transform size - TODO confirm */ \
+    FUNCDEF_TU_S2(void, transpose, cpu, pixel* dest, const pixel* src, intptr_t stride); \
+    FUNCDEF_TU(int, count_nonzero, cpu, const int16_t* quantCoeff); \
+    uint32_t PFX(quant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)); /* here PFX() wraps the name together with its parameter list */ \
+    uint32_t PFX(nquant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)); \
+    void PFX(dequant_normal_ ## cpu(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)); \
+    void PFX(dequant_scaling_## cpu(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)); \
+    void PFX(weight_pp_ ## cpu(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)); \
+    void PFX(weight_sp_ ## cpu(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)); \
+    void PFX(scale1D_128to64_ ## cpu(pixel*, const pixel*)); \
+    void PFX(scale2D_64to32_ ## cpu(pixel*, const pixel*, intptr_t)); \
+    uint32_t PFX(costCoeffRemain_ ## cpu(uint16_t *absCoeff, int numNonZero, int idx)); \
+    uint32_t PFX(costC1C2Flag_sse2(uint16_t *absCoeff, intptr_t numNonZero, uint8_t *baseCtxMod, intptr_t ctxOffset)); /* NOTE(review): suffix is hard-coded to sse2 rather than pasted from cpu, so every expansion repeats this declaration; the trailing backslash continues onto the empty line that follows */ \
+
+DEFINE_UTILS(sse2);
+DEFINE_UTILS(ssse3);
+DEFINE_UTILS(sse4);
+DEFINE_UTILS(avx2);
+
+#undef DEFINE_UTILS
+
+void PFX(pixel_ssim_4x4x2_core_sse2(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
+void PFX(pixel_ssim_4x4x2_core_avx(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
+float PFX(pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width));
+float PFX(pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width));
+
+int PFX(scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
+int PFX(scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
+uint32_t PFX(findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]));
+uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
 
 #endif // ifndef X265_PIXEL_UTIL_H

x265_1.7.tar.gz/source/common/x86/pixel-util8.asm -> x265_1.8.tar.gz/source/common/x86/pixel-util8.asm Changed

@@ -28,7 +28,12 @@
 
 SECTION_RODATA 32
 
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+ssim_c1:   times 4 dd 107321.76    ; .01*.01*4095*4095*64
+ssim_c2:   times 4 dd 60851437.92  ; .03*.03*4095*4095*64*63
+pf_64:     times 4 dd 64.0
+pf_128:    times 4 dd 128.0
+%elif BIT_DEPTH == 10
 ssim_c1:   times 4 dd 6697.7856    ; .01*.01*1023*1023*64
 ssim_c2:   times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
 pf_64:     times 4 dd 64.0
@@ -45,18 +50,15 @@
                         times 16 db 0
 deinterleave_shuf:      times  2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
 deinterleave_word_shuf: times  2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
-hmul_16p:               times 16 db 1
-                        times  8 db 1, -1
 hmulw_16p:              times  8 dw 1
                         times  4 dw 1, -1
 
-trans8_shuf:            dd 0, 4, 1, 5, 2, 6, 3, 7
-
 SECTION .text
 
 cextern pw_1
 cextern pw_0_15
 cextern pb_1
+cextern pb_128
 cextern pw_00ff
 cextern pw_1023
 cextern pw_3fff
@@ -72,6 +74,10 @@
 cextern pb_16
 cextern pb_32
 cextern pb_64
+cextern hmul_16p
+cextern trans8_shuf
+cextern_naked private_prefix %+ _entropyStateBits
+cextern pb_movemask
 
 ;-----------------------------------------------------------------------------
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
@@ -627,7 +633,12 @@
     movd            xm6, r4d            ; m6 = qbits8
 
     ; fill offset
+%if UNIX64 == 0
     vpbroadcastd    m5, r5m             ; m5 = add
+%else ; Mac
+    movd           xm5, r5m
+    vpbroadcastd    m5, xm5             ; m5 = add
+%endif
 
     lea             r5, [pw_1]
 
@@ -699,7 +710,12 @@
     movd            xm6, r4d        ; m6 = qbits8
 
     ; fill offset
-    vpbroadcastd    m5, r5m         ; m5 = ad
+%if UNIX64 == 0
+    vpbroadcastd    m5, r5m         ; m5 = add
+%else ; Mac
+    movd           xm5, r5m
+    vpbroadcastd    m5, xm5         ; m5 = add
+%endif
 
     lea             r5, [pd_1]
 
@@ -817,7 +833,12 @@
 
 INIT_YMM avx2
 cglobal nquant, 3,5,7
+%if UNIX64 == 0
     vpbroadcastd m4, r4m
+%else ; Mac
+    movd         xm4, r4m
+    vpbroadcastd m4, xm4
+%endif
     vpbroadcastd m6, [pw_1]
     mov         r4d, r5m
     pxor        m5, m5              ; m5 = numZero
@@ -873,8 +894,8 @@
 %if HIGH_BIT_DEPTH
     cmp         r3d, 32767
     jle         .skip
-    shr         r3d, 2
-    sub         r4d, 2
+    shr         r3d, (BIT_DEPTH - 8)
+    sub         r4d, (BIT_DEPTH - 8)
 .skip:
 %endif
     movd        m0, r4d             ; m0 = shift
@@ -903,6 +924,136 @@
     jnz        .loop
     RET
 
+;----------------------------------------------------------------------------------------------------------------------
+;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
+;----------------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal dequant_scaling, 6,6,6
+    add         r5d, 4          ; shift += 4
+    shr         r3d, 3          ; num/8: each loop iteration handles 8 coefficients
+    cmp         r5d, r4d
+    jle         .skip           ; shift <= per: take the left-shift path (.part1)
+    sub         r5d, r4d
+    mova        m0, [pd_1]
+    movd        m1, r5d         ; shift - per
+    dec         r5d
+    movd        m2, r5d         ; shift - per - 1
+    pslld       m0, m2          ; rounding term: 1 << (shift - per - 1)
+
+.part0:                         ; right-shift path: (src * coef + round) >> (shift - per)
+    pmovsxwd    m2, [r0]        ; sign-extend 4 words from r0 to dwords
+    pmovsxwd    m4, [r0 + 8]    ; next 4 words
+    movu        m3, [r1]        ; 4 dwords from r1
+    movu        m5, [r1 + 16]   ; next 4 dwords
+    pmulld      m2, m3          ; 32-bit products
+    pmulld      m4, m5
+    paddd       m2, m0          ; add rounding term
+    paddd       m4, m0
+    psrad       m2, m1          ; arithmetic shift right by (shift - per)
+    psrad       m4, m1
+    packssdw    m2, m4          ; saturate the 8 dwords back to signed words
+    movu        [r2], m2        ; store 8 words to r2
+
+    add         r0, 16          ; advance by 8 words
+    add         r1, 32          ; advance by 8 dwords
+    add         r2, 16          ; advance by 8 words
+    dec         r3d
+    jnz         .part0
+    jmp         .end
+
+.skip:
+    sub         r4d, r5d        ; per - shift
+    movd        m0, r4d         ; m0 = left-shift count
+
+.part1:                         ; left-shift path: saturate(src * coef) << (per - shift)
+    pmovsxwd    m2, [r0]        ; sign-extend 4 words from r0 to dwords
+    pmovsxwd    m4, [r0 + 8]    ; next 4 words
+    movu        m3, [r1]        ; 4 dwords from r1
+    movu        m5, [r1 + 16]   ; next 4 dwords
+    pmulld      m2, m3          ; 32-bit products
+    pmulld      m4, m5
+    packssdw    m2, m4          ; saturate products to signed words before shifting
+    pmovsxwd    m1, m2          ; re-extend low 4 words to dwords
+    psrldq      m2, 8           ; bring the high 4 words down to the low half
+    pmovsxwd    m2, m2          ; re-extend them to dwords
+    pslld       m1, m0          ; shift left by (per - shift)
+    pslld       m2, m0
+    packssdw    m1, m2          ; saturate again after the shift
+    movu        [r2], m1        ; store 8 words to r2
+
+    add         r0, 16          ; advance by 8 words
+    add         r1, 32          ; advance by 8 dwords
+    add         r2, 16          ; advance by 8 words
+    dec         r3d
+    jnz         .part1
+.end:
+    RET
+
+;----------------------------------------------------------------------------------------------------------------------
+;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
+;----------------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal dequant_scaling, 6,6,6
+    add         r5d, 4
+    shr         r3d, 4          ; num/16
+    cmp         r5d, r4d
+    jle         .skip
+    sub         r5d, r4d
+    mova        m0, [pd_1]
+    movd        xm1, r5d         ; shift - per
+    dec         r5d
+    movd        xm2, r5d         ; shift - per - 1
+    pslld       m0, xm2          ; 1 << shift - per - 1
+
+.part0:
+    pmovsxwd    m2, [r0]
+    pmovsxwd    m4, [r0 + 16]
+    movu        m3, [r1]
+    movu        m5, [r1 + 32]
+    pmulld      m2, m3
+    pmulld      m4, m5
+    paddd       m2, m0
+    paddd       m4, m0
+    psrad       m2, xm1
+    psrad       m4, xm1
+    packssdw    m2, m4
+    vpermq      m2, m2, 11011000b
+    movu        [r2], m2
+
+    add         r0, 32
+    add         r1, 64
+    add         r2, 32
+    dec         r3d

x265_1.7.tar.gz/source/common/x86/pixel.h -> x265_1.8.tar.gz/source/common/x86/pixel.h Changed

@@ -28,260 +28,41 @@
 #ifndef X265_I386_PIXEL_H
 #define X265_I386_PIXEL_H
 
-#define DECL_PIXELS(ret, name, suffix, args) \
-    ret x265_pixel_ ## name ## _16x64_ ## suffix args; \
-    ret x265_pixel_ ## name ## _16x32_ ## suffix args; \
-    ret x265_pixel_ ## name ## _16x16_ ## suffix args; \
-    ret x265_pixel_ ## name ## _16x12_ ## suffix args; \
-    ret x265_pixel_ ## name ## _16x8_ ## suffix args; \
-    ret x265_pixel_ ## name ## _16x4_ ## suffix args; \
-    ret x265_pixel_ ## name ## _8x32_ ## suffix args; \
-    ret x265_pixel_ ## name ## _8x16_ ## suffix args; \
-    ret x265_pixel_ ## name ## _8x8_ ## suffix args; \
-    ret x265_pixel_ ## name ## _8x4_ ## suffix args; \
-    ret x265_pixel_ ## name ## _4x16_ ## suffix args; \
-    ret x265_pixel_ ## name ## _4x8_ ## suffix args; \
-    ret x265_pixel_ ## name ## _4x4_ ## suffix args; \
-    ret x265_pixel_ ## name ## _32x8_ ## suffix args; \
-    ret x265_pixel_ ## name ## _32x16_ ## suffix args; \
-    ret x265_pixel_ ## name ## _32x24_ ## suffix args; \
-    ret x265_pixel_ ## name ## _24x32_ ## suffix args; \
-    ret x265_pixel_ ## name ## _32x32_ ## suffix args; \
-    ret x265_pixel_ ## name ## _32x64_ ## suffix args; \
-    ret x265_pixel_ ## name ## _64x16_ ## suffix args; \
-    ret x265_pixel_ ## name ## _64x32_ ## suffix args; \
-    ret x265_pixel_ ## name ## _64x48_ ## suffix args; \
-    ret x265_pixel_ ## name ## _64x64_ ## suffix args; \
-    ret x265_pixel_ ## name ## _48x64_ ## suffix args; \
-    ret x265_pixel_ ## name ## _24x32_ ## suffix args; \
-    ret x265_pixel_ ## name ## _12x16_ ## suffix args; \
-
-#define DECL_X1(name, suffix) \
-    DECL_PIXELS(int, name, suffix, (const pixel*, intptr_t, const pixel*, intptr_t))
-
-#define DECL_X1_SS(name, suffix) \
-    DECL_PIXELS(int, name, suffix, (const int16_t*, intptr_t, const int16_t*, intptr_t))
-
-#define DECL_X1_SP(name, suffix) \
-    DECL_PIXELS(int, name, suffix, (const int16_t*, intptr_t, const pixel*, intptr_t))
-
-#define DECL_X4(name, suffix) \
-    DECL_PIXELS(void, name ## _x3, suffix, (const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*)) \
-    DECL_PIXELS(void, name ## _x4, suffix, (const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*))
-
-/* sad-a.asm */
-DECL_X1(sad, mmx2)
-DECL_X1(sad, sse2)
-DECL_X4(sad, sse2_misalign)
-DECL_X1(sad, sse3)
-DECL_X1(sad, sse2_aligned)
-DECL_X1(sad, ssse3)
-DECL_X1(sad, ssse3_aligned)
-DECL_X1(sad, avx2)
-DECL_X1(sad, avx2_aligned)
-DECL_X4(sad, mmx2)
-DECL_X4(sad, sse2)
-DECL_X4(sad, sse3)
-DECL_X4(sad, ssse3)
-DECL_X4(sad, avx)
-DECL_X4(sad, avx2)
-DECL_X1(sad, cache32_mmx2);
-DECL_X1(sad, cache64_mmx2);
-DECL_X1(sad, cache64_sse2);
-DECL_X1(sad, cache64_ssse3);
-DECL_X4(sad, cache32_mmx2);
-DECL_X4(sad, cache64_mmx2);
-DECL_X4(sad, cache64_sse2);
-DECL_X4(sad, cache64_ssse3);
-
-/* pixel-a.asm */
-DECL_X1(satd, mmx2)
-DECL_X1(satd, sse2)
-DECL_X1(satd, ssse3)
-DECL_X1(satd, ssse3_atom)
-DECL_X1(satd, sse4)
-DECL_X1(satd, avx)
-DECL_X1(satd, xop)
-DECL_X1(satd, avx2)
-int x265_pixel_satd_16x24_avx(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_satd_32x48_avx(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_satd_24x64_avx(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_satd_8x64_avx(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_satd_8x12_avx(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_satd_12x32_avx(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_satd_4x32_avx(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_satd_8x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_satd_16x4_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_satd_16x12_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_satd_16x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_satd_16x64_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
-
-DECL_X1(sa8d, mmx2)
-DECL_X1(sa8d, sse2)
-DECL_X1(sa8d, ssse3)
-DECL_X1(sa8d, ssse3_atom)
-DECL_X1(sa8d, sse4)
-DECL_X1(sa8d, avx)
-DECL_X1(sa8d, xop)
-DECL_X1(sa8d, avx2)
-
-/* ssd-a.asm */
-DECL_X1(ssd, mmx)
-DECL_X1(ssd, mmx2)
-DECL_X1(ssd, sse2slow)
-DECL_X1(ssd, sse2)
-DECL_X1(ssd, ssse3)
-DECL_X1(ssd, avx)
-DECL_X1(ssd, xop)
-DECL_X1(ssd, avx2)
-DECL_X1_SS(ssd_ss, mmx)
-DECL_X1_SS(ssd_ss, mmx2)
-DECL_X1_SS(ssd_ss, sse2slow)
-DECL_X1_SS(ssd_ss, sse2)
-DECL_X1_SS(ssd_ss, ssse3)
-DECL_X1_SS(ssd_ss, sse4)
-DECL_X1_SS(ssd_ss, avx)
-DECL_X1_SS(ssd_ss, xop)
-DECL_X1_SS(ssd_ss, avx2)
-DECL_X1_SP(ssd_sp, sse4)
-#define DECL_HEVC_SSD(suffix) \
-    int x265_pixel_ssd_32x64_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_16x64_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_32x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_32x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_16x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_32x24_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_24x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_32x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_8x32_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_16x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_16x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_8x16_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_16x12_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_16x4_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_8x8_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t); \
-    int x265_pixel_ssd_8x4_ ## suffix(const pixel*, intptr_t, const pixel*, intptr_t);
-DECL_HEVC_SSD(sse2)
-DECL_HEVC_SSD(ssse3)
-DECL_HEVC_SSD(avx)
-
-int x265_pixel_ssd_12x16_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_ssd_24x32_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_ssd_48x64_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_ssd_64x16_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_ssd_64x32_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_ssd_64x48_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
-int x265_pixel_ssd_64x64_sse4(const pixel*, intptr_t, const pixel*, intptr_t);
-
-int x265_pixel_ssd_s_4_sse2(const int16_t*, intptr_t);
-int x265_pixel_ssd_s_8_sse2(const int16_t*, intptr_t);
-int x265_pixel_ssd_s_16_sse2(const int16_t*, intptr_t);
-int x265_pixel_ssd_s_32_sse2(const int16_t*, intptr_t);
-int x265_pixel_ssd_s_16_avx2(const int16_t*, intptr_t);
-int x265_pixel_ssd_s_32_avx2(const int16_t*, intptr_t);
-
-#define ADDAVG(func)  \
-    void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
-    void x265_ ## func ## _avx2(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
-ADDAVG(addAvg_2x4)
-ADDAVG(addAvg_2x8)
-ADDAVG(addAvg_4x2);
-ADDAVG(addAvg_4x4)
-ADDAVG(addAvg_4x8)
-ADDAVG(addAvg_4x16)
-ADDAVG(addAvg_6x8)
-ADDAVG(addAvg_8x2)
-ADDAVG(addAvg_8x4)
-ADDAVG(addAvg_8x6)
-ADDAVG(addAvg_8x8)
-ADDAVG(addAvg_8x16)
-ADDAVG(addAvg_8x32)
-ADDAVG(addAvg_12x16)
-ADDAVG(addAvg_16x4)
-ADDAVG(addAvg_16x8)
-ADDAVG(addAvg_16x12)
-ADDAVG(addAvg_16x16)
-ADDAVG(addAvg_16x32)
-ADDAVG(addAvg_16x64)
-ADDAVG(addAvg_24x32)
-ADDAVG(addAvg_32x8)
-ADDAVG(addAvg_32x16)
-ADDAVG(addAvg_32x24)
-ADDAVG(addAvg_32x32)
-ADDAVG(addAvg_32x64)
-ADDAVG(addAvg_48x64)
-ADDAVG(addAvg_64x16)
-ADDAVG(addAvg_64x32)
-ADDAVG(addAvg_64x48)
-ADDAVG(addAvg_64x64)
-
-ADDAVG(addAvg_2x16)
-ADDAVG(addAvg_4x32)
-ADDAVG(addAvg_6x16)
-ADDAVG(addAvg_8x12)
-ADDAVG(addAvg_8x64)
-ADDAVG(addAvg_12x32)
-ADDAVG(addAvg_16x24)
-ADDAVG(addAvg_24x64)
-ADDAVG(addAvg_32x48)

x265_1.7.tar.gz/source/common/x86/sad-a.asm -> x265_1.8.tar.gz/source/common/x86/sad-a.asm Changed

@@ -7,6 +7,7 @@
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Alex Izvorski <aizvorksi@gmail.com>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -32,15 +33,13 @@
 SECTION_RODATA 32
 
 MSK:                  db 255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0
-pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
-hpred_shuf:           db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
 
 SECTION .text
 
 cextern pb_3
 cextern pb_shuf8x8c
 cextern pw_8
-cextern sw_64
+cextern pd_64
 
 ;=============================================================================
 ; SAD MMX
@@ -2784,6 +2783,83 @@
 %endif
 %endmacro
 
+%macro SAD_X4_START_2x32P_AVX2 0
+    mova        m4, [r0]                ; first row from r0 (aligned load)
+    movu        m0, [r1]                ; same row from each of r1..r4 (unaligned loads)
+    movu        m2, [r2]
+    movu        m1, [r3]
+    movu        m3, [r4]
+    psadbw      m0, m4                  ; sums of absolute differences against the r0 row
+    psadbw      m2, m4
+    psadbw      m1, m4
+    psadbw      m3, m4
+    packusdw    m0, m2                  ; m0 initialised with the r1/r2 results
+    packusdw    m1, m3                  ; m1 initialised with the r3/r4 results
+
+    mova        m6, [r0+FENC_STRIDE]    ; second row from r0
+    movu        m2, [r1+r5]             ; second row from r1..r4, r5 bytes further on
+    movu        m4, [r2+r5]
+    movu        m3, [r3+r5]
+    movu        m5, [r4+r5]
+    psadbw      m2, m6
+    psadbw      m4, m6
+    psadbw      m3, m6
+    psadbw      m5, m6
+    packusdw    m2, m4
+    packusdw    m3, m5
+    paddd       m0, m2                  ; accumulate second row into m0/m1
+    paddd       m1, m3
+%endmacro
+
+%macro SAD_X4_2x32P_AVX2 4
+    mova        m6, [r0+%1]     ; %1 = offset of the first row within r0
+    movu        m2, [r1+%2]     ; %2 = offset of the first row within r1..r4
+    movu        m4, [r2+%2]
+    movu        m3, [r3+%2]
+    movu        m5, [r4+%2]
+    psadbw      m2, m6          ; sums of absolute differences against the r0 row
+    psadbw      m4, m6
+    psadbw      m3, m6
+    psadbw      m5, m6
+    packusdw    m2, m4          ; pair the r1/r2 results
+    packusdw    m3, m5          ; pair the r3/r4 results
+    paddd       m0, m2          ; accumulate into the running sums m0/m1
+    paddd       m1, m3
+
+    mova        m6, [r0+%3]     ; %3 = offset of the second row within r0
+    movu        m2, [r1+%4]     ; %4 = offset of the second row within r1..r4
+    movu        m4, [r2+%4]
+    movu        m3, [r3+%4]
+    movu        m5, [r4+%4]
+    psadbw      m2, m6
+    psadbw      m4, m6
+    psadbw      m3, m6
+    psadbw      m5, m6
+    packusdw    m2, m4
+    packusdw    m3, m5
+    paddd       m0, m2          ; accumulate into the running sums m0/m1
+    paddd       m1, m3
+%endmacro
+
+%macro SAD_X4_4x32P_AVX2 2
+%if %1==0                       ; %1 = index of this 4-row step, %2 = total number of steps
+    lea  r6, [r5*3]             ; r6 = 3 * r5, offset of the fourth row in r1..r4
+    SAD_X4_START_2x32P_AVX2
+%else
+    SAD_X4_2x32P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
+%endif
+    SAD_X4_2x32P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
+%if %1 != %2-1                  ; no pointer advance after the last step
+%if (%1&1) != 0                 ; odd steps read r0 rows 4..7, so r0 moves only every second step
+    add  r0, 8*FENC_STRIDE
+%endif
+    lea  r1, [r1+4*r5]          ; advance r1..r4 by four rows
+    lea  r2, [r2+4*r5]
+    lea  r3, [r3+4*r5]
+    lea  r4, [r4+4*r5]
+%endif
+%endmacro
+
 %macro SAD_X3_END_AVX2 0
     movifnidn r5, r5mp
     packssdw  m0, m1        ; 0 0 1 1 0 0 1 1
@@ -2808,6 +2884,17 @@
     RET
 %endmacro
 
+%macro SAD_X4_32P_END_AVX2 0
+    mov          r0, r6mp       ; load argument 6 -- presumably the scores output pointer; confirm against the prototype
+    vextracti128 xm2, m0, 1     ; upper 128-bit lane of each accumulator
+    vextracti128 xm3, m1, 1
+    paddd        xm0, xm2       ; fold upper lane into lower lane
+    paddd        xm1, xm3
+    phaddd       xm0, xm1       ; horizontal pairwise add leaves four dword totals
+    mova         [r0], xm0      ; aligned 16-byte store of the four totals
+    RET
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
 ;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
@@ -3320,7 +3407,12 @@
     SAD_X%1_4x%2P_AVX2 x, %3/4
 %assign x x+1
 %endrep
+
+  %if (%1==4) && (%2==32)
+    SAD_X%1_32P_END_AVX2
+  %else
     SAD_X%1_END_AVX2
+  %endif
 %endmacro
 
 INIT_YMM avx2
@@ -3333,6 +3425,12 @@
 SAD_X_AVX2 4, 16, 12, 8
 SAD_X_AVX2 4, 16,  8, 8
 
+SAD_X_AVX2 4, 32,  8, 8
+SAD_X_AVX2 4, 32, 16, 8
+SAD_X_AVX2 4, 32, 24, 8
+SAD_X_AVX2 4, 32, 32, 8
+SAD_X_AVX2 4, 32, 64, 8
+
 ;=============================================================================
 ; SAD cacheline split
 ;=============================================================================
@@ -3440,7 +3538,7 @@
     jle pixel_sad_%1x%2_mmx2
     and    eax, 7
     shl    eax, 3
-    movd   mm6, [sw_64]
+    movd   mm6, [pd_64]
     movd   mm7, eax
     psubw  mm6, mm7
     PROLOGUE 4,5

x265_1.7.tar.gz/source/common/x86/sad16-a.asm -> x265_1.8.tar.gz/source/common/x86/sad16-a.asm Changed

@@ -6,6 +6,7 @@
 ;* Authors: Oskar Arvidsson <oskar@irock.se>
 ;*          Henrik Gramner <henrik@gramner.com>
 ;*          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -51,8 +52,14 @@
     lea     r2, [r2+2*r3]
     paddw   m1, m2
     paddw   m3, m4
+  %if BIT_DEPTH <= 10
     paddw   m0, m1
     paddw   m0, m3
+  %else
+    paddw   m1, m3
+    pmaddwd m1, [pw_1]
+    paddd   m0, m1
+  %endif
 %endmacro
 
 %macro SAD_INC_2x8P_MMX 0
@@ -70,8 +77,14 @@
     lea     r2, [r2+4*r3]
     paddw   m1, m2
     paddw   m3, m4
+  %if BIT_DEPTH <= 10
     paddw   m0, m1
     paddw   m0, m3
+  %else
+    paddw   m1, m3
+    pmaddwd m1, [pw_1]
+    paddd   m0, m1
+  %endif
 %endmacro
 
 %macro SAD_INC_2x4P_MMX 0
@@ -82,8 +95,14 @@
     ABSW2   m1, m2, m1, m2, m3, m4
     lea     r0, [r0+4*r1]
     lea     r2, [r2+4*r3]
+  %if BIT_DEPTH <= 10
     paddw   m0, m1
     paddw   m0, m2
+  %else
+    paddw   m1, m2
+    pmaddwd m1, [pw_1]
+    paddd   m0, m1
+  %endif
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -103,9 +122,17 @@
     jg .loop
 %endif
 %if %1*%2 == 256
+  %if BIT_DEPTH <= 10
     HADDUW  m0, m1
+  %else
+    HADDD  m0, m1
+  %endif
 %else
+  %if BIT_DEPTH <= 10
     HADDW   m0, m1
+  %else
+    HADDD  m0, m1
+  %endif
 %endif
     movd   eax, m0
     RET
@@ -276,8 +303,9 @@
     ABSW2   m3, m4, m3, m4, m7, m5
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m1, m3
+    pmaddwd m1, [pw_1]
+    paddd   m0, m1
 %else
     movu    m1, [r2]
     movu    m2, [r2+2*r3]
@@ -286,8 +314,9 @@
     ABSW2   m1, m2, m1, m2, m3, m4
     lea     r0, [r0+4*r1]
     lea     r2, [r2+4*r3]
-    paddw   m0, m1
-    paddw   m0, m2
+    paddw   m1, m2
+    pmaddwd m1, [pw_1]
+    paddd   m0, m1
 %endif
 %endmacro
 
@@ -307,8 +336,9 @@
     ABSW2   m3, m4, m3, m4, m7, m5
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m8, m3
+    paddw   m1, m3
+    pmaddwd m1, [pw_1]
+    paddd   m0, m1
 %else
     movu    m1, [r2]
     movu    m2, [r2 + 2 * r3]
@@ -317,8 +347,9 @@
     ABSW2   m1, m2, m1, m2, m3, m4
     lea     r0, [r0 + 4 * r1]
     lea     r2, [r2 + 4 * r3]
-    paddw   m0, m1
-    paddw   m8, m2
+    paddw   m1, m2
+    pmaddwd m1, [pw_1]
+    paddd   m0, m1
 %endif
 %endmacro
 
@@ -326,7 +357,7 @@
 ; int pixel_sad_NxM(uint16_t *, intptr_t, uint16_t *, intptr_t)
 ; ---------------------------------------------------------------------------- -
 %macro SAD 2
-cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
+cglobal pixel_sad_%1x%2, 4,5,8
     pxor    m0, m0
 %if %2 == 4
     SAD_INC_2ROW %1
@@ -338,12 +369,7 @@
     dec    r4d
     jg .loop
 %endif
-%if %2 == 32
-    HADDUWD m0, m1
     HADDD   m0, m1
-%else
-    HADDW   m0, m1
-%endif
     movd    eax, xm0
     RET
 %endmacro
@@ -352,21 +378,15 @@
 ; int pixel_sad_Nx64(uint16_t *, intptr_t, uint16_t *, intptr_t)
 ; ---------------------------------------------------------------------------- -
 %macro SAD_Nx64 1
-cglobal pixel_sad_%1x64, 4,5-(64&4/4), 9
+cglobal pixel_sad_%1x64, 4,5, 8
     pxor    m0, m0
-    pxor    m8, m8
     mov     r4d, 64 / 2
 .loop:
     SAD_INC_2ROW_Nx64 %1
     dec    r4d
     jg .loop
 
-    HADDUWD m0, m1
-    HADDUWD m8, m1
     HADDD   m0, m1
-    HADDD   m8, m1
-    paddd   m0, m8
-
     movd    eax, xm0
     RET
 %endmacro
@@ -392,6 +412,654 @@
 SAD  16, 16
 SAD  16, 32
 
+INIT_YMM avx2
+cglobal pixel_sad_16x64, 4,7,4
+    pxor    m0, m0
+    pxor    m3, m3
+    mov     r4d, 64 / 8
+    add     r3d, r3d
+    add     r1d, r1d
+    lea     r5,     [r1 * 3]
+    lea     r6,     [r3 * 3]
+.loop:
+    movu    m1, [r2]
+    movu    m2, [r2 + r3]
+    psubw   m1, [r0]
+    psubw   m2, [r0 + r1]
+    pabsw   m1, m1
+    pabsw   m2, m2
+    paddw   m0, m1
+    paddw   m3, m2
+
+    movu    m1, [r2 + 2 * r3]
+    movu    m2, [r2 + r6]
+    psubw   m1, [r0 + 2 * r1]
+    psubw   m2, [r0 + r5]
+    pabsw   m1, m1
+    pabsw   m2, m2
+    paddw   m0, m1
+    paddw   m3, m2
+
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+
+    movu    m1, [r2]

x265_1.7.tar.gz/source/common/x86/ssd-a.asm -> x265_1.8.tar.gz/source/common/x86/ssd-a.asm Changed

@@ -113,6 +113,62 @@
     RET
 %endmacro
 
+; Function to find ssd for 32x16 block, sse2, 12 bit depth
+; Defined separately to be called from SSD_ONE_32 macro
+INIT_XMM sse2
+cglobal ssd_ss_32x16
+    pxor        m8, m8              ; m8 = dword accumulator for this call
+    mov         r4d, 16             ; row counter: 16 rows
+.loop:
+    movu        m0, [r0]            ; four registers (4*mmsize bytes) of one row from r0
+    movu        m1, [r0+mmsize]
+    movu        m2, [r0+2*mmsize]
+    movu        m3, [r0+3*mmsize]
+    movu        m4, [r2]            ; matching row from r2
+    movu        m5, [r2+mmsize]
+    movu        m6, [r2+2*mmsize]
+    movu        m7, [r2+3*mmsize]
+    psubw       m0, m4              ; word differences
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
+    add         r0, r1              ; advance both pointers by their strides (r1, r3)
+    add         r2, r3
+    pmaddwd     m0, m0              ; square the differences and add adjacent pairs into dwords
+    pmaddwd     m1, m1
+    pmaddwd     m2, m2
+    pmaddwd     m3, m3
+    paddd       m2, m3
+    paddd       m0, m1
+    paddd       m0, m2
+    paddd       m8, m0              ; accumulate this row
+    dec         r4d
+    jnz         .loop
+
+    mova        m4, m8
+    pxor        m5, m5
+    punpckldq   m8, m5              ; zero-extend the low two dwords to qwords
+    punpckhdq   m4, m5              ; zero-extend the high two dwords to qwords
+    paddq       m4, m8
+    movhlps     m5, m4              ; high qword of the sum moved to the low position
+    paddq       m4, m5              ; low qword now holds the 64-bit total for these 16 rows
+    paddq       m9, m4              ; add to the running total in m9 (zeroed by SSD_ONE_32 before its calls)
+    ret
+
+%macro SSD_ONE_32 0
+cglobal pixel_ssd_ss_32x64, 4,7,10
+    add         r1d, r1d            ; double both strides -- presumably element count to byte count; confirm
+    add         r3d, r3d
+    pxor        m9, m9              ; m9 = 64-bit running total updated by ssd_ss_32x16
+    xor         r4, r4              ; clear r4 (ssd_ss_32x16 reloads r4d with its own row count)
+    call        ssd_ss_32x16        ; four calls of 16 rows each cover the 64 rows
+    call        ssd_ss_32x16
+    call        ssd_ss_32x16
+    call        ssd_ss_32x16
+    movq        rax, m9             ; return the 64-bit total
+    RET
+%endmacro
+
 %macro SSD_TWO 2
 cglobal pixel_ssd_ss_%1x%2, 4,7,8
     FIX_STRIDES r1, r3
@@ -312,6 +368,124 @@
     movd   eax, xm0
     RET
 %endmacro
+
+INIT_YMM avx2
+cglobal pixel_ssd_16x16, 4,7,8
+    FIX_STRIDES r1, r3              ; macro defined elsewhere -- presumably scales strides to bytes; confirm
+    lea     r5, [3 * r1]            ; r5 = 3 * stride of r0
+    lea     r6, [3 * r3]            ; r6 = 3 * stride of r2
+    mov    r4d, 4                   ; 4 iterations of 4 rows each
+    pxor    m0, m0                  ; m0 = dword accumulator
+.loop:
+    movu    m1, [r0]                ; four consecutive rows from r0
+    movu    m2, [r0 + r1]
+    movu    m3, [r0 + r1 * 2]
+    movu    m4, [r0 + r5]
+    movu    m6, [r2]                ; matching rows from r2, two at a time through m6/m7
+    movu    m7, [r2 + r3]
+    psubw   m1, m6                  ; word differences
+    psubw   m2, m7
+    movu    m6, [r2 + r3 * 2]
+    movu    m7, [r2 + r6]
+    psubw   m3, m6
+    psubw   m4, m7
+
+    lea     r0, [r0 + r1 * 4]       ; advance both pointers by four rows
+    lea     r2, [r2 + r3 * 4]
+
+    pmaddwd m1, m1                  ; square the differences and add adjacent pairs into dwords
+    pmaddwd m2, m2
+    pmaddwd m3, m3
+    pmaddwd m4, m4
+    paddd   m1, m2
+    paddd   m3, m4
+    paddd   m0, m1                  ; accumulate
+    paddd   m0, m3
+
+    dec    r4d
+    jg .loop
+
+    HADDD   m0, m5                  ; macro defined elsewhere -- presumably horizontal dword sum with m5 as scratch
+    movd   eax, xm0                 ; return the low dword
+    RET
+
+INIT_YMM avx2
+cglobal pixel_ssd_32x32, 4,7,8
+    add     r1, r1                  ; double both strides -- presumably element count to byte count; confirm
+    add     r3, r3
+    mov     r4d, 16                 ; 16 iterations of 2 rows each
+    pxor    m0, m0                  ; m0 = dword accumulator
+.loop:
+    movu    m1, [r0]                ; row 0 from r0, two 32-byte halves
+    movu    m2, [r0 + 32]
+    movu    m3, [r0 + r1]           ; row 1 from r0, two 32-byte halves
+    movu    m4, [r0 + r1 + 32]
+    movu    m6, [r2]                ; row 0 from r2
+    movu    m7, [r2 + 32]
+    psubw   m1, m6                  ; word differences
+    psubw   m2, m7
+    movu    m6, [r2 + r3]           ; row 1 from r2
+    movu    m7, [r2 + r3 + 32]
+    psubw   m3, m6
+    psubw   m4, m7
+
+    lea     r0, [r0 + r1 * 2]       ; advance both pointers by two rows
+    lea     r2, [r2 + r3 * 2]
+
+    pmaddwd m1, m1                  ; square the differences and add adjacent pairs into dwords
+    pmaddwd m2, m2
+    pmaddwd m3, m3
+    pmaddwd m4, m4
+    paddd   m1, m2
+    paddd   m3, m4
+    paddd   m0, m1                  ; accumulate
+    paddd   m0, m3
+
+    dec    r4d
+    jg .loop
+
+    HADDD   m0, m5                  ; macro defined elsewhere -- presumably horizontal dword sum with m5 as scratch
+    movd   eax, xm0                 ; return the low dword
+    RET
+
+INIT_YMM avx2
+cglobal pixel_ssd_64x64, 4,7,8
+    FIX_STRIDES r1, r3              ; macro defined elsewhere -- presumably scales strides to bytes; confirm
+    mov    r4d, 64                  ; 64 iterations of one row each
+    pxor    m0, m0                  ; m0 = dword accumulator
+.loop:
+    movu    m1, [r0]                ; one row from r0 as four 32-byte loads
+    movu    m2, [r0+32]
+    movu    m3, [r0+32*2]
+    movu    m4, [r0+32*3]
+    movu    m6, [r2]                ; matching row from r2, two loads at a time through m6/m7
+    movu    m7, [r2+32]
+    psubw   m1, m6                  ; word differences
+    psubw   m2, m7
+    movu    m6, [r2+32*2]
+    movu    m7, [r2+32*3]
+    psubw   m3, m6
+    psubw   m4, m7
+
+    lea     r0, [r0+r1]             ; advance both pointers by one row
+    lea     r2, [r2+r3]
+
+    pmaddwd m1, m1                  ; square the differences and add adjacent pairs into dwords
+    pmaddwd m2, m2
+    pmaddwd m3, m3
+    pmaddwd m4, m4
+    paddd   m1, m2
+    paddd   m3, m4
+    paddd   m0, m1                  ; accumulate
+    paddd   m0, m3
+
+    dec    r4d
+    jg .loop
+
+    HADDD   m0, m5                  ; macro defined elsewhere -- presumably horizontal dword sum with m5 as scratch
+    movd   eax, xm0                 ; return the low dword
+    RET
+
 INIT_MMX mmx2
 SSD_ONE     4,  4
 SSD_ONE     4,  8
@@ -338,7 +512,13 @@
 SSD_ONE    32, 16
 SSD_ONE    32, 24
 SSD_ONE    32, 32
-SSD_ONE    32, 64
+
+%if BIT_DEPTH <= 10
+    SSD_ONE    32, 64
+%else
+    SSD_ONE_32
+%endif
+

x265_1.7.tar.gz/source/common/x86/x86inc.asm -> x265_1.8.tar.gz/source/common/x86/x86inc.asm Changed

x265_1.7.tar.gz/source/common/x86/x86util.asm -> x265_1.8.tar.gz/source/common/x86/x86util.asm Changed

x265_1.7.tar.gz/source/common/yuv.cpp -> x265_1.8.tar.gz/source/common/yuv.cpp Changed

x265_1.7.tar.gz/source/common/yuv.h -> x265_1.8.tar.gz/source/common/yuv.h Changed

x265_1.7.tar.gz/source/compat/getopt/getopt.h -> x265_1.8.tar.gz/source/compat/getopt/getopt.h Changed

@@ -144,23 +144,23 @@
 /* Many other libraries have conflicting prototypes for getopt, with
    differences in the consts, in stdlib.h.  To avoid compilation
    errors, only prototype getopt for the GNU C library.  */
-extern int getopt (int __argc, char *const *__argv, const char *__shortopts);
+extern int getopt (int argc, char *const *argv, const char *shortopts);
 # else /* not __GNU_LIBRARY__ */
 extern int getopt ();
 # endif /* __GNU_LIBRARY__ */
 
 # ifndef __need_getopt
-extern int getopt_long (int __argc, char *const *__argv, const char *__shortopts,
-		        const struct option *__longopts, int32_t *__longind);
-extern int getopt_long_only (int __argc, char *const *__argv,
-			     const char *__shortopts,
-		             const struct option *__longopts, int32_t *__longind);
+extern int getopt_long (int argc, char *const *argv, const char *shortopts,
+		        const struct option *longopts, int32_t *longind);
+extern int getopt_long_only (int argc, char *const *argv,
+			     const char *shortopts,
+		             const struct option *longopts, int32_t *longind);
 
 /* Internal only.  Users should not call this directly.  */
-extern int _getopt_internal (int __argc, char *const *__argv,
-			     const char *__shortopts,
-		             const struct option *__longopts, int32_t *__longind,
-			     int __long_only);
+extern int _getopt_internal (int argc, char *const *argv,
+			     const char *shortopts,
+		             const struct option *longopts, int32_t *longind,
+			     int longonly);
 # endif
 #else /* not __STDC__ */
 extern int getopt ();

x265_1.7.tar.gz/source/compat/msvc/stdint.h -> x265_1.8.tar.gz/source/compat/msvc/stdint.h Changed

x265_1.7.tar.gz/source/encoder/CMakeLists.txt -> x265_1.8.tar.gz/source/encoder/CMakeLists.txt Changed

x265_1.7.tar.gz/source/encoder/analysis.cpp -> x265_1.8.tar.gz/source/encoder/analysis.cpp Changed

@@ -33,7 +33,7 @@
 #include "rdcost.h"
 #include "encoder.h"
 
-using namespace x265;
+using namespace X265_NS;
 
 /* An explanation of rate distortion levels (--rd-level)
  * 
@@ -209,24 +209,20 @@
         return;
     else if (md.bestMode->cu.isIntra(0))
     {
-        m_quant.m_tqBypass = true;
         md.pred[PRED_LOSSLESS].initCosts();
         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
         PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
         uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
         checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL);
         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
-        m_quant.m_tqBypass = false;
     }
     else
     {
-        m_quant.m_tqBypass = true;
         md.pred[PRED_LOSSLESS].initCosts();
         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
         md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
         encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
-        m_quant.m_tqBypass = false;
     }
 }
 
@@ -385,6 +381,8 @@
     /* perform Mode task, repeat until no more work is available */
     do
     {
+        uint32_t refMasks[2] = { 0, 0 };
+
         if (m_param->rdLevel <= 4)
         {
             switch (pmode.modes[task])
@@ -396,33 +394,33 @@
                 break;
 
             case PRED_2Nx2N:
-                slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N);
+                slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
                 if (m_slice->m_sliceType == B_SLICE)
                     slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
                 break;
 
             case PRED_Nx2N:
-                slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N);
+                slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
                 break;
 
             case PRED_2NxN:
-                slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN);
+                slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
                 break;
 
             case PRED_2NxnU:
-                slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU);
+                slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
                 break;
 
             case PRED_2NxnD:
-                slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD);
+                slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
                 break;
 
             case PRED_nLx2N:
-                slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N);
+                slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
                 break;
 
             case PRED_nRx2N:
-                slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N);
+                slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
                 break;
 
             default:
@@ -441,7 +439,7 @@
                 break;
 
             case PRED_2Nx2N:
-                slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N);
+                slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
                 md.pred[PRED_BIDIR].rdCost = MAX_INT64;
                 if (m_slice->m_sliceType == B_SLICE)
                 {
@@ -452,27 +450,27 @@
                 break;
 
             case PRED_Nx2N:
-                slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N);
+                slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
                 break;
 
             case PRED_2NxN:
-                slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN);
+                slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
                 break;
 
             case PRED_2NxnU:
-                slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU);
+                slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
                 break;
 
             case PRED_2NxnD:
-                slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD);
+                slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
                 break;
 
             case PRED_nLx2N:
-                slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N);
+                slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
                 break;
 
             case PRED_nRx2N:
-                slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N);
+                slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
                 break;
 
             default:
@@ -581,7 +579,8 @@
                 /* RD selection between merge, inter, bidir and intra */
                 if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                 {
-                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                    uint32_t numPU = bestInter->cu.getNumPartInter(0);
+                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                     {
                         PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
                         motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
@@ -617,7 +616,8 @@
                 else if (!md.bestMode->cu.m_mergeFlag[0])
                 {
                     /* finally code the best mode selected from SA8D costs */
-                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
+                    uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
+                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                     {
                         PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
                         motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
@@ -746,7 +746,7 @@
         md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
 }
 
-void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
+uint32_t Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
 {
     uint32_t depth = cuGeom.depth;
     uint32_t cuAddr = parentCTU.m_cuAddr;
@@ -756,24 +756,104 @@
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
     uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
-
+    bool earlyskip = false;
+    bool splitIntra = true;
+    uint32_t splitRefs[4] = { 0, 0, 0, 0 };
+    /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
     if (mightNotSplit && depth >= minDepth)
     {
-        bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
-
         /* Compute Merge Cost */
         md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
         md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
         checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
-
-        bool earlyskip = false;
         if (m_param->rdLevel)
             earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
+    }
+
+    bool bNoSplit = false;
+    if (md.bestMode)
+    {
+        bNoSplit = md.bestMode->cu.isSkipped(0);
+        if (mightSplit && depth && depth >= minDepth && !bNoSplit)
+            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
+    }
+
+    /* Step 2. Evaluate each of the 4 split sub-blocks in series */
+    if (mightSplit && !bNoSplit)
+    {
+        Mode* splitPred = &md.pred[PRED_SPLIT];
+        splitPred->initCosts();
+        CUData* splitCU = &splitPred->cu;
+        splitCU->initSubCU(parentCTU, cuGeom, qp);
+
+        uint32_t nextDepth = depth + 1;
+        ModeDepth& nd = m_modeDepth[nextDepth];
+        invalidateContexts(nextDepth);
+        Entropy* nextContext = &m_rqt[depth].cur;
+        int nextQP = qp;

x265_1.7.tar.gz/source/encoder/analysis.h -> x265_1.8.tar.gz/source/encoder/analysis.h Changed

@@ -35,7 +35,7 @@
 #include "entropy.h"
 #include "search.h"
 
-namespace x265 {
+namespace X265_NS {
 // private namespace
 
 class Entropy;
@@ -113,16 +113,16 @@
 
     /* full analysis for a P or B slice CU */
     void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
-    void compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
-    void compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
+    uint32_t compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
+    uint32_t compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
 
     /* measure merge and skip */
     void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
-    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isSkipMode);
+    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isShareMergeCand);
 
     /* measure inter options */
-    void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize);
-    void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize);
+    void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
+    void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
 
     void checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom);

x265_1.7.tar.gz/source/encoder/api.cpp -> x265_1.8.tar.gz/source/encoder/api.cpp Changed

@@ -31,25 +31,69 @@
 #include "nal.h"
 #include "bitcost.h"
 
-using namespace x265;
+/* multilib namespace reflectors */
+#if LINKED_8BIT
+namespace x265_8bit {
+const x265_api* x265_api_get(int bitDepth);
+const x265_api* x265_api_query(int bitDepth, int apiVersion, int* err);
+}
+#endif
+
+#if LINKED_10BIT
+namespace x265_10bit {
+const x265_api* x265_api_get(int bitDepth);
+const x265_api* x265_api_query(int bitDepth, int apiVersion, int* err);
+}
+#endif
+
+#if LINKED_12BIT
+namespace x265_12bit {
+const x265_api* x265_api_get(int bitDepth);
+const x265_api* x265_api_query(int bitDepth, int apiVersion, int* err);
+}
+#endif
+
+#if EXPORT_C_API
+/* these functions are exported as C functions (default) */
+using namespace X265_NS;
+extern "C" {
+#else
+/* these functions exist within private namespace (multilib) */
+namespace X265_NS {
+#endif
 
-extern "C"
 x265_encoder *x265_encoder_open(x265_param *p)
 {
     if (!p)
         return NULL;
 
+#if _MSC_VER
+#pragma warning(disable: 4127) // conditional expression is constant, yes I know
+#endif
+
+#if HIGH_BIT_DEPTH
+    if (X265_DEPTH == 12)
+        x265_log(p, X265_LOG_WARNING, "Main12 is HIGHLY experimental, do not use!\n");
+    else if (X265_DEPTH != 10 && X265_DEPTH != 12)
+#else
+    if (X265_DEPTH != 8)
+#endif
+    {
+        x265_log(p, X265_LOG_ERROR, "Build error, internal bit depth mismatch\n");
+        return NULL;
+    }
+
     Encoder* encoder = NULL;
-    x265_param* param = x265_param_alloc();
-    x265_param* latestParam = x265_param_alloc();
+    x265_param* param = PARAM_NS::x265_param_alloc();
+    x265_param* latestParam = PARAM_NS::x265_param_alloc();
     if (!param || !latestParam)
         goto fail;
 
     memcpy(param, p, sizeof(x265_param));
-    x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", x265_version_str);
-    x265_log(param, X265_LOG_INFO, "build info %s\n", x265_build_info_str);
+    x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", PFX(version_str));
+    x265_log(param, X265_LOG_INFO, "build info %s\n", PFX(build_info_str));
 
-    x265_setup_primitives(param, param->cpuid);
+    x265_setup_primitives(param);
 
     if (x265_check_params(param))
         goto fail;
@@ -59,7 +103,7 @@
 
     encoder = new Encoder;
     if (!param->rc.bEnableSlowFirstPass)
-        x265_param_apply_fastfirstpass(param);
+        PARAM_NS::x265_param_apply_fastfirstpass(param);
 
     // may change params for auto-detect, etc
     encoder->configure(param);
@@ -87,12 +131,11 @@
 
 fail:
     delete encoder;
-    x265_param_free(param);
-    x265_param_free(latestParam);
+    PARAM_NS::x265_param_free(param);
+    PARAM_NS::x265_param_free(latestParam);
     return NULL;
 }
 
-extern "C"
 int x265_encoder_headers(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal)
 {
     if (pp_nal && enc)
@@ -109,7 +152,6 @@
     return -1;
 }
 
-extern "C"
 void x265_encoder_parameters(x265_encoder *enc, x265_param *out)
 {
     if (enc && out)
@@ -119,7 +161,6 @@
     }
 }
 
-extern "C"
 int x265_encoder_reconfig(x265_encoder* enc, x265_param* param_in)
 {
     if (!enc || !param_in)
@@ -140,7 +181,6 @@
     return ret;
 }
 
-extern "C"
 int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out)
 {
     if (!enc)
@@ -175,7 +215,6 @@
     return numEncoded;
 }
 
-extern "C"
 void x265_encoder_get_stats(x265_encoder *enc, x265_stats *outputStats, uint32_t statsSizeBytes)
 {
     if (enc && outputStats)
@@ -185,17 +224,15 @@
     }
 }
 
-extern "C"
-void x265_encoder_log(x265_encoder* enc, int argc, char **argv)
+void x265_encoder_log(x265_encoder* enc, int, char **)
 {
     if (enc)
     {
         Encoder *encoder = static_cast<Encoder*>(enc);
-        encoder->writeLog(argc, argv);
+        x265_log(encoder->m_param, X265_LOG_WARNING, "x265_encoder_log is now deprecated\n");
     }
 }
 
-extern "C"
 void x265_encoder_close(x265_encoder *enc)
 {
     if (enc)
@@ -210,7 +247,6 @@
     }
 }
 
-extern "C"
 void x265_cleanup(void)
 {
     if (!g_ctuSizeConfigured)
@@ -220,13 +256,11 @@
     }
 }
 
-extern "C"
 x265_picture *x265_picture_alloc()
 {
     return (x265_picture*)x265_malloc(sizeof(x265_picture));
 }
 
-extern "C"
 void x265_picture_init(x265_param *param, x265_picture *pic)
 {
     memset(pic, 0, sizeof(x265_picture));
@@ -245,7 +279,6 @@
     }
 }
 
-extern "C"
 void x265_picture_free(x265_picture *p)
 {
     return x265_free(p);
@@ -253,12 +286,24 @@
 
 static const x265_api libapi =
 {
-    &x265_param_alloc,
-    &x265_param_free,
-    &x265_param_default,
-    &x265_param_parse,
-    &x265_param_apply_profile,
-    &x265_param_default_preset,
+    X265_MAJOR_VERSION,
+    X265_BUILD,
+    sizeof(x265_param),
+    sizeof(x265_picture),
+    sizeof(x265_analysis_data),
+    sizeof(x265_zone),
+    sizeof(x265_stats),

x265_1.7.tar.gz/source/encoder/bitcost.cpp -> x265_1.8.tar.gz/source/encoder/bitcost.cpp Changed

x265_1.7.tar.gz/source/encoder/bitcost.h -> x265_1.8.tar.gz/source/encoder/bitcost.h Changed

x265_1.7.tar.gz/source/encoder/dpb.cpp -> x265_1.8.tar.gz/source/encoder/dpb.cpp Changed

x265_1.7.tar.gz/source/encoder/dpb.h -> x265_1.8.tar.gz/source/encoder/dpb.h Changed

x265_1.7.tar.gz/source/encoder/encoder.cpp -> x265_1.8.tar.gz/source/encoder/encoder.cpp Changed

@@ -39,21 +39,13 @@
 
 #include "x265.h"
 
-namespace x265 {
+namespace X265_NS {
 const char g_sliceTypeToChar[] = {'B', 'P', 'I'};
 }
 
-static const char* summaryCSVHeader =
-    "Command, Date/Time, Elapsed Time, FPS, Bitrate, "
-    "Y PSNR, U PSNR, V PSNR, Global PSNR, SSIM, SSIM (dB), "
-    "I count, I ave-QP, I kpbs, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
-    "P count, P ave-QP, P kpbs, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), "
-    "B count, B ave-QP, B kpbs, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
-    "Version\n";
-
 static const char* defaultAnalysisFileName = "x265_analysis.dat";
 
-using namespace x265;
+using namespace X265_NS;
 
 Encoder::Encoder()
 {
@@ -72,7 +64,6 @@
     m_exportedPic = NULL;
     m_numDelayedPic = 0;
     m_outputCount = 0;
-    m_csvfpt = NULL;
     m_param = NULL;
     m_latestParam = NULL;
     m_cuOffsetY = NULL;
@@ -103,7 +94,10 @@
 
     // Do not allow WPP if only one row or fewer than 3 columns, it is pointless and unstable
     if (rows == 1 || cols < 3)
+    {
+        x265_log(p, X265_LOG_WARNING, "Too few rows/columns, --wpp disabled\n");
         p->bEnableWavefront = 0;
+    }
 
     bool allowPools = !p->numaPools || strcmp(p->numaPools, "none");
 
@@ -149,6 +143,12 @@
         p->bEnableWavefront = p->bDistributeModeAnalysis = p->bDistributeMotionEstimation = p->lookaheadSlices = 0;
     }
 
+    if (!p->bEnableWavefront && p->rc.vbvBufferSize)
+    {
+        x265_log(p, X265_LOG_ERROR, "VBV requires wavefront parallelism\n");
+        m_aborted = true;
+    }
+
     char buf[128];
     int len = 0;
     if (p->bEnableWavefront)
@@ -214,43 +214,6 @@
     initSPS(&m_sps);
     initPPS(&m_pps);
 
-    /* Try to open CSV file handle */
-    if (m_param->csvfn)
-    {
-        m_csvfpt = fopen(m_param->csvfn, "r");
-        if (m_csvfpt)
-        {
-            /* file already exists, re-open for append */
-            fclose(m_csvfpt);
-            m_csvfpt = fopen(m_param->csvfn, "ab");
-        }
-        else
-        {
-            /* new CSV file, write header */
-            m_csvfpt = fopen(m_param->csvfn, "wb");
-            if (m_csvfpt)
-            {
-                if (m_param->logLevel >= X265_LOG_FRAME)
-                {
-                    fprintf(m_csvfpt, "Encode Order, Type, POC, QP, Bits, ");
-                    if (m_param->rc.rateControlMode == X265_RC_CRF)
-                        fprintf(m_csvfpt, "RateFactor, ");
-                    fprintf(m_csvfpt, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB),  List 0, List 1");
-                    /* detailed performance statistics */
-                    fprintf(m_csvfpt, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks\n");
-                }
-                else
-                    fputs(summaryCSVHeader, m_csvfpt);
-            }
-        }
-
-        if (!m_csvfpt)
-        {
-            x265_log(m_param, X265_LOG_ERROR, "Unable to open CSV log file <%s>, aborting\n", m_param->csvfn);
-            m_aborted = true;
-        }
-    }
-
     int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
     int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
     for (int i = 0; i < m_param->frameNumThreads; i++)
@@ -362,8 +325,6 @@
 
     if (m_analysisFile)
         fclose(m_analysisFile);
-    if (m_csvfpt)
-        fclose(m_csvfpt);
 
     if (m_param)
     {
@@ -372,15 +333,14 @@
         free((char*)m_param->rc.statFileName);
         free((char*)m_param->analysisFileName);
         free((char*)m_param->scalingLists);
-        free((char*)m_param->csvfn);
         free((char*)m_param->numaPools);
         free((char*)m_param->masteringDisplayColorVolume);
         free((char*)m_param->contentLightLevelInfo);
 
-        x265_param_free(m_param);
+        PARAM_NS::x265_param_free(m_param);
     }
 
-    x265_param_free(m_latestParam);
+    PARAM_NS::x265_param_free(m_latestParam);
 }
 
 void Encoder::updateVbvPlan(RateControl* rc)
@@ -570,6 +530,7 @@
         if (outFrame)
         {
             Slice *slice = outFrame->m_encData->m_slice;
+            x265_frame_stats* frameData = NULL;
 
             /* Free up pic_in->analysisData since it has already been used */
             if (m_param->analysisMode == X265_ANALYSIS_LOAD)
@@ -582,6 +543,7 @@
                 pic_out->bitDepth = X265_DEPTH;
                 pic_out->userData = outFrame->m_userData;
                 pic_out->colorSpace = m_param->internalCsp;
+                frameData = &(pic_out->frameData);
 
                 pic_out->pts = outFrame->m_pts;
                 pic_out->dts = outFrame->m_dts;
@@ -648,7 +610,12 @@
             if (m_aborted)
                 return -1;
 
-            finishFrameStats(outFrame, curEncoder, curEncoder->m_accessUnitBits);
+            finishFrameStats(outFrame, curEncoder, curEncoder->m_accessUnitBits, frameData);
+
+            /* Write RateControl Frame level stats in multipass encodes */
+            if (m_param->rc.bStatWrite)
+                if (m_rateControl->writeRateControlFrameStats(outFrame, &curEncoder->m_rce))
+                    m_aborted = true;
 
             /* Allow this frame to be recycled if no frame encoders are using it for reference */
             if (!pic_out)
@@ -729,7 +696,7 @@
                 m_aborted = true;
         }
         else if (m_encodedFrameNum)
-            m_rateControl->setFinalFrameCount(m_encodedFrameNum);
+            m_rateControl->setFinalFrameCount(m_encodedFrameNum); 
     }
     while (m_bZeroLatency && ++pass < 2);
 
@@ -787,38 +754,6 @@
     m_totalQp += aveQp;
 }
 
-char* Encoder::statsCSVString(EncStats& stat, char* buffer)
-{
-    if (!stat.m_numPics)
-    {
-        sprintf(buffer, "-, -, -, -, -, -, -, ");
-        return buffer;
-    }
-
-    double fps = (double)m_param->fpsNum / m_param->fpsDenom;
-    double scale = fps / 1000 / (double)stat.m_numPics;
-
-    int len = sprintf(buffer, "%-6u, ", stat.m_numPics);
-
-    len += sprintf(buffer + len, "%2.2lf, ", stat.m_totalQp / (double)stat.m_numPics);
-    len += sprintf(buffer + len, "%-8.2lf, ", stat.m_accBits * scale);
-    if (m_param->bEnablePsnr)
-    {
-        len += sprintf(buffer + len, "%.3lf, %.3lf, %.3lf, ",
-                       stat.m_psnrSumY / (double)stat.m_numPics,
-                       stat.m_psnrSumU / (double)stat.m_numPics,
-                       stat.m_psnrSumV / (double)stat.m_numPics);
-    }
-    else
-        len += sprintf(buffer + len, "-, -, -, ");
-
-    if (m_param->bEnableSsim)
-        sprintf(buffer + len, "%.3lf, ", x265_ssim2dB(stat.m_globalSsim / (double)stat.m_numPics));
-    else
-        sprintf(buffer + len, "-, ");
-    return buffer;

x265_1.7.tar.gz/source/encoder/encoder.h -> x265_1.8.tar.gz/source/encoder/encoder.h Changed

@@ -32,7 +32,7 @@
 
 struct x265_encoder {};
 
-namespace x265 {
+namespace X265_NS {
 // private namespace
 extern const char g_sliceTypeToChar[3];
 
@@ -105,7 +105,6 @@
     EncStats           m_analyzeI;
     EncStats           m_analyzeP;
     EncStats           m_analyzeB;
-    FILE*              m_csvfpt;
     int64_t            m_encodeStartTime;
 
     // weighted prediction
@@ -149,14 +148,10 @@
 
     void fetchStats(x265_stats* stats, size_t statsSizeBytes);
 
-    void writeLog(int argc, char **argv);
-
     void printSummary();
 
     char* statsString(EncStats&, char*);
 
-    char* statsCSVString(EncStats& stat, char* buffer);
-
     void configure(x265_param *param);
 
     void updateVbvPlan(RateControl* rc);
@@ -169,7 +164,7 @@
 
     void writeAnalysisFile(x265_analysis_data* pic);
 
-    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits);
+    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits, x265_frame_stats* frameStats);
 
 protected:

x265_1.7.tar.gz/source/encoder/entropy.cpp -> x265_1.8.tar.gz/source/encoder/entropy.cpp Changed

@@ -35,9 +35,7 @@
 #define CU_DQP_EG_k    0 // exp-golomb order
 #define START_VALUE    8 // start value for dpcm mode
 
-static const uint32_t g_puOffset[8] = { 0, 8, 4, 4, 2, 10, 1, 5 };
-
-namespace x265 {
+namespace X265_NS {
 
 Entropy::Entropy()
 {
@@ -216,7 +214,7 @@
         WRITE_FLAG(csp == X265_CSP_I420 || csp == X265_CSP_I400,                         "general_max_420chroma_constraint_flag");
         WRITE_FLAG(csp == X265_CSP_I400,                                                 "general_max_monochrome_constraint_flag");
         WRITE_FLAG(ptl.intraConstraintFlag,        "general_intra_constraint_flag");
-        WRITE_FLAG(0,                              "general_one_picture_only_constraint_flag");
+        WRITE_FLAG(ptl.onePictureOnlyConstraintFlag,"general_one_picture_only_constraint_flag");
         WRITE_FLAG(ptl.lowerBitRateConstraintFlag, "general_lower_bit_rate_constraint_flag");
         WRITE_CODE(0 , 16, "XXX_reserved_zero_35bits[0..15]");
         WRITE_CODE(0 , 16, "XXX_reserved_zero_35bits[16..31]");
@@ -862,12 +860,9 @@
 void Entropy::codePUWise(const CUData& cu, uint32_t absPartIdx)
 {
     X265_CHECK(!cu.isIntra(absPartIdx), "intra block not expected\n");
-    PartSize partSize = (PartSize)cu.m_partSize[absPartIdx];
-    uint32_t numPU = (partSize == SIZE_2Nx2N ? 1 : (partSize == SIZE_NxN ? 4 : 2));
-    uint32_t depth = cu.m_cuDepth[absPartIdx];
-    uint32_t puOffset = (g_puOffset[uint32_t(partSize)] << (g_unitSizeDepth - depth) * 2) >> 4;
+    uint32_t numPU = cu.getNumPartInter(absPartIdx);
 
-    for (uint32_t puIdx = 0, subPartIdx = absPartIdx; puIdx < numPU; puIdx++, subPartIdx += puOffset)
+    for (uint32_t puIdx = 0, subPartIdx = absPartIdx; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, absPartIdx))
     {
         codeMergeFlag(cu, subPartIdx);
         if (cu.m_mergeFlag[subPartIdx])
@@ -1433,6 +1428,55 @@
         encodeBin(cu.getCbf(absPartIdx, ttype, lowestTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
 }
 
+#if CHECKED_BUILD || _DEBUG
+uint32_t costCoeffRemain_c0(uint16_t *absCoeff, int numNonZero)
+{
+    uint32_t goRiceParam = 0;
+    int firstCoeff2 = 1;
+    uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
+
+    uint32_t sum = 0;
+    int idx = 0;
+    do
+    {
+        int baseLevel = (baseLevelN & 3) | firstCoeff2;
+        X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
+        baseLevelN >>= 2;
+        int codeNumber = absCoeff[idx] - baseLevel;
+
+        if (codeNumber >= 0)
+        {
+            //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);
+            uint32_t length = 0;
+
+            codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
+            if (codeNumber >= 0)
+            {
+                {
+                    unsigned long cidx;
+                    CLZ(cidx, codeNumber + 1);
+                    length = cidx;
+                }
+                X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
+
+                codeNumber = (length + length);
+            }
+            sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
+
+            if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
+                goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
+            X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
+        }
+        if (absCoeff[idx] >= 2)
+            firstCoeff2 = 0;
+        idx++;
+    }
+    while(idx < numNonZero);
+
+    return sum;
+}
+#endif // debug only code
+
 void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype)
 {
     uint32_t trSize = 1 << log2TrSize;
@@ -1440,7 +1484,7 @@
     // compute number of significant coefficients
     uint32_t numSig = primitives.cu[log2TrSize - 2].count_nonzero(coeff);
     X265_CHECK(numSig > 0, "cbf check fail\n");
-    bool bHideFirstSign = cu.m_slice->m_pps->bSignHideEnabled && !tqBypass;
+    bool bHideFirstSign = cu.m_slice->m_pps->bSignHideEnabled & !tqBypass;
 
     if (log2TrSize <= MAX_LOG2_TS_SIZE && !tqBypass && cu.m_slice->m_pps->bTransformSkipEnabled)
         codeTransformSkipFlags(cu.m_transformSkip[ttype][absPartIdx], ttype);
@@ -1489,9 +1533,11 @@
         if (codingParameters.scanType == SCAN_VER)
             std::swap(pos[0], pos[1]);
 
-        int ctxIdx = bIsLuma ? (3 * (log2TrSize - 2) + ((log2TrSize - 1) >> 2)) : NUM_CTX_LAST_FLAG_XY_LUMA;
-        int ctxShift = bIsLuma ? ((log2TrSize + 1) >> 2) : log2TrSize - 2;
+        int ctxIdx = bIsLuma ? (3 * (log2TrSize - 2) + (log2TrSize == 5)) : NUM_CTX_LAST_FLAG_XY_LUMA;
+        int ctxShift = (bIsLuma ? (log2TrSize > 2) : (log2TrSize - 2));
         uint32_t maxGroupIdx = (log2TrSize << 1) - 1;
+        X265_CHECK(((log2TrSize - 1) >> 2) == (uint32_t)(log2TrSize == 5), "ctxIdx check failure\n");
+        X265_CHECK((uint32_t)ctxShift == (bIsLuma ? ((log2TrSize + 1) >> 2) : log2TrSize - 2), "ctxShift check failure\n");
 
         uint8_t *ctx = &m_contextState[OFF_CTX_LAST_FLAG_X];
         for (uint32_t i = 0; i < 2; i++, ctxIdx += NUM_CTX_LAST_FLAG_XY)
@@ -1519,12 +1565,12 @@
     uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
     uint32_t c1 = 1;
     int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
-    int absCoeff[1 << MLS_CG_SIZE];
-    int numNonZero = 1;
+    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
+    uint32_t numNonZero = 1;
     unsigned long lastNZPosInCG;
     unsigned long firstNZPosInCG;
 
-    absCoeff[0] = int(abs(coeff[posLast]));
+    absCoeff[0] = (uint16_t)abs(coeff[posLast]);
 
     for (int subSet = lastScanSet; subSet >= 0; subSet--)
     {
@@ -1540,7 +1586,7 @@
 
         // encode significant_coeffgroup_flag
         const int cgBlkPos = codingParameters.scanCG[subSet];
-        const int cgPosY   = cgBlkPos >> (log2TrSize - MLS_CG_LOG2_SIZE);
+        const int cgPosY   = (uint32_t)cgBlkPos >> (log2TrSize - MLS_CG_LOG2_SIZE);
         const int cgPosX   = cgBlkPos & ((1 << (log2TrSize - MLS_CG_LOG2_SIZE)) - 1);
         const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
 
@@ -1554,21 +1600,14 @@
         }
 
         // encode significant_coeff_flag
-        if (sigCoeffGroupFlag64 & cgBlkPosMask)
+        if ((scanPosSigOff >= 0) && (sigCoeffGroupFlag64 & cgBlkPosMask))
         {
             X265_CHECK((log2TrSize != 2) || (log2TrSize == 2 && subSet == 0), "log2TrSize and subSet mistake!\n");
             const int patternSigCtx = Quant::calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, (trSize >> MLS_CG_LOG2_SIZE));
             const uint32_t posOffset = (bIsLuma && subSet) ? 3 : 0;
 
-            static const uint8_t ctxIndMap4x4[16] =
-            {
-                0, 1, 4, 5,
-                2, 3, 4, 5,
-                6, 6, 8, 8,
-                7, 7, 8, 8
-            };
             // NOTE: [patternSigCtx][posXinSubset][posYinSubset]
-            static const uint8_t table_cnt[4][SCAN_SET_SIZE] =
+            static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
             {
                 // patternSigCtx = 0
                 {
@@ -1597,50 +1636,61 @@
                     2, 2, 2, 2,
                     2, 2, 2, 2,
                     2, 2, 2, 2,
+                },
+                // 4x4
+                {
+                    0, 1, 4, 5,
+                    2, 3, 4, 5,
+                    6, 6, 8, 8,
+                    7, 7, 8, 8
                 }
             };
 
             const int offset = codingParameters.firstSignificanceMapContext;
-            ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
-            // TODO: accelerate by PABSW
             const uint32_t blkPosBase  = codingParameters.scan[subPosBase];
-            for (int i = 0; i < MLS_CG_SIZE; i++)
-            {
-                tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);
-                tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);
-                tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);
-                tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);
-            }
 
+            X265_CHECK(scanPosSigOff >= 0, "scanPosSigOff check failure\n");
             if (m_bitIf)
             {
+                ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
+
+                // TODO: accelerate by PABSW
+                for (int i = 0; i < MLS_CG_SIZE; i++)
+                {
+                    tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);
+                    tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);
+                    tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);

x265_1.7.tar.gz/source/encoder/entropy.h -> x265_1.8.tar.gz/source/encoder/entropy.h Changed

x265_1.7.tar.gz/source/encoder/frameencoder.cpp -> x265_1.8.tar.gz/source/encoder/frameencoder.cpp Changed

@@ -35,7 +35,7 @@
 #include "slicetype.h"
 #include "nal.h"
 
-namespace x265 {
+namespace X265_NS {
 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
 
 FrameEncoder::FrameEncoder()
@@ -59,7 +59,6 @@
     m_cuGeoms = NULL;
     m_ctuGeomMap = NULL;
     m_localTldIdx = 0;
-    memset(&m_frameStats, 0, sizeof(m_frameStats));
     memset(&m_rce, 0, sizeof(RateControlEntry));
 }
 
@@ -313,7 +312,7 @@
     m_SSDY = m_SSDU = m_SSDV = 0;
     m_ssim = 0;
     m_ssimCnt = 0;
-    memset(&m_frameStats, 0, sizeof(m_frameStats));
+    memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
 
     /* Emit access unit delimiter unless this is the first frame and the user is
      * not repeating headers (since AUD is supposed to be the first NAL in the access
@@ -419,25 +418,6 @@
 
             m_top->m_lastBPSEI = m_rce.encodeOrder;
         }
-
-        // The recovery point SEI message assists a decoder in determining when the decoding
-        // process will produce acceptable pictures for display after the decoder initiates
-        // random access. The m_recoveryPocCnt is in units of POC(picture order count) which
-        // means pictures encoded after the CRA but precede it in display order(leading) are
-        // implicitly discarded after a random access seek regardless of the value of
-        // m_recoveryPocCnt. Our encoder does not use references prior to the most recent CRA,
-        // so all pictures following the CRA in POC order are guaranteed to be displayable,
-        // so m_recoveryPocCnt is always 0.
-        SEIRecoveryPoint sei_recovery_point;
-        sei_recovery_point.m_recoveryPocCnt = 0;
-        sei_recovery_point.m_exactMatchingFlag = true;
-        sei_recovery_point.m_brokenLinkFlag = false;
-
-        m_bs.resetBits();
-        sei_recovery_point.write(m_bs, *slice->m_sps);
-        m_bs.writeByteAlignment();
-
-        m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
     }
 
     if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
@@ -475,6 +455,19 @@
         m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
     }
 
+    /* CQP and CRF (without capped VBV) do not use mid-frame statistics to
+     * tune RateControl parameters for other frames.
+     * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
+     * RateControlEnd here, after the slice contexts are initialized. For the rest (ABR
+     * and VBV), unlock only after rateControlUpdateStats of this frame is called */
+    if (m_param->rc.rateControlMode != X265_RC_ABR && !m_top->m_rateControl->m_isVbv)
+    {
+        m_top->m_rateControl->m_startEndOrder.incr();
+
+        if (m_rce.encodeOrder < m_param->frameNumThreads - 1)
+            m_top->m_rateControl->m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames
+    }
+
     /* Analyze CTU rows, most of the hard work is done here.  Frame is
      * compressed in a wave-front pattern if WPP is enabled. Row based loop
      * filters runs behind the CTU compression and reconstruction */
@@ -559,17 +552,56 @@
         // accumulate intra,inter,skip cu count per frame for 2 pass
         for (uint32_t i = 0; i < m_numRows; i++)
         {
-            m_frameStats.mvBits    += m_rows[i].rowStats.mvBits;
-            m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits;
-            m_frameStats.miscBits  += m_rows[i].rowStats.miscBits;
-            totalI                 += m_rows[i].rowStats.iCuCnt;
-            totalP                 += m_rows[i].rowStats.pCuCnt;
-            totalSkip              += m_rows[i].rowStats.skipCuCnt;
+            m_frame->m_encData->m_frameStats.mvBits    += m_rows[i].rowStats.mvBits;
+            m_frame->m_encData->m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits;
+            m_frame->m_encData->m_frameStats.miscBits  += m_rows[i].rowStats.miscBits;
+            totalI                                     += m_rows[i].rowStats.intra8x8Cnt;
+            totalP                                     += m_rows[i].rowStats.inter8x8Cnt;
+            totalSkip                                  += m_rows[i].rowStats.skip8x8Cnt;
         }
         int totalCuCount = totalI + totalP + totalSkip;
-        m_frameStats.percentIntra = (double)totalI / totalCuCount;
-        m_frameStats.percentInter = (double)totalP / totalCuCount;
-        m_frameStats.percentSkip  = (double)totalSkip / totalCuCount;
+        m_frame->m_encData->m_frameStats.percent8x8Intra = (double)totalI / totalCuCount;
+        m_frame->m_encData->m_frameStats.percent8x8Inter = (double)totalP / totalCuCount;
+        m_frame->m_encData->m_frameStats.percent8x8Skip  = (double)totalSkip / totalCuCount;
+    }
+    for (uint32_t i = 0; i < m_numRows; i++)
+    {
+        m_frame->m_encData->m_frameStats.cntIntraNxN      += m_rows[i].rowStats.cntIntraNxN;
+        m_frame->m_encData->m_frameStats.totalCu          += m_rows[i].rowStats.totalCu;
+        m_frame->m_encData->m_frameStats.totalCtu         += m_rows[i].rowStats.totalCtu;
+        m_frame->m_encData->m_frameStats.lumaDistortion   += m_rows[i].rowStats.lumaDistortion;
+        m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion;
+        m_frame->m_encData->m_frameStats.psyEnergy        += m_rows[i].rowStats.psyEnergy;
+        m_frame->m_encData->m_frameStats.lumaLevel        += m_rows[i].rowStats.lumaLevel;
+
+        if (m_rows[i].rowStats.maxLumaLevel > m_frame->m_encData->m_frameStats.maxLumaLevel)
+            m_frame->m_encData->m_frameStats.maxLumaLevel = m_rows[i].rowStats.maxLumaLevel;
+        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+        {
+            m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth];
+            m_frame->m_encData->m_frameStats.cntMergeCu[depth] += m_rows[i].rowStats.cntMergeCu[depth];
+            for (int m = 0; m < INTER_MODES; m++)
+                m_frame->m_encData->m_frameStats.cuInterDistribution[depth][m] += m_rows[i].rowStats.cuInterDistribution[depth][m];
+            for (int n = 0; n < INTRA_MODES; n++)
+                m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] += m_rows[i].rowStats.cuIntraDistribution[depth][n];
+        }
+    }
+    m_frame->m_encData->m_frameStats.avgLumaDistortion   = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
+    m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
+    m_frame->m_encData->m_frameStats.avgPsyEnergy        = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
+    m_frame->m_encData->m_frameStats.avgLumaLevel        = m_frame->m_encData->m_frameStats.lumaLevel / m_frame->m_encData->m_frameStats.totalCtu;
+    m_frame->m_encData->m_frameStats.percentIntraNxN     = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+    {
+        m_frame->m_encData->m_frameStats.percentSkipCu[depth]  = (double)(m_frame->m_encData->m_frameStats.cntSkipCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
+        m_frame->m_encData->m_frameStats.percentMergeCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntMergeCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
+        for (int n = 0; n < INTRA_MODES; n++)
+            m_frame->m_encData->m_frameStats.percentIntraDistribution[depth][n] = (double)(m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] * 100) / m_frame->m_encData->m_frameStats.totalCu;
+        uint64_t cuInterRectCnt = 0; // sum of Nx2N, 2NxN counts
+        cuInterRectCnt += m_frame->m_encData->m_frameStats.cuInterDistribution[depth][1] + m_frame->m_encData->m_frameStats.cuInterDistribution[depth][2];
+        m_frame->m_encData->m_frameStats.percentInterDistribution[depth][0] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][0] * 100) / m_frame->m_encData->m_frameStats.totalCu;
+        m_frame->m_encData->m_frameStats.percentInterDistribution[depth][1] = (double)(cuInterRectCnt * 100) / m_frame->m_encData->m_frameStats.totalCu;
+        m_frame->m_encData->m_frameStats.percentInterDistribution[depth][2] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][3] * 100) / m_frame->m_encData->m_frameStats.totalCu;
     }
 
     m_bs.resetBits();
@@ -638,7 +670,7 @@
     m_endCompressTime = x265_mdate();
 
     /* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */
-    if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
+    if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce) < 0)
         m_top->m_aborted = true;
 
     /* Decrement referenced frame reference counts, allow them to be recycled */
@@ -826,13 +858,6 @@
     const uint32_t lineStartCUAddr = row * numCols;
     bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
 
-    /* These store the count of inter, intra and skip cus within quad tree structure of each CTU */
-    uint32_t qTreeInterCnt[NUM_CU_DEPTH];
-    uint32_t qTreeIntraCnt[NUM_CU_DEPTH];
-    uint32_t qTreeSkipCnt[NUM_CU_DEPTH];
-    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-        qTreeIntraCnt[depth] = qTreeInterCnt[depth] = qTreeSkipCnt[depth] = 0;
-
     while (curRow.completed < numCols)
     {
         ProfileScopeEvent(encodeCTU);
@@ -904,30 +929,57 @@
         // Completed CU processing
         curRow.completed++;
 
-        if (m_param->bLogCuStats || m_param->rc.bStatWrite)
-            curEncData.m_rowStat[row].sumQpAq += collectCTUStatistics(*ctu, qTreeInterCnt, qTreeIntraCnt, qTreeSkipCnt);
-        else if (m_param->rc.aqMode)
-            curEncData.m_rowStat[row].sumQpAq += calcCTUQP(*ctu);
+        FrameStats frameLog;
+        curEncData.m_rowStat[row].sumQpAq += collectCTUStatistics(*ctu, &frameLog);
 
         // copy no. of intra, inter Cu cnt per row into frame stats for 2 pass
         if (m_param->rc.bStatWrite)
         {
-            curRow.rowStats.mvBits += best.mvBits;
+            curRow.rowStats.mvBits    += best.mvBits;
             curRow.rowStats.coeffBits += best.coeffBits;
-            curRow.rowStats.miscBits += best.totalBits - (best.mvBits + best.coeffBits);
+            curRow.rowStats.miscBits  += best.totalBits - (best.mvBits + best.coeffBits);
 
             for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
             {
                 /* 1 << shift == number of 8x8 blocks at current depth */
                 int shift = 2 * (g_maxCUDepth - depth);
-                curRow.rowStats.iCuCnt += qTreeIntraCnt[depth] << shift;
-                curRow.rowStats.pCuCnt += qTreeInterCnt[depth] << shift;
-                curRow.rowStats.skipCuCnt += qTreeSkipCnt[depth] << shift;
+                int cuSize = g_maxCUSize >> depth;
 
-                // clear the row cu data from thread local object
-                qTreeIntraCnt[depth] = qTreeInterCnt[depth] = qTreeSkipCnt[depth] = 0;
+                if (cuSize == 8)
+                    curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN);
+                else
+                    curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] << shift);
+
+                curRow.rowStats.inter8x8Cnt += (int)(frameLog.cntInter[depth] << shift);
+                curRow.rowStats.skip8x8Cnt += (int)((frameLog.cntSkipCu[depth] + frameLog.cntMergeCu[depth]) << shift);
             }

x265_1.7.tar.gz/source/encoder/frameencoder.h -> x265_1.8.tar.gz/source/encoder/frameencoder.h Changed

@@ -41,7 +41,7 @@
 #include "reference.h"
 #include "nal.h"
 
-namespace x265 {
+namespace X265_NS {
 // private x265 namespace
 
 class ThreadPool;
@@ -49,8 +49,6 @@
 
 #define ANGULAR_MODE_ID 2
 #define AMP_ID 3
-#define INTER_MODES 4
-#define INTRA_MODES 3
 
 struct StatisticLog
 {
@@ -156,8 +154,6 @@
     MD5Context               m_state[3];
     uint32_t                 m_crc[3];
     uint32_t                 m_checksum[3];
-    StatisticLog             m_sliceTypeLog[3];     // per-slice type CU statistics
-    FrameStats               m_frameStats;          // stats of current frame for multi-pass encodes
 
     volatile int             m_activeWorkerCount;        // count of workers currently encoding or filtering CTUs
     volatile int             m_totalActiveWorkerCount;   // sum of m_activeWorkerCount sampled at end of each CTU
@@ -221,8 +217,7 @@
     void encodeSlice();
 
     void threadMain();
-    int  collectCTUStatistics(const CUData& ctu, uint32_t* qtreeInterCnt, uint32_t* qtreeIntraCnt, uint32_t* qtreeSkipCnt);
-    int  calcCTUQP(const CUData& ctu);
+    int  collectCTUStatistics(const CUData& ctu, FrameStats* frameLog);
     void noiseReductionUpdate();
 
     /* Called by WaveFront::findJob() */

x265_1.7.tar.gz/source/encoder/framefilter.cpp -> x265_1.8.tar.gz/source/encoder/framefilter.cpp Changed

x265_1.7.tar.gz/source/encoder/framefilter.h -> x265_1.8.tar.gz/source/encoder/framefilter.h Changed

x265_1.7.tar.gz/source/encoder/level.cpp -> x265_1.8.tar.gz/source/encoder/level.cpp Changed

@@ -25,7 +25,7 @@
 #include "slice.h"
 #include "level.h"
 
-namespace x265 {
+namespace X265_NS {
 typedef struct
 {
     uint32_t maxLumaSamples;
@@ -61,18 +61,37 @@
 /* determine minimum decoder level required to decode the described video */
 void determineLevel(const x265_param &param, VPS& vps)
 {
+    vps.ptl.onePictureOnlyConstraintFlag = param.totalFrames == 1;
+    vps.ptl.intraConstraintFlag = param.keyframeMax <= 1 || vps.ptl.onePictureOnlyConstraintFlag;
+    vps.ptl.bitDepthConstraint = param.internalBitDepth;
+    vps.ptl.chromaFormatConstraint = param.internalCsp;
+
+    /* TODO: figure out HighThroughput signaling, aka: HbrFactor in section A.4.2, only available
+     * for intra-only profiles (vps.ptl.intraConstraintFlag) */
+    vps.ptl.lowerBitRateConstraintFlag = true;
+
     vps.maxTempSubLayers = param.bEnableTemporalSubLayers ? 2 : 1;
-    if (param.internalCsp == X265_CSP_I420)
+    
+    if (param.internalCsp == X265_CSP_I420 && param.internalBitDepth <= 10)
     {
-        if (param.internalBitDepth == 8)
+        /* Probably an HEVC v1 profile, but must check to be sure */
+        if (param.internalBitDepth <= 8)
         {
-            if (param.keyframeMax == 1 && param.maxNumReferences == 1)
+            if (vps.ptl.onePictureOnlyConstraintFlag)
                 vps.ptl.profileIdc = Profile::MAINSTILLPICTURE;
+            else if (vps.ptl.intraConstraintFlag)
+                vps.ptl.profileIdc = Profile::MAINREXT; /* Main Intra */
             else 
                 vps.ptl.profileIdc = Profile::MAIN;
         }
-        else if (param.internalBitDepth == 10)
-            vps.ptl.profileIdc = Profile::MAIN10;
+        else if (param.internalBitDepth <= 10)
+        {
+            /* note there is no 10bit still picture profile */
+            if (vps.ptl.intraConstraintFlag)
+                vps.ptl.profileIdc = Profile::MAINREXT; /* Main10 Intra */
+            else
+                vps.ptl.profileIdc = Profile::MAIN10;
+        }
     }
     else
         vps.ptl.profileIdc = Profile::MAINREXT;
@@ -162,17 +181,19 @@
             return;
         }
 
-#define CHECK_RANGE(value, main, high) (value > main && value <= high)
+#define CHECK_RANGE(value, main, high) (high != MAX_UINT && value > main && value <= high)
 
-        if (CHECK_RANGE(bitrate, levels[i].maxBitrateMain, levels[i].maxBitrateHigh) &&
-            CHECK_RANGE((uint32_t)param.rc.vbvBufferSize, levels[i].maxCpbSizeMain, levels[i].maxCpbSizeHigh) &&
-            levels[i].maxBitrateHigh != MAX_UINT)
+        if (CHECK_RANGE(bitrate, levels[i].maxBitrateMain, levels[i].maxBitrateHigh) ||
+            CHECK_RANGE((uint32_t)param.rc.vbvBufferSize, levels[i].maxCpbSizeMain, levels[i].maxCpbSizeHigh))
         {
-            /* If the user has not enabled high tier, continue looking to see if we can encode at a higher level, main tier */
-            if (!param.bHighTier && (levels[i].levelIdc < param.levelIdc))
-                continue;
-            else
+            /* The bitrate or buffer size is out of range for Main tier, but in
+             * range for High tier. If the user requested High tier then give
+             * them High tier at this level. Otherwise allow the loop to
+             * progress to the Main tier of the next level */
+            if (param.bHighTier)
                 vps.ptl.tierFlag = Level::HIGH;
+            else
+                continue;
         }
         else
             vps.ptl.tierFlag = Level::MAIN;
@@ -184,29 +205,68 @@
         break;
     }
 
-    vps.ptl.intraConstraintFlag = false;
-    vps.ptl.lowerBitRateConstraintFlag = true;
-    vps.ptl.bitDepthConstraint = param.internalBitDepth;
-    vps.ptl.chromaFormatConstraint = param.internalCsp;
-    
     static const char *profiles[] = { "None", "Main", "Main 10", "Main Still Picture", "RExt" };
     static const char *tiers[]    = { "Main", "High" };
 
-    const char *profile = profiles[vps.ptl.profileIdc];
+    char profbuf[64];
+    strcpy(profbuf, profiles[vps.ptl.profileIdc]);
+
+    bool bStillPicture = false;
     if (vps.ptl.profileIdc == Profile::MAINREXT)
     {
-        if (param.internalCsp == X265_CSP_I422)
-            profile = "Main 4:2:2 10";
-        if (param.internalCsp == X265_CSP_I444)
+        if (vps.ptl.bitDepthConstraint > 12 && vps.ptl.intraConstraintFlag)
+        {
+            if (vps.ptl.onePictureOnlyConstraintFlag)
+            {
+                strcpy(profbuf, "Main 4:4:4 16 Still Picture");
+                bStillPicture = true;
+            }
+            else
+                strcpy(profbuf, "Main 4:4:4 16");
+        }
+        else if (param.internalCsp == X265_CSP_I420)
+        {
+            X265_CHECK(vps.ptl.intraConstraintFlag || vps.ptl.bitDepthConstraint > 10, "rext fail\n");
+            if (vps.ptl.bitDepthConstraint <= 8)
+                strcpy(profbuf, "Main");
+            else if (vps.ptl.bitDepthConstraint <= 10)
+                strcpy(profbuf, "Main 10");
+            else if (vps.ptl.bitDepthConstraint <= 12)
+                strcpy(profbuf, "Main 12");
+        }
+        else if (param.internalCsp == X265_CSP_I422)
+        {
+            /* there is no 8-bit Main 4:2:2 profile, so bit depths up to 10 are reported as Main 4:2:2 10 */
+            if (param.internalBitDepth <= 10)
+                strcpy(profbuf, "Main 4:2:2 10");
+            else if (vps.ptl.bitDepthConstraint <= 12)
+                strcpy(profbuf, "Main 4:2:2 12");
+        }
+        else if (param.internalCsp == X265_CSP_I444)
         {
             if (vps.ptl.bitDepthConstraint <= 8)
-                profile = "Main 4:4:4 8";
+            {
+                if (vps.ptl.onePictureOnlyConstraintFlag)
+                {
+                    strcpy(profbuf, "Main 4:4:4 Still Picture");
+                    bStillPicture = true;
+                }
+                else
+                    strcpy(profbuf, "Main 4:4:4");
+            }
             else if (vps.ptl.bitDepthConstraint <= 10)
-                profile = "Main 4:4:4 10";
+                strcpy(profbuf, "Main 4:4:4 10");
+            else if (vps.ptl.bitDepthConstraint <= 12)
+                strcpy(profbuf, "Main 4:4:4 12");
         }
+        else
+            strcpy(profbuf, "Unknown");
+
+        if (vps.ptl.intraConstraintFlag && !bStillPicture)
+            strcat(profbuf, " Intra");
     }
     x265_log(&param, X265_LOG_INFO, "%s profile, Level-%s (%s tier)\n",
-             profile, levels[i].name, tiers[vps.ptl.tierFlag]);
+             profbuf, levels[i].name, tiers[vps.ptl.tierFlag]);
 }
 
 /* enforce a maximum decoder level requirement, in other words assure that a
@@ -340,80 +400,88 @@
 
     return true;
 }
+}
+
+#if EXPORT_C_API
+
+/* these functions are exported as C functions (default) */
+using namespace X265_NS;
+extern "C" {
+
+#else
+
+/* these functions exist within private namespace (multilib) */
+namespace X265_NS {
+
+#endif
 
-extern "C"
 int x265_param_apply_profile(x265_param *param, const char *profile)
 {
     if (!param || !profile)
         return 0;
 
-#if HIGH_BIT_DEPTH
-    if (!strcmp(profile, "main") || !strcmp(profile, "mainstillpicture") || !strcmp(profile, "msp") || !strcmp(profile, "main444-8"))
-    {
-        x265_log(param, X265_LOG_ERROR, "%s profile not supported, compiled for Main10.\n", profile);
-        return -1;
-    }
-#else
-    if (!strcmp(profile, "main10") || !strcmp(profile, "main422-10") || !strcmp(profile, "main444-10"))
-    {
-        x265_log(param, X265_LOG_ERROR, "%s profile not supported, compiled for Main.\n", profile);
-        return -1;
-    }
+    /* Check if profile bit-depth requirement is exceeded by internal bit depth */
+    bool bInvalidDepth = false;

x265_1.7.tar.gz/source/encoder/level.h -> x265_1.8.tar.gz/source/encoder/level.h Changed

x265_1.7.tar.gz/source/encoder/motion.cpp -> x265_1.8.tar.gz/source/encoder/motion.cpp Changed

@@ -31,7 +31,7 @@
 #pragma warning(disable: 4127) // conditional  expression is constant (macros use this construct)
 #endif
 
-using namespace x265;
+using namespace X265_NS;
 
 namespace {
 
@@ -56,7 +56,7 @@
     { 2, 8, 2, 8, true },  // 2x8 SATD HPEL + 2x8 SATD QPEL
 };
 
-int sizeScale[NUM_PU_SIZES];
+static int sizeScale[NUM_PU_SIZES];
 #define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))
 
 /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
@@ -234,14 +234,9 @@
                pix_base + (m1x) + (m1y) * stride, \
                pix_base + (m2x) + (m2y) * stride, \
                stride, costs); \
-        const uint16_t *base_mvx = &m_cost_mvx[(bmv.x + (m0x)) << 2]; \
-        const uint16_t *base_mvy = &m_cost_mvy[(bmv.y + (m0y)) << 2]; \
-        X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]), "mvcost() check failure\n"); \
-        (costs)[0] += (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]); \
-        (costs)[1] += (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]); \
-        (costs)[2] += (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]); \
+        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
+        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
+        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
     }
 
 #define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
@@ -271,16 +266,10 @@
                pix_base + (m2x) + (m2y) * stride, \
                pix_base + (m3x) + (m3y) * stride, \
                stride, costs); \
-        const uint16_t *base_mvx = &m_cost_mvx[(omv.x << 2)]; \
-        const uint16_t *base_mvy = &m_cost_mvy[(omv.y << 2)]; \
-        X265_CHECK(mvcost((omv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((omv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((omv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((omv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \
-        costs[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \
-        costs[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \
-        costs[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \
-        costs[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \
+        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
+        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
+        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
+        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
         COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
         COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
         COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
@@ -296,17 +285,10 @@
                pix_base + (m2x) + (m2y) * stride, \
                pix_base + (m3x) + (m3y) * stride, \
                stride, costs); \
-        /* TODO: use restrict keyword in ICL */ \
-        const uint16_t *base_mvx = &m_cost_mvx[(bmv.x << 2)]; \
-        const uint16_t *base_mvy = &m_cost_mvy[(bmv.y << 2)]; \
-        X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((bmv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \
-        (costs)[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \
-        (costs)[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \
-        (costs)[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \
-        (costs)[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \
+        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
+        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
+        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
+        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
     }
 
 #define DIA1_ITER(mx, my) \
@@ -639,36 +621,18 @@
         }
     }
 
+    X265_CHECK(!(ref->isLowres && numCandidates), "lowres motion candidates not allowed\n")
     // measure SAD cost at each QPEL motion vector candidate
-    if (ref->isLowres)
-    {
-        for (int i = 0; i < numCandidates; i++)
-        {
-            MV m = mvc[i].clipped(qmvmin, qmvmax);
-            if (m.notZero() && m != pmv && m != bestpre) // check already measured
-            {
-                int cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
-                if (cost < bprecost)
-                {
-                    bprecost = cost;
-                    bestpre = m;
-                }
-            }
-        }
-    }
-    else
+    for (int i = 0; i < numCandidates; i++)
     {
-        for (int i = 0; i < numCandidates; i++)
+        MV m = mvc[i].clipped(qmvmin, qmvmax);
+        if (m.notZero() & (m != pmv ? 1 : 0) & (m != bestpre ? 1 : 0)) // check already measured
         {
-            MV m = mvc[i].clipped(qmvmin, qmvmax);
-            if (m.notZero() && m != pmv && m != bestpre) // check already measured
+            int cost = subpelCompare(ref, m, sad) + mvcost(m);
+            if (cost < bprecost)
             {
-                int cost = subpelCompare(ref, m, sad) + mvcost(m);
-                if (cost < bprecost)
-                {
-                    bprecost = cost;
-                    bestpre = m;
-                }
+                bprecost = cost;
+                bestpre = m;
             }
         }
     }

x265_1.7.tar.gz/source/encoder/motion.h -> x265_1.8.tar.gz/source/encoder/motion.h Changed

x265_1.7.tar.gz/source/encoder/nal.cpp -> x265_1.8.tar.gz/source/encoder/nal.cpp Changed

x265_1.7.tar.gz/source/encoder/nal.h -> x265_1.8.tar.gz/source/encoder/nal.h Changed

x265_1.7.tar.gz/source/encoder/ratecontrol.cpp -> x265_1.8.tar.gz/source/encoder/ratecontrol.cpp Changed

@@ -37,7 +37,7 @@
 #define BR_SHIFT  6
 #define CPB_SHIFT 4
 
-using namespace x265;
+using namespace X265_NS;
 
 /* Amortize the partial cost of I frames over the next N frames */
 
@@ -181,6 +181,8 @@
     m_bTerminated = false;
     m_finalFrameCount = 0;
     m_numEntries = 0;
+    m_isSceneTransition = false;
+    m_lastPredictorReset = 0;
     if (m_param->rc.rateControlMode == X265_RC_CRF)
     {
         m_param->rc.qp = (int)m_param->rc.rfConstant;
@@ -273,7 +275,6 @@
     if(m_param->rc.bStrictCbr)
         m_rateTolerance = 0.7;
 
-    m_leadingBframes = m_param->bframes;
     m_bframeBits = 0;
     m_leadingNoBSatd = 0;
     m_ipOffset = 6.0 * X265_LOG2(m_param->rc.ipFactor);
@@ -282,6 +283,7 @@
     /* Adjust the first frame in order to stabilize the quality level compared to the rest */
 #define ABR_INIT_QP_MIN (24)
 #define ABR_INIT_QP_MAX (40)
+#define ABR_SCENECUT_INIT_QP_MIN (12)
 #define CRF_INIT_QP (int)m_param->rc.rfConstant
     for (int i = 0; i < 3; i++)
         m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
@@ -369,20 +371,8 @@
     m_accumPNorm = .01;
     m_accumPQp = (m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN) * m_accumPNorm;
 
-    /* Frame Predictors and Row predictors used in vbv */
-    for (int i = 0; i < 4; i++)
-    {
-        m_pred[i].coeff = 1.0;
-        m_pred[i].count = 1.0;
-        m_pred[i].decay = 0.5;
-        m_pred[i].offset = 0.0;
-    }
-    m_pred[0].coeff = m_pred[3].coeff = 0.75;
-    if (m_param->rc.qCompress >= 0.8) // when tuned for grain 
-    {
-        m_pred[1].coeff = 0.75;
-        m_pred[0].coeff = m_pred[3].coeff = 0.50;
-    }
+    /* Frame Predictors used in vbv */
+    initFramePredictors();
     if (!m_statFileOut && (m_param->rc.bStatWrite || m_param->rc.bStatRead))
     {
         /* If the user hasn't defined the stat filename, use the default value */
@@ -931,6 +921,24 @@
         return X265_TYPE_AUTO;
 }
 
+void RateControl::initFramePredictors()
+{
+    /* Frame Predictors used in vbv */
+    for (int i = 0; i < 4; i++)
+    {
+        m_pred[i].coeff = 1.0;
+        m_pred[i].count = 1.0;
+        m_pred[i].decay = 0.5;
+        m_pred[i].offset = 0.0;
+    }
+    m_pred[0].coeff = m_pred[3].coeff = 0.75;
+    if (m_param->rc.qCompress >= 0.8) // when tuned for grain 
+    {
+        m_pred[1].coeff = 0.75;
+        m_pred[0].coeff = m_pred[3].coeff = 0.50;
+    }
+}
+
 int RateControl::rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc)
 {
     int orderValue = m_startEndOrder.get();
@@ -960,10 +968,20 @@
         copyRceData(rce, &m_rce2Pass[rce->poc]);
     }
     rce->isActive = true;
-    if (m_sliceType == B_SLICE)
-        rce->bframes = m_leadingBframes;
-    else
-        m_leadingBframes = curFrame->m_lowres.leadingBframes;
+    bool isRefFrameScenecut = m_sliceType!= I_SLICE && m_curSlice->m_refPicList[0][0]->m_lowres.bScenecut == 1;
+    if (curFrame->m_lowres.bScenecut)
+    {
+        m_isSceneTransition = true;
+        m_lastPredictorReset = rce->encodeOrder;
+        initFramePredictors();
+    }
+    else if (m_sliceType != B_SLICE && !isRefFrameScenecut)
+        m_isSceneTransition = false;
+
+    if (rce->encodeOrder < m_lastPredictorReset + m_param->frameNumThreads)
+    {
+        rce->rowPreds[0][0].count = 0;
+    }
 
     rce->bLastMiniGopBFrame = curFrame->m_lowres.bLastMiniGopBFrame;
     rce->bufferRate = m_bufferRate;
@@ -1040,6 +1058,10 @@
                 }
             }
         }
+        /* For a scenecut that occurs within the mini-gop, enable scene transition
+         * switch until the next mini-gop to ensure a min qp for all the frames within 
+         * the scene-transition mini-gop */
+
         double q = x265_qScale2qp(rateEstimateQscale(curFrame, rce));
         q = x265_clip3((double)QP_MIN, (double)QP_MAX_MAX, q);
         m_qp = int(q + 0.5);
@@ -1087,18 +1109,6 @@
     }
     m_framesDone++;
 
-    /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to 
-     * tune RateControl parameters for other frames.
-     * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
-     * RateControlEnd here.those modes here. For the rest - ABR
-     * and VBV, unlock only after rateControlUpdateStats of this frame is called */
-    if (m_param->rc.rateControlMode != X265_RC_ABR && !m_isVbv)
-    {
-        m_startEndOrder.incr();
-
-        if (rce->encodeOrder < m_param->frameNumThreads - 1)
-            m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames
-    }
     return m_qp;
 }
 
@@ -1394,6 +1404,13 @@
         else
             q += m_pbOffset;
 
+        /* Enforce a minimum QP at scene changes and scene transitions */
+        if (m_isSceneTransition)
+        {
+            q = X265_MAX(ABR_SCENECUT_INIT_QP_MIN, q);
+            double minScenecutQscale =x265_qp2qScale(ABR_SCENECUT_INIT_QP_MIN); 
+            m_lastQScaleFor[P_SLICE] = X265_MAX(minScenecutQscale, m_lastQScaleFor[P_SLICE]);
+        }
         double qScale = x265_qp2qScale(q);
         rce->qpNoVbv = q;
         double lmin = 0, lmax = 0;
@@ -1556,11 +1573,19 @@
                 q = X265_MIN(lqmax, q);
             }
             q = x265_clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q);
+            /* Enforce a minimum QP (as a qscale) at scene changes and scene transitions */
+            if (m_isSceneTransition)
+            {
+               double minScenecutQscale =x265_qp2qScale(ABR_SCENECUT_INIT_QP_MIN); 
+               q = X265_MAX(minScenecutQscale, q);
+               m_lastQScaleFor[P_SLICE] = X265_MAX(minScenecutQscale, m_lastQScaleFor[P_SLICE]);
+            }
             rce->qpNoVbv = x265_qScale2qp(q);
             q = clipQscale(curFrame, rce, q);
             /*  clip qp to permissible range after vbv-lookahead estimation to avoid possible
-             * mispredictions by initial frame size predictors */
-            if (!m_2pass && m_isVbv && m_pred[m_predType].count == 1)
+             * mispredictions by initial frame size predictors, after each scenecut */
+            bool isFrameAfterScenecut = m_sliceType!= I_SLICE && m_curSlice->m_refPicList[0][0]->m_lowres.bScenecut;
+            if (!m_2pass && m_isVbv && isFrameAfterScenecut)
                 q = x265_clip3(lqmin, lqmax, q);
         }
         m_lastQScaleFor[m_sliceType] = q;
@@ -1762,7 +1787,7 @@
                 }
                 /* Try to get the buffer not more than 80% filled, but don't set an impossible goal. */
                 targetFill = x265_clip3(m_bufferSize * (1 - 0.2 * finalDur), m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5);
-                if (m_isCbr && bufferFillCur > targetFill)
+                if (m_isCbr && bufferFillCur > targetFill && !m_isSceneTransition)
                 {
                     q /= 1.01;
                     loopTerminate |= 2;
@@ -1904,6 +1929,7 @@
             else if (picType == P_SLICE)
             {
                 intraCostForPendingCus = curEncData.m_rowStat[row].intraSatdForVbv - curEncData.m_rowStat[row].diagIntraSatd;
+                intraCostForPendingCus >>= X265_DEPTH - 8;
                 /* Our QP is lower than the reference! */
                 double pred_intra = predictSize(rce->rowPred[1], qScale, intraCostForPendingCus);
                 /* Sum: better to overestimate than underestimate by using only one of the two predictors. */
@@ -1939,7 +1965,7 @@
             uint64_t intraRowSatdCost = curEncData.m_rowStat[row].diagIntraSatd;
             if (row == 1)
                 intraRowSatdCost += curEncData.m_rowStat[0].diagIntraSatd;
-
+            intraRowSatdCost >>= X265_DEPTH - 8;
             updatePredictor(rce->rowPred[1], qScaleVbv, (double)intraRowSatdCost, encodedBits);
         }
     }
@@ -2130,7 +2156,7 @@

x265_1.7.tar.gz/source/encoder/ratecontrol.h -> x265_1.8.tar.gz/source/encoder/ratecontrol.h Changed

@@ -29,7 +29,7 @@
 #include "common.h"
 #include "sei.h"
 
-namespace x265 {
+namespace X265_NS {
 // encoder namespace
 
 class Encoder;
@@ -46,23 +46,6 @@
 #define MIN_AMORTIZE_FRACTION 0.2
 #define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
 
-/* Current frame stats for 2 pass */
-struct FrameStats
-{
-    int         mvBits;    /* MV bits (MV+Ref+Block Type) */
-    int         coeffBits; /* Texture bits (DCT coefs) */
-    int         miscBits;
-
-    int         iCuCnt;
-    int         pCuCnt;
-    int         skipCuCnt;
-    
-    /* CU type counts stored as percentage */
-    double      percentIntra;
-    double      percentInter;
-    double      percentSkip;
-};
-
 struct Predictor
 {
     double coeff;
@@ -164,7 +147,6 @@
     double  m_pbOffset;
     int64_t m_bframeBits;
     int64_t m_currentSatd;
-    int     m_leadingBframes;
     int     m_qpConstant[3];
     int     m_lastNonBPictType;
     int     m_framesDone;        /* # of frames passed through RateCotrol already */
@@ -190,6 +172,8 @@
     int64_t m_lastBsliceSatdCost;
     int     m_numBframesInPattern;
     bool    m_isPatternPresent;
+    bool    m_isSceneTransition;
+    int     m_lastPredictorReset;
 
     /* a common variable on which rateControlStart, rateControlEnd and rateControUpdateStats waits to
      * sync the calls to these functions. For example
@@ -241,12 +225,12 @@
     // to be called for each curFrame to process RateControl and set QP
     int  rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc);
     void rateControlUpdateStats(RateControlEntry* rce);
-    int  rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce, FrameStats* stats);
+    int  rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce);
     int  rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
     int  rateControlSliceType(int frameNum);
     bool cuTreeReadFor2Pass(Frame* curFrame);
     void hrdFullness(SEIBufferingPeriod* sei);
-
+    int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
 protected:
 
     static const int   s_slidingWindowFrames;
@@ -274,6 +258,7 @@
     void   checkAndResetABR(RateControlEntry* rce, bool isFrameDone);
     double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits);
     bool   initPass2();
+    void   initFramePredictors();
     double getDiffLimitedQScale(RateControlEntry *rce, double q);
     double countExpectedBits();
     bool   vbv2Pass(uint64_t allAvailableBits);

x265_1.7.tar.gz/source/encoder/rdcost.h -> x265_1.8.tar.gz/source/encoder/rdcost.h Changed

@@ -27,7 +27,7 @@
 #include "common.h"
 #include "slice.h"
 
-namespace x265 {
+namespace X265_NS {
 // private namespace
 
 class RDCost
@@ -88,10 +88,17 @@
         m_lambda = (uint64_t)floor(256.0 * lambda);
     }
 
-    inline uint64_t calcRdCost(uint32_t distortion, uint32_t bits) const
+    inline uint64_t calcRdCost(sse_ret_t distortion, uint32_t bits) const
     {
+#if X265_DEPTH <= 10
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
-                   "calcRdCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", distortion, bits, m_lambda2);
+                   "calcRdCost wrap detected dist: %u, bits %u, lambda: " X265_LL "\n",
+                   distortion, bits, m_lambda2);
+#else
+        X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
+                   "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
+                   distortion, bits, m_lambda2);
+#endif
         return distortion + ((bits * m_lambda2 + 128) >> 8);
     }
 
@@ -108,7 +115,7 @@
     }
 
     /* return the RD cost of this prediction, including the effect of psy-rd */
-    inline uint64_t calcPsyRdCost(uint32_t distortion, uint32_t bits, uint32_t psycost) const
+    inline uint64_t calcPsyRdCost(sse_ret_t distortion, uint32_t bits, uint32_t psycost) const
     {
         return distortion + ((m_lambda * m_psyRd * psycost) >> 24) + ((bits * m_lambda2) >> 8);
     }
@@ -116,15 +123,22 @@
     inline uint64_t calcRdSADCost(uint32_t sadCost, uint32_t bits) const
     {
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda,
-                   "calcRdSADCost wrap detected dist: %u, bits %u, lambda: "X265_LL"\n", sadCost, bits, m_lambda);
+                   "calcRdSADCost wrap detected dist: %u, bits %u, lambda: " X265_LL "\n", sadCost, bits, m_lambda);
         return sadCost + ((bits * m_lambda + 128) >> 8);
     }
 
-    inline uint32_t scaleChromaDist(uint32_t plane, uint32_t dist) const
+    inline sse_ret_t scaleChromaDist(uint32_t plane, sse_ret_t dist) const
     {
+#if X265_DEPTH <= 10
+        X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
+                   "scaleChromaDist wrap detected dist: %u, lambda: %u\n",
+                   dist, m_chromaDistWeight[plane - 1]);
+#else
         X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
-                   "scaleChromaDist wrap detected dist: %u, lambda: %u\n", dist, m_chromaDistWeight[plane - 1]);
-        return (uint32_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
+                   "scaleChromaDist wrap detected dist: " X265_LL " lambda: %u\n",
+                   dist, m_chromaDistWeight[plane - 1]);
+#endif
+        return (sse_ret_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
     }
 
     inline uint32_t getCost(uint32_t bits) const

x265_1.7.tar.gz/source/encoder/reference.cpp -> x265_1.8.tar.gz/source/encoder/reference.cpp Changed

x265_1.7.tar.gz/source/encoder/reference.h -> x265_1.8.tar.gz/source/encoder/reference.h Changed

x265_1.7.tar.gz/source/encoder/sao.cpp -> x265_1.8.tar.gz/source/encoder/sao.cpp Changed

@@ -42,15 +42,25 @@
     return (x >> 31) | ((int)((((uint32_t)-x)) >> 31));
 }
 
+inline int signOf2(const int a, const int b)
+{
+    // NOTE: don't reorder below compare, both ICL, VC, GCC optimize strong depends on order!
+    int r = 0;
+    if (a < b)
+        r = -1;
+    if (a > b)
+        r = 1;
+    return r;
+}
+
 inline int64_t estSaoDist(int32_t count, int offset, int32_t offsetOrg)
 {
     return (count * offset - offsetOrg * 2) * offset;
 }
-
 } // end anonymous namespace
 
 
-namespace x265 {
+namespace X265_NS {
 
 const uint32_t SAO::s_eoTable[NUM_EDGETYPE] =
 {
@@ -213,14 +223,19 @@
         frame->m_encData->m_saoParam = saoParam;
     }
 
-    rdoSaoUnitRowInit(saoParam);
+    saoParam->bSaoFlag[0] = true;
+    saoParam->bSaoFlag[1] = true;
 
-    // NOTE: Disable SAO automatic turn-off when frame parallelism is
-    // enabled for output exact independent of frame thread count
-    if (m_param->frameNumThreads > 1)
+    m_numNoSao[0] = 0; // Luma
+    m_numNoSao[1] = 0; // Chroma
+
+    // NOTE: Allow SAO automatic turn-off only when frame parallelism is disabled.
+    if (m_param->frameNumThreads == 1)
     {
-        saoParam->bSaoFlag[0] = true;
-        saoParam->bSaoFlag[1] = true;
+        if (m_refDepth > 0 && m_depthSaoRate[0][m_refDepth - 1] > SAO_ENCODING_RATE)
+            saoParam->bSaoFlag[0] = false;
+        if (m_refDepth > 0 && m_depthSaoRate[1][m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA)
+            saoParam->bSaoFlag[1] = false;
     }
 }
 
@@ -656,7 +671,6 @@
 /* Calculate SAO statistics for current CTU without non-crossing slice */
 void SAO::calcSaoStatsCu(int addr, int plane)
 {
-    int x, y;
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
     const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
     const pixel* rec0  = m_frame->m_reconPic->getPlaneAddr(plane, addr);
@@ -687,8 +701,6 @@
     int startY;
     int endX;
     int endY;
-    int32_t* stats;
-    int32_t* count;
 
     int skipB = plane ? 2 : 4;
     int skipR = plane ? 3 : 5;
@@ -698,34 +710,16 @@
 
     // SAO_BO:
     {
-        const int boShift = X265_DEPTH - SAO_BO_BITS;
-
         if (m_param->bSaoNonDeblocked)
         {
             skipB = plane ? 1 : 3;
             skipR = plane ? 2 : 4;
         }
-        stats = m_offsetOrg[plane][SAO_BO];
-        count = m_count[plane][SAO_BO];
-
-        fenc = fenc0;
-        rec  = rec0;
 
         endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
         endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
 
-        for (y = 0; y < endY; y++)
-        {
-            for (x = 0; x < endX; x++)
-            {
-                int classIdx = 1 + (rec[x] >> boShift);
-                stats[classIdx] += (fenc[x] - rec[x]);
-                count[classIdx]++;
-            }
-
-            fenc += stride;
-            rec += stride;
-        }
+        primitives.saoCuStatsBO(fenc0, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
     }
 
     {
@@ -736,30 +730,11 @@
                 skipB = plane ? 1 : 3;
                 skipR = plane ? 3 : 5;
             }
-            stats = m_offsetOrg[plane][SAO_EO_0];
-            count = m_count[plane][SAO_EO_0];
-
-            fenc = fenc0;
-            rec  = rec0;
 
             startX = !lpelx;
             endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
-            for (y = 0; y < ctuHeight - skipB; y++)
-            {
-                int signLeft = signOf(rec[startX] - rec[startX - 1]);
-                for (x = startX; x < endX; x++)
-                {
-                    int signRight = signOf(rec[x] - rec[x + 1]);
-                    int edgeType = signRight + signLeft + 2;
-                    signLeft = -signRight;
-
-                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
-                    count[s_eoTable[edgeType]]++;
-                }
 
-                fenc += stride;
-                rec += stride;
-            }
+            primitives.saoCuStatsE0(fenc0 + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
         }
 
         // SAO_EO_1: // dir: |
@@ -769,8 +744,6 @@
                 skipB = plane ? 2 : 4;
                 skipR = plane ? 2 : 4;
             }
-            stats = m_offsetOrg[plane][SAO_EO_1];
-            count = m_count[plane][SAO_EO_1];
 
             fenc = fenc0;
             rec  = rec0;
@@ -786,21 +759,7 @@
 
             primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
 
-            for (y = startY; y < endY; y++)
-            {
-                for (x = 0; x < endX; x++)
-                {
-                    int8_t signDown = signOf(rec[x] - rec[x + stride]);
-                    int edgeType = signDown + upBuff1[x] + 2;
-                    upBuff1[x] = -signDown;
-
-                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
-                    count[s_eoTable[edgeType]]++;
-                }
-
-                fenc += stride;
-                rec += stride;
-            }
+            primitives.saoCuStatsE1(fenc0 + startY * stride, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
         }
 
         // SAO_EO_2: // dir: 135
@@ -810,8 +769,6 @@
                 skipB = plane ? 2 : 4;
                 skipR = plane ? 3 : 5;
             }
-            stats = m_offsetOrg[plane][SAO_EO_2];
-            count = m_count[plane][SAO_EO_2];
 
             fenc = fenc0;
             rec  = rec0;
@@ -829,23 +786,7 @@
 
             primitives.sign(&upBuff1[startX], &rec[startX], &rec[startX - stride - 1], (endX - startX));
 
-            for (y = startY; y < endY; y++)
-            {
-                upBufft[startX] = signOf(rec[startX + stride] - rec[startX - 1]);
-                for (x = startX; x < endX; x++)
-                {
-                    int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
-                    int edgeType = signDown + upBuff1[x] + 2;
-                    upBufft[x + 1] = -signDown;
-                    stats[s_eoTable[edgeType]] += (fenc[x] - rec[x]);
-                    count[s_eoTable[edgeType]]++;
-                }
-
-                std::swap(upBuff1, upBufft);
-
-                rec += stride;
-                fenc += stride;

x265_1.7.tar.gz/source/encoder/sao.h -> x265_1.8.tar.gz/source/encoder/sao.h Changed

@@ -30,7 +30,7 @@
 #include "frame.h"
 #include "entropy.h"
 
-namespace x265 {
+namespace X265_NS {
 // private namespace
 
 enum SAOTypeLen
@@ -52,12 +52,12 @@
 
 class SAO
 {
-protected:
+public:
 
     enum { SAO_MAX_DEPTH = 4 };
     enum { SAO_BO_BITS  = 5 };
     enum { MAX_NUM_SAO_CLASS = 33 };
-    enum { SAO_BIT_INC = X265_MAX(X265_DEPTH - 10, 0) };
+    enum { SAO_BIT_INC = 0 }; /* in HM12.0, it wrote as X265_MAX(X265_DEPTH - 10, 0) */
     enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) };
     enum { NUM_EDGETYPE = 5 };
     enum { NUM_PLANE = 3 };
@@ -68,6 +68,8 @@
     typedef int32_t (PerClass[MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
     typedef int32_t (PerPlane[NUM_PLANE][MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
 
+protected:
+
     /* allocated per part */
     PerClass*   m_count;
     PerClass*   m_offset;
@@ -142,7 +144,6 @@
                              int32_t* currentDistortionTableBo, double* currentRdCostTableBo);
     inline int64_t estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo);
 
-    void rdoSaoUnitRowInit(SAOParam* saoParam);
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
     void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
 };

x265_1.7.tar.gz/source/encoder/search.cpp -> x265_1.8.tar.gz/source/encoder/search.cpp Changed

@@ -33,7 +33,7 @@
 #include "analysis.h"  // TLD
 #include "framedata.h"
 
-using namespace x265;
+using namespace X265_NS;
 
 #if _MSC_VER
 #pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
@@ -319,7 +319,7 @@
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
         if (numSig)
         {
-            m_quant.invtransformNxN(residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
+            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
             primitives.cu[sizeIdx].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
         }
         else
@@ -517,7 +517,7 @@
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
         if (numSig)
         {
-            m_quant.invtransformNxN(residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
+            m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
             primitives.cu[sizeIdx].add_ps(tmpRecon, tmpReconStride, pred, residual, stride, stride);
         }
         else if (useTSkip)
@@ -530,7 +530,7 @@
             // no residual coded, recon = pred
             primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
 
-        uint32_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
+        sse_ret_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
 
         cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
         cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
@@ -667,7 +667,7 @@
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
         if (numSig)
         {
-            m_quant.invtransformNxN(residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
+            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
             primitives.cu[sizeIdx].add_ps(picReconY, picStride, pred, residual, stride, stride);
             cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
         }
@@ -797,7 +797,7 @@
     uint32_t qtLayer = log2TrSize - 2;
     uint32_t stride = mode.fencYuv->m_csize;
     const uint32_t sizeIdxC = log2TrSizeC - 2;
-    uint32_t outDist = 0;
+    sse_ret_t outDist = 0;
 
     uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
@@ -841,7 +841,7 @@
             uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
             if (numSig)
             {
-                m_quant.invtransformNxN(residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
+                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
                 primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
                 cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
             }
@@ -942,7 +942,7 @@
                 uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
                 if (numSig)
                 {
-                    m_quant.invtransformNxN(residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
+                    m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
                     primitives.cu[sizeIdxC].add_ps(recon, reconStride, pred, residual, stride, stride);
                     cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                 }
@@ -956,7 +956,7 @@
                     primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
                     cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                 }
-                uint32_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
+                sse_ret_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
                 tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
 
                 cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
@@ -1129,7 +1129,7 @@
             uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
             if (numSig)
             {
-                m_quant.invtransformNxN(residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
+                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
                 primitives.cu[sizeIdxC].add_ps(picReconC, picStride, pred, residual, stride, stride);
                 cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
             }
@@ -1156,14 +1156,14 @@
 
     cu.setPartSizeSubParts(partSize);
     cu.setPredModeSubParts(MODE_INTRA);
-    m_quant.m_tqBypass = !!cu.m_tqBypass[0];
 
     uint32_t tuDepthRange[2];
     cu.getIntraTUQtDepthRange(tuDepthRange, 0);
 
     intraMode.initCosts();
-    intraMode.distortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
-    intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom, sharedChromaModes);
+    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
+    intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom, sharedChromaModes);
+    intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
 
     m_entropyCoder.resetBits();
     if (m_slice->m_pps->bTransquantBypassEnabled)
@@ -1378,8 +1378,9 @@
     codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
     extractIntraResultQT(cu, *reconYuv, 0, 0);
 
-    intraMode.distortion = icosts.distortion;
-    intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom, NULL);
+    intraMode.lumaDistortion = icosts.distortion;
+    intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom, NULL);
+    intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;
 
     m_entropyCoder.resetBits();
     if (m_slice->m_pps->bTransquantBypassEnabled)
@@ -1861,6 +1862,29 @@
     return outCost;
 }
 
+/* find the lowres motion vector from lookahead in middle of current PU */
+MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref)
+{
+    int diffPoc = abs(m_slice->m_poc - m_slice->m_refPicList[list][ref]->m_poc);
+    if (diffPoc > m_param->bframes + 1)
+        /* poc difference is out of range for lookahead */
+        return 0;
+
+    MV* mvs = m_frame->m_lowres.lowresMvs[list][diffPoc - 1];
+    if (mvs[0].x == 0x7FFF)
+        /* this motion search was not estimated by lookahead */
+        return 0;
+
+    uint32_t block_x = (cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + pu.width / 2) >> 4;
+    uint32_t block_y = (cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + pu.height / 2) >> 4;
+    uint32_t idx = block_y * m_frame->m_lowres.maxBlocksInRow + block_x;
+
+    X265_CHECK(block_x < m_frame->m_lowres.maxBlocksInRow, "block_x is too high\n");
+    X265_CHECK(block_y < m_frame->m_lowres.maxBlocksInCol, "block_y is too high\n");
+
+    return mvs[idx] << 1; /* scale up lowres mv */
+}
+
 /* Pick between the two AMVP candidates which is the best one to use as
  * MVP for the motion search, based on SAD cost */
 int Search::selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref)
@@ -1929,10 +1953,16 @@
     /* Perform ME, repeat until no more work is available */
     do
     {
-        if (meId < m_slice->m_numRefIdx[0])
-            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, meId);
+        if (meId < pme.m_jobs.refCnt[0])
+        {
+            int refIdx = pme.m_jobs.ref[0][meId]; //L0
+            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, refIdx);
+        }
         else
-            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, meId - m_slice->m_numRefIdx[0]);
+        {
+            int refIdx = pme.m_jobs.ref[1][meId - pme.m_jobs.refCnt[0]]; //L1
+            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, refIdx);
+        }
 
         meId = -1;
         pme.m_lock.acquire();
@@ -1950,13 +1980,18 @@
 
     MotionData* bestME = interMode.bestME[part];
 
-    MV  mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
+    // 12 mv candidates including lowresMV
+    MV  mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
     int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
 
     const MV* amvp = interMode.amvpCand[list][ref];
     int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
     MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
 
+    MV lmv = getLowresMV(interMode.cu, pu, list, ref);
+    if (lmv.notZero())
+        mvc[numMvc++] = lmv;
+
     setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
 
     int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);
@@ -1983,23 +2018,22 @@
 }
 
 /* find the best inter prediction for each PU of specified mode */
-void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC)
+void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
 {
     ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
 
     CUData& cu = interMode.cu;

x265_1.7.tar.gz/source/encoder/search.h -> x265_1.8.tar.gz/source/encoder/search.h Changed

@@ -48,7 +48,7 @@
 #define ProfileCounter(cu, count)
 #endif
 
-namespace x265 {
+namespace X265_NS {
 // private namespace
 
 class Entropy;
@@ -109,7 +109,9 @@
     uint64_t   sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
     uint32_t   sa8dBits;   // signal bits used in sa8dCost calculation
     uint32_t   psyEnergy;  // sum of partition psycho-visual energy difference
-    uint32_t   distortion; // sum of partition SSE distortion
+    sse_ret_t  lumaDistortion;
+    sse_ret_t  chromaDistortion;
+    sse_ret_t  distortion; // sum of partition SSE distortion
     uint32_t   totalBits;  // sum of partition bits (mv + coeff)
     uint32_t   mvBits;     // Mv bits + Ref + block type (or intra mode)
     uint32_t   coeffBits;  // Texture bits (DCT Coeffs)
@@ -120,6 +122,8 @@
         sa8dCost = 0;
         sa8dBits = 0;
         psyEnergy = 0;
+        lumaDistortion = 0;
+        chromaDistortion = 0;
         distortion = 0;
         totalBits = 0;
         mvBits = 0;
@@ -133,7 +137,15 @@
         sa8dCost = UINT64_MAX / 2;
         sa8dBits = MAX_UINT / 2;
         psyEnergy = MAX_UINT / 2;
+#if X265_DEPTH <= 10
+        lumaDistortion = MAX_UINT / 2;
+        chromaDistortion = MAX_UINT / 2;
         distortion = MAX_UINT / 2;
+#else
+        lumaDistortion = UINT64_MAX / 2;
+        chromaDistortion = UINT64_MAX / 2;
+        distortion = UINT64_MAX / 2;
+#endif
         totalBits = MAX_UINT / 2;
         mvBits = MAX_UINT / 2;
         coeffBits = MAX_UINT / 2;
@@ -141,14 +153,29 @@
 
     bool ok() const
     {
+#if X265_DEPTH <= 10
+        return !(rdCost >= UINT64_MAX / 2 ||
+            sa8dCost >= UINT64_MAX / 2 ||
+            sa8dBits >= MAX_UINT / 2 ||
+            psyEnergy >= MAX_UINT / 2 ||
+            lumaDistortion >= MAX_UINT / 2 ||
+            chromaDistortion >= MAX_UINT / 2 ||
+            distortion >= MAX_UINT / 2 ||
+            totalBits >= MAX_UINT / 2 ||
+            mvBits >= MAX_UINT / 2 ||
+            coeffBits >= MAX_UINT / 2);
+#else
         return !(rdCost >= UINT64_MAX / 2 ||
                  sa8dCost >= UINT64_MAX / 2 ||
                  sa8dBits >= MAX_UINT / 2 ||
                  psyEnergy >= MAX_UINT / 2 ||
-                 distortion >= MAX_UINT / 2 ||
+                 lumaDistortion >= UINT64_MAX / 2 ||
+                 chromaDistortion >= UINT64_MAX / 2 ||
+                 distortion >= UINT64_MAX / 2 ||
                  totalBits >= MAX_UINT / 2 ||
                  mvBits >= MAX_UINT / 2 ||
                  coeffBits >= MAX_UINT / 2);
+#endif
     }
 
     void addSubCosts(const Mode& subMode)
@@ -159,6 +186,8 @@
         sa8dCost += subMode.sa8dCost;
         sa8dBits += subMode.sa8dBits;
         psyEnergy += subMode.psyEnergy;
+        lumaDistortion += subMode.lumaDistortion;
+        chromaDistortion += subMode.chromaDistortion;
         distortion += subMode.distortion;
         totalBits += subMode.totalBits;
         mvBits += subMode.mvBits;
@@ -186,6 +215,11 @@
     int64_t  weightAnalyzeTime;                 // elapsed worker time analyzing reference weights
     int64_t  totalCTUTime;                      // elapsed worker time in compressCTU (includes pmode master)
 
+    uint32_t skippedMotionReferences[NUM_CU_DEPTH];
+    uint32_t totalMotionReferences[NUM_CU_DEPTH];
+    uint32_t skippedIntraCU[NUM_CU_DEPTH];
+    uint32_t totalIntraCU[NUM_CU_DEPTH];
+
     uint64_t countIntraRDO[NUM_CU_DEPTH];
     uint64_t countInterRDO[NUM_CU_DEPTH];
     uint64_t countIntraAnalysis;
@@ -213,6 +247,10 @@
             interRDOElapsedTime[i] += other.interRDOElapsedTime[i];
             countIntraRDO[i] += other.countIntraRDO[i];
             countInterRDO[i] += other.countInterRDO[i];
+            skippedMotionReferences[i] += other.skippedMotionReferences[i];
+            totalMotionReferences[i] += other.totalMotionReferences[i];
+            skippedIntraCU[i] += other.skippedIntraCU[i];
+            totalIntraCU[i] += other.totalIntraCU[i];
         }
 
         intraAnalysisElapsedTime += other.intraAnalysisElapsedTime;
@@ -301,7 +339,7 @@
     void     encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
 
     // estimation inter prediction (non-skip)
-    void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC);
+    void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]);
 
     // encode residual and compute rd-cost for inter mode
     void     encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
@@ -319,6 +357,8 @@
     void checkDQP(Mode& mode, const CUGeom& cuGeom);
     void checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom);
 
+    MV getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref);
+
     class PME : public BondedTaskGroup
     {
     public:
@@ -329,6 +369,11 @@
         const PredictionUnit& pu;
         int           puIdx;
 
+        struct {
+            int ref[2][MAX_NUM_REF];
+            int refCnt[2];
+        } m_jobs;
+
         PME(Search& s, Mode& m, const CUGeom& g, const PredictionUnit& u, int p) : master(s), mode(m), cuGeom(g), pu(u), puIdx(p) {}
 
         void processTasks(int workerThreadId);
@@ -365,7 +410,7 @@
     {
         uint64_t rdcost;
         uint32_t bits;
-        uint32_t distortion;
+        sse_ret_t distortion;
         uint32_t energy;
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
     };

x265_1.7.tar.gz/source/encoder/sei.cpp -> x265_1.8.tar.gz/source/encoder/sei.cpp Changed

x265_1.7.tar.gz/source/encoder/sei.h -> x265_1.8.tar.gz/source/encoder/sei.h Changed

x265_1.7.tar.gz/source/encoder/slicetype.cpp -> x265_1.8.tar.gz/source/encoder/slicetype.cpp Changed

@@ -40,7 +40,7 @@
 #define ProfileLookaheadTime(elapsed, count)
 #endif
 
-using namespace x265;
+using namespace X265_NS;
 
 namespace {
 
@@ -94,9 +94,7 @@
     /* Actual adaptive quantization */
     int maxCol = curFrame->m_fencPic->m_picWidth;
     int maxRow = curFrame->m_fencPic->m_picHeight;
-    int blockWidth = ((param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    int blockHeight = ((param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    int blockCount = blockWidth * blockHeight;
+    int blockCount = curFrame->m_lowres.maxBlocksInRow * curFrame->m_lowres.maxBlocksInCol;
 
     for (int y = 0; y < 3; y++)
     {
@@ -133,15 +131,16 @@
     {
         blockXY = 0;
         double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
-        if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
+        double bias_strength = 0.f;
+        if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
         {
-            double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
+            double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
             for (blockY = 0; blockY < maxRow; blockY += 16)
             {
                 for (blockX = 0; blockX < maxCol; blockX += 16)
                 {
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
-                    qp_adj = pow(energy + 1, 0.1);
+                    qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
                     curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
                     avg_adj += qp_adj;
                     avg_adj_pow2 += qp_adj * qp_adj;
@@ -151,8 +150,9 @@
 
             avg_adj /= blockCount;
             avg_adj_pow2 /= blockCount;
-            strength = param->rc.aqStrength * avg_adj / bit_depth_correction;
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj;
+            strength = param->rc.aqStrength * avg_adj;
+            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f)) / avg_adj;
+            bias_strength = param->rc.aqStrength;
         }
         else
             strength = param->rc.aqStrength * 1.0397f;
@@ -162,7 +162,12 @@
         {
             for (blockX = 0; blockX < maxCol; blockX += 16)
             {
-                if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
+                if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
+                {
+                    qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
+                    qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - 11.f / (qp_adj * qp_adj));
+                }
+                else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
                 {
                     qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
                     qp_adj = strength * (qp_adj - avg_adj);
@@ -464,6 +469,7 @@
     m_pool  = pool;
 
     m_lastNonB = NULL;
+    m_isSceneTransition = false;
     m_scratch  = NULL;
     m_tld      = NULL;
     m_filled   = false;
@@ -1248,7 +1254,9 @@
 
     int numBFrames = 0;
     int numAnalyzed = numFrames;
-    if (m_param->scenecutThreshold && scenecut(frames, 0, 1, true, origNumFrames, maxSearch))
+    bool isScenecut = scenecut(frames, 0, 1, true, origNumFrames);
+    /* When scenecut threshold is set, use scenecut detection for I frame placements */
+    if (m_param->scenecutThreshold && isScenecut)
     {
         frames[1]->sliceType = X265_TYPE_I;
         return;
@@ -1338,14 +1346,13 @@
         /* Check scenecut on the first minigop. */
         for (int j = 1; j < numBFrames + 1; j++)
         {
-            if (m_param->scenecutThreshold && scenecut(frames, j, j + 1, false, origNumFrames, maxSearch))
+            if (scenecut(frames, j, j + 1, false, origNumFrames))
             {
                 frames[j]->sliceType = X265_TYPE_P;
                 numAnalyzed = j;
                 break;
             }
         }
-
         resetStart = bKeyframe ? 1 : X265_MIN(numBFrames + 2, numAnalyzed + 1);
     }
     else
@@ -1369,50 +1376,99 @@
     if (bIsVbvLookahead)
         vbvLookahead(frames, numFrames, bKeyframe);
 
+     int maxp1 = X265_MIN(m_param->bframes + 1, origNumFrames);
     /* Restore frame types for all frames that haven't actually been decided yet. */
     for (int j = resetStart; j <= numFrames; j++)
+    {
         frames[j]->sliceType = X265_TYPE_AUTO;
+        /* If any frame marked as scenecut is being restarted for sliceDecision, 
+         * undo scene Transition flag */
+        if (j <= maxp1 && frames[j]->bScenecut && m_isSceneTransition)
+            m_isSceneTransition = false;
+    }
 }
 
-bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch)
+bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames)
 {
     /* Only do analysis during a normal scenecut check. */
     if (bRealScenecut && m_param->bframes)
     {
         int origmaxp1 = p0 + 1;
         /* Look ahead to avoid coding short flashes as scenecuts. */
-        if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS)
-            /* Don't analyse any more frames than the trellis would have covered. */
-            origmaxp1 += m_param->bframes;
-        else
-            origmaxp1++;
+        origmaxp1 += m_param->bframes;
         int maxp1 = X265_MIN(origmaxp1, numFrames);
-
+        bool fluctuate = false;
+        bool noScenecuts = false;
+        int64_t avgSatdCost = 0;
+        if (frames[0]->costEst[1][0] > -1)
+            avgSatdCost = frames[0]->costEst[1][0];
+        int cnt = 1;
         /* Where A and B are scenes: AAAAAABBBAAAAAA
          * If BBB is shorter than (maxp1-p0), it is detected as a flash
          * and not considered a scenecut. */
         for (int cp1 = p1; cp1 <= maxp1; cp1++)
         {
             if (!scenecutInternal(frames, p0, cp1, false))
+            {
                 /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
                 for (int i = cp1; i > p0; i--)
+                {
                     frames[i]->bScenecut = false;
+                    noScenecuts = false;
+                }
+            }
+            else if (scenecutInternal(frames, cp1 - 1, cp1, false))
+            {
+                /* If current frame is a Scenecut from p0 frame as well as Scenecut from
+                 * preceding frame, mark it as a Scenecut */
+                frames[cp1]->bScenecut = true;
+                noScenecuts = true;
+            }
+
+            /* compute average satdcost of all the frames in the mini-gop to confirm 
+             * whether there is any great fluctuation among them to rule out false positives */
+            X265_CHECK(frames[cp1]->costEst[cp1 - p0][0]!= -1, "costEst is not done \n");
+            avgSatdCost += frames[cp1]->costEst[cp1 - p0][0];
+            cnt++;
         }
 
-        /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF
-         * If each of BB ... EE are shorter than (maxp1-p0), they are
-         * detected as flashes and not considered scenecuts.
-         * Instead, the first F frame becomes a scenecut.
-         * If the video ends before F, no frame becomes a scenecut. */
-        for (int cp0 = p0; cp0 <= maxp1; cp0++)
+        /* Identify possible scene fluctuations by comparing the satd cost of the frames.
+         * This could denote the beginning or ending of scene transitions.
+         * During a scene transition(fade in/fade outs), if fluctuate remains false,
+         * then the scene had completed its transition or stabilized */
+        if (noScenecuts)
         {
-            if (origmaxp1 > maxSearch || (cp0 < maxp1 && scenecutInternal(frames, cp0, maxp1, false)))
-                /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */
-                frames[cp0]->bScenecut = false;
+            fluctuate = false;
+            avgSatdCost /= cnt;
+            for (int i = p1; i <= maxp1; i++)
+            {
+                int64_t curCost  = frames[i]->costEst[i - p0][0];
+                int64_t prevCost = frames[i - 1]->costEst[i - 1 - p0][0];
+                if (fabs((double)(curCost - avgSatdCost)) > 0.1 * avgSatdCost || 
+                    fabs((double)(curCost - prevCost)) > 0.1 * prevCost)
+                {
+                    fluctuate = true;
+                    if (!m_isSceneTransition && frames[i]->bScenecut)
+                    {
+                        m_isSceneTransition = true;
+                        /* just mark the first scenechange in the scene transition as a scenecut. */
+                        for (int j = i + 1; j <= maxp1; j++)
+                            frames[j]->bScenecut = false;
+                        break;

x265_1.7.tar.gz/source/encoder/slicetype.h -> x265_1.8.tar.gz/source/encoder/slicetype.h Changed

x265_1.7.tar.gz/source/encoder/weightPrediction.cpp -> x265_1.8.tar.gz/source/encoder/weightPrediction.cpp Changed

x265_1.7.tar.gz/source/input/input.cpp -> x265_1.8.tar.gz/source/input/input.cpp Changed

x265_1.7.tar.gz/source/input/input.h -> x265_1.8.tar.gz/source/input/input.h Changed

x265_1.7.tar.gz/source/input/y4m.cpp -> x265_1.8.tar.gz/source/input/y4m.cpp Changed

x265_1.7.tar.gz/source/input/y4m.h -> x265_1.8.tar.gz/source/input/y4m.h Changed

x265_1.7.tar.gz/source/input/yuv.cpp -> x265_1.8.tar.gz/source/input/yuv.cpp Changed

x265_1.7.tar.gz/source/input/yuv.h -> x265_1.8.tar.gz/source/input/yuv.h Changed

x265_1.7.tar.gz/source/output/output.cpp -> x265_1.8.tar.gz/source/output/output.cpp Changed

x265_1.7.tar.gz/source/output/output.h -> x265_1.8.tar.gz/source/output/output.h Changed

x265_1.7.tar.gz/source/output/raw.cpp -> x265_1.8.tar.gz/source/output/raw.cpp Changed

x265_1.7.tar.gz/source/output/raw.h -> x265_1.8.tar.gz/source/output/raw.h Changed

x265_1.7.tar.gz/source/output/reconplay.cpp -> x265_1.8.tar.gz/source/output/reconplay.cpp Changed

x265_1.7.tar.gz/source/output/reconplay.h -> x265_1.8.tar.gz/source/output/reconplay.h Changed

x265_1.7.tar.gz/source/output/y4m.cpp -> x265_1.8.tar.gz/source/output/y4m.cpp Changed

x265_1.7.tar.gz/source/output/y4m.h -> x265_1.8.tar.gz/source/output/y4m.h Changed

x265_1.7.tar.gz/source/output/yuv.cpp -> x265_1.8.tar.gz/source/output/yuv.cpp Changed

x265_1.7.tar.gz/source/output/yuv.h -> x265_1.8.tar.gz/source/output/yuv.h Changed

x265_1.7.tar.gz/source/profile/vtune/vtune.cpp -> x265_1.8.tar.gz/source/profile/vtune/vtune.cpp Changed

x265_1.7.tar.gz/source/profile/vtune/vtune.h -> x265_1.8.tar.gz/source/profile/vtune/vtune.h Changed

x265_1.7.tar.gz/source/test/CMakeLists.txt -> x265_1.8.tar.gz/source/test/CMakeLists.txt Changed

x265_1.7.tar.gz/source/test/checkasm-a.asm -> x265_1.8.tar.gz/source/test/checkasm-a.asm Changed

x265_1.7.tar.gz/source/test/intrapredharness.cpp -> x265_1.8.tar.gz/source/test/intrapredharness.cpp Changed

@@ -25,12 +25,22 @@
 #include "predict.h"
 #include "intrapredharness.h"
 
-using namespace x265;
+using namespace X265_NS;
 
 IntraPredHarness::IntraPredHarness()
 {
     for (int i = 0; i < INPUT_SIZE; i++)
         pixel_buff[i] = rand() % PIXEL_MAX;
+
+    /* [0] --- Random values
+     * [1] --- Minimum
+     * [2] --- Maximum */
+    for (int i = 0; i < BUFFSIZE; i++)
+    {
+        pixel_test_buff[0][i]   = rand() % PIXEL_MAX;
+        pixel_test_buff[1][i]   = PIXEL_MIN;
+        pixel_test_buff[2][i]   = PIXEL_MAX;
+    }
 }
 
 bool IntraPredHarness::check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width)
@@ -177,6 +187,27 @@
     return true;
 }
 
+bool IntraPredHarness::check_intra_filter_primitive(const intra_filter_t ref, const intra_filter_t opt)
+{
+    memset(pixel_out_c, 0, 64 * 64 * sizeof(pixel));
+    memset(pixel_out_vec, 0, 64 * 64 * sizeof(pixel));
+    int j = 0;
+
+    for (int i = 0; i < 100; i++)
+    {
+        int index = rand() % TEST_CASES;
+
+        ref(pixel_test_buff[index] + j, pixel_out_c);
+        checked(opt, pixel_test_buff[index] + j, pixel_out_vec);
+
+        if (memcmp(pixel_out_c, pixel_out_vec, 64 * 64 * sizeof(pixel)))
+            return false;
+
+        reportfail();
+        j += FENC_STRIDE;
+    }
+    return true;
+}
 bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     for (int i = BLOCK_4x4; i <= BLOCK_32x32; i++)
@@ -213,6 +244,14 @@
                 return false;
             }
         }
+        if (opt.cu[i].intra_filter)
+        {
+            if (!check_intra_filter_primitive(ref.cu[i].intra_filter, opt.cu[i].intra_filter))
+            {
+                printf("intra_filter_%dx%d failed\n", size, size);
+                return false;
+            }
+        }
     }
 
     return true;
@@ -268,5 +307,10 @@
                                pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, mode, bFilter);
             }
         }
+        if (opt.cu[i].intra_filter)
+        {
+            printf("intra_filter_%dx%d", size, size);
+            REPORT_SPEEDUP(opt.cu[i].intra_filter, ref.cu[i].intra_filter, pixel_buff, pixel_out_c);
+        }
     }
 }

x265_1.7.tar.gz/source/test/intrapredharness.h -> x265_1.8.tar.gz/source/test/intrapredharness.h Changed

x265_1.7.tar.gz/source/test/ipfilterharness.cpp -> x265_1.8.tar.gz/source/test/ipfilterharness.cpp Changed

x265_1.7.tar.gz/source/test/mbdstharness.cpp -> x265_1.8.tar.gz/source/test/mbdstharness.cpp Changed

@@ -27,7 +27,7 @@
 #include "common.h"
 #include "mbdstharness.h"
 
-using namespace x265;
+using namespace X265_NS;
 
 struct DctConf
 {
@@ -53,7 +53,7 @@
 
 MBDstHarness::MBDstHarness()
 {
-    const int idct_max = (1 << (BIT_DEPTH + 4)) - 1;
+    const int idct_max = (1 << (X265_DEPTH + 4)) - 1;
 
     /* [0] --- Random values
      * [1] --- Minimum
@@ -215,8 +215,14 @@
         uint32_t optReturnValue = 0;
         uint32_t refReturnValue = 0;
 
-        int bits = (rand() % 24) + 8;
-        int valueToAdd = rand() % (1 << bits);
+        int sliceType = rand() % 2;
+        int log2TrSize = rand() % 4 + 2;
+        int qp = rand() % (QP_MAX_SPEC + QP_BD_OFFSET + 1);
+        int per = qp / 6;
+        int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
+
+        int bits = QUANT_SHIFT + per + transformShift;
+        int valueToAdd = (sliceType == 1 ? 171 : 85) << (bits - 9);
         int cmp_size = sizeof(int) * height * width;
         int cmp_size1 = sizeof(short) * height * width;
         int numCoeff = height * width;

x265_1.7.tar.gz/source/test/pixelharness.cpp -> x265_1.8.tar.gz/source/test/pixelharness.cpp Changed

@@ -23,8 +23,9 @@
 
 #include "pixelharness.h"
 #include "primitives.h"
+#include "entropy.h"
 
-using namespace x265;
+using namespace X265_NS;
 
 PixelHarness::PixelHarness()
 {
@@ -93,7 +94,7 @@
     return true;
 }
 
-bool PixelHarness::check_pixelcmp_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt)
+bool PixelHarness::check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt)
 {
     int j = 0;
     intptr_t stride = STRIDE;
@@ -102,8 +103,29 @@
     {
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
-        int vres = (int)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
-        int cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
+        sse_ret_t vres = (sse_ret_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+        sse_ret_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+        if (vres != cres)
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
+bool PixelHarness::check_pixel_sse_ss(pixel_sse_ss_t ref, pixel_sse_ss_t opt)
+{
+    int j = 0;
+    intptr_t stride = STRIDE;
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        int index1 = rand() % TEST_CASES;
+        int index2 = rand() % TEST_CASES;
+        sse_ret_t vres = (sse_ret_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
+        sse_ret_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
         if (vres != cres)
             return false;
 
@@ -900,8 +922,8 @@
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
     ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
 
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
+    for (int i = 0; i < 64 * 64; i++)
+        ref_dest[i] = opt_dest[i] = rand() % (PIXEL_MAX);
 
     int j = 0;
 
@@ -928,8 +950,8 @@
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
     ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
 
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
+    for (int i = 0; i < 64 * 64; i++)
+        ref_dest[i] = opt_dest[i] = rand() % (PIXEL_MAX);
 
     int j = 0;
 
@@ -956,8 +978,8 @@
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
     ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
 
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
+    for (int i = 0; i < 64 * 64; i++)
+        ref_dest[i] = opt_dest[i] = rand() % (PIXEL_MAX);
 
     for (int id = 0; id < 2; id++)
     {
@@ -992,8 +1014,8 @@
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
     ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
 
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
+    for (int i = 0; i < 64 * 64; i++)
+        ref_dest[i] = opt_dest[i] = rand() % (PIXEL_MAX);
 
     int j = 0;
 
@@ -1016,13 +1038,234 @@
     return true;
 }
 
+bool PixelHarness::check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt)
+{
+    enum { NUM_EDGETYPE = 33 }; // classIdx = 1 + (rec[x] >> 3);
+    int32_t stats_ref[NUM_EDGETYPE];
+    int32_t stats_vec[NUM_EDGETYPE];
+
+    int32_t count_ref[NUM_EDGETYPE];
+    int32_t count_vec[NUM_EDGETYPE];
+
+    int j = 0;
+    for (int i = 0; i < ITERS; i++)
+    {
+        // initialize input data to random values; the dynamic range is wrong but good enough to verify our asm code
+        for (int x = 0; x < NUM_EDGETYPE; x++)
+        {
+            stats_ref[x] = stats_vec[x] = rand();
+            count_ref[x] = count_vec[x] = rand();
+        }
+
+        intptr_t stride = 16 * (rand() % 4 + 1);
+        int endX = MAX_CU_SIZE - (rand() % 5);
+        int endY = MAX_CU_SIZE - (rand() % 4) - 1;
+
+        ref(pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
+        checked(opt, pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
+
+        if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
+bool PixelHarness::check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt)
+{
+    enum { NUM_EDGETYPE = 5 };
+    int32_t stats_ref[NUM_EDGETYPE];
+    int32_t stats_vec[NUM_EDGETYPE];
+
+    int32_t count_ref[NUM_EDGETYPE];
+    int32_t count_vec[NUM_EDGETYPE];
+
+    int j = 0;
+    for (int i = 0; i < ITERS; i++)
+    {
+        // initialize input data to random values; the dynamic range is wrong but good enough to verify our asm code
+        for (int x = 0; x < NUM_EDGETYPE; x++)
+        {
+            stats_ref[x] = stats_vec[x] = rand();
+            count_ref[x] = count_vec[x] = rand();
+        }
+
+        intptr_t stride = 16 * (rand() % 4 + 1);
+        int endX = MAX_CU_SIZE - (rand() % 5) - 1;
+        int endY = MAX_CU_SIZE - (rand() % 4) - 1;
+
+        ref(pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
+        checked(opt, pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
+
+        if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
+bool PixelHarness::check_saoCuStatsE1_t(saoCuStatsE1_t ref, saoCuStatsE1_t opt)
+{
+    enum { NUM_EDGETYPE = 5 };
+    int32_t stats_ref[NUM_EDGETYPE];
+    int32_t stats_vec[NUM_EDGETYPE];
+
+    int32_t count_ref[NUM_EDGETYPE];
+    int32_t count_vec[NUM_EDGETYPE];
+
+    int8_t _upBuff1_ref[MAX_CU_SIZE + 2], *upBuff1_ref = _upBuff1_ref + 1;
+    int8_t _upBuff1_vec[MAX_CU_SIZE + 2], *upBuff1_vec = _upBuff1_vec + 1;
+
+    int j = 0;
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        // initialize input data to random values; the dynamic range is wrong but good enough to verify our asm code
+        for (int x = 0; x < NUM_EDGETYPE; x++)
+        {
+            stats_ref[x] = stats_vec[x] = rand();
+            count_ref[x] = count_vec[x] = rand();
+        }
+
+        // initial sign
+        for (int x = 0; x < MAX_CU_SIZE + 2; x++)
+            _upBuff1_ref[x] = _upBuff1_vec[x] = (rand() % 3) - 1;
+
+        intptr_t stride = 16 * (rand() % 4 + 1);

x265_1.7.tar.gz/source/test/pixelharness.h -> x265_1.8.tar.gz/source/test/pixelharness.h Changed

@@ -66,7 +66,8 @@
     double   double_test_buff[TEST_CASES][BUFFSIZE];
 
     bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
-    bool check_pixelcmp_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
+    bool check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt);
+    bool check_pixel_sse_ss(pixel_sse_ss_t ref, pixel_sse_ss_t opt);
     bool check_pixelcmp_x3(pixelcmp_x3_t ref, pixelcmp_x3_t opt);
     bool check_pixelcmp_x4(pixelcmp_x4_t ref, pixelcmp_x4_t opt);
     bool check_copy_pp(copy_pp_t ref, copy_pp_t opt);
@@ -100,6 +101,11 @@
     bool check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
     bool check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
     bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
+    bool check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt);
+    bool check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt);
+    bool check_saoCuStatsE1_t(saoCuStatsE1_t ref, saoCuStatsE1_t opt);
+    bool check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt);
+    bool check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
     bool check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt);
@@ -108,6 +114,8 @@
     bool check_calSign(sign_t ref, sign_t opt);
     bool check_scanPosLast(scanPosLast_t ref, scanPosLast_t opt);
     bool check_findPosFirstLast(findPosFirstLast_t ref, findPosFirstLast_t opt);
+    bool check_costCoeffNxN(costCoeffNxN_t ref, costCoeffNxN_t opt);
+    bool check_costCoeffRemain(costCoeffRemain_t ref, costCoeffRemain_t opt);
 
 public:

x265_1.7.tar.gz/source/test/regression-tests.txt -> x265_1.8.tar.gz/source/test/regression-tests.txt Changed

@@ -12,50 +12,50 @@
 # not auto-detected.
 
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190
-BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 32
+BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
-BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16
+BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0
 BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp
 BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
 BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1
 Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh
-Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1
+Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
 Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slow --no-wpp --tune ssim --transfer smpte240m
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
-DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
 Keiba_832x480_30.y4m,--preset medium --pmode --tune grain
-Keiba_832x480_30.y4m,--preset slower --fast-intra --nr-inter 500 -F4
+Keiba_832x480_30.y4m,--preset slower --fast-intra --nr-inter 500 -F4 --limit-refs 0
 Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
 Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
 Kimono1_1920x1080_24_10bit_444.yuv,--preset superfast --weightb
 KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
-KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8
-KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16
+KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0
+KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
 KristenAndSara_1280x720_60.y4m,--preset ultrafast --strong-intra-smoothing
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
-News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 32
+News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
@@ -66,16 +66,16 @@
 RaceHorses_416x240_30.y4m,--preset medium --tskip-fast --tskip
 RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0
 RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
-RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip
-RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra
+RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3
+RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra --limit-refs 1
 RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
 RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain
-RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr
+RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr --limit-refs 1
 RaceHorses_416x240_30_10bit.yuv,--preset veryfast --weightb
-RaceHorses_416x240_30_10bit.yuv,--preset placebo
+RaceHorses_416x240_30_10bit.yuv,--preset placebo --limit-refs 1
 SteamLocomotiveTrain_2560x1600_60_10bit_crop.yuv,--preset medium --dither
 big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
-big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb
+big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
 big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra
 big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0
 big_buck_bunny_360p24.y4m,--preset ultrafast --deblock=2
@@ -83,20 +83,20 @@
 city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
 city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2
 city_4cif_60fps.y4m,--preset slower --scaling-list default
-city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra
+city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra --limit-refs 0
 ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
-ducks_take_off_420_720p50.y4m,--preset faster --qp 24 --deblock -6
+ducks_take_off_420_720p50.y4m,--preset faster --qp 24 --deblock -6 --limit-refs 2
 ducks_take_off_420_720p50.y4m,--preset medium --tskip --tskip-fast --constrained-intra
 ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
 ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
 ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
 ducks_take_off_444_720p50.y4m,--preset medium --qp 38 --no-scenecut
-ducks_take_off_444_720p50.y4m,--preset superfast --weightp --rd 0
-ducks_take_off_444_720p50.y4m,--preset slower --psy-rd 1 --psy-rdoq 2.0 --rdoq-level 1
+ducks_take_off_444_720p50.y4m,--preset superfast --weightp --rd 0 --limit-refs 2
+ducks_take_off_444_720p50.y4m,--preset slower --psy-rd 1 --psy-rdoq 2.0 --rdoq-level 1 --limit-refs 1
 mobile_calendar_422_ntsc.y4m,--preset medium --bitrate 500 -F4
 mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast
 mobile_calendar_422_ntsc.y4m,--preset superfast --weightp --rd 0
-mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip
+mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip --limit-refs 2
 old_town_cross_444_720p50.y4m,--preset faster --rd 1 --tune zero-latency
 old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6
 old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid
@@ -113,12 +113,19 @@
 vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
 vtc1nw_422_ntsc.y4m,--preset superfast --weightp --nr-intra 100 -F4
 washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
-washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4
-washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32
+washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1
+washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
 washdc_422_ntsc.y4m,--preset superfast --psy-rd 1 --tune zerolatency
 washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
 washdc_422_ntsc.y4m,--preset veryfast --tu-inter-depth 4
-washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless
+washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3
+BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000,--preset medium --no-cutree --analysis-mode=load --bitrate 13000,--preset medium --no-cutree --analysis-mode=load --bitrate 11000,--preset medium --no-cutree --analysis-mode=load --bitrate 9000,--preset medium --no-cutree --analysis-mode=load --bitrate 7000
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 15000,--preset slow --no-cutree --analysis-mode=load --bitrate 13000,--preset slow --no-cutree --analysis-mode=load --bitrate 11000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 7000
+old_town_cross_444_720p50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 15000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 13000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 11000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 9000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000 --early-skip
+Johnny_1280x720_60.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000 --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 13000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 11000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 9000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
+BasketballDrive_1920x1080_50.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+FourPeople_1280x720_60.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+FourPeople_1280x720_60.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 
 # interlace test, even though input YUV is not field separated
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff

x265_1.7.tar.gz/source/test/smoke-tests.txt -> x265_1.8.tar.gz/source/test/smoke-tests.txt Changed

@@ -6,14 +6,14 @@
 
 big_buck_bunny_360p24.y4m,--preset=superfast --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 400 --hrd --aud --repeat-headers
 big_buck_bunny_360p24.y4m,--preset=medium --bitrate 1000 -F4 --cu-lossless --scaling-list default
-big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --cu-stats --pme --qg-size 16
+big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --pme --qg-size 16
 washdc_422_ntsc.y4m,--preset=faster --no-strong-intra-smoothing --keyint 1 --qg-size 16
 washdc_422_ntsc.y4m,--preset=medium --qp 40 --nr-inter 400 -F4
 washdc_422_ntsc.y4m,--preset=veryslow --pmode --tskip --rdoq-level 0
 old_town_cross_444_720p50.y4m,--preset=ultrafast --weightp --keyint -1
 old_town_cross_444_720p50.y4m,--preset=fast --keyint 20 --min-cu-size 16
 old_town_cross_444_720p50.y4m,--preset=slow --sao-non-deblock --pmode --qg-size 32
-RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --cu-stats --max-tu-size 8
+RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --max-tu-size 8
 RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --constrained-intra --min-keyint 5 --keyint 10
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16

x265_1.7.tar.gz/source/test/testbench.cpp -> x265_1.8.tar.gz/source/test/testbench.cpp Changed

x265_1.7.tar.gz/source/test/testharness.h -> x265_1.8.tar.gz/source/test/testharness.h Changed

@@ -31,18 +31,13 @@
 #pragma warning(disable: 4324) // structure was padded due to __declspec(align())
 #endif
 
-#if HIGH_BIT_DEPTH
-#define BIT_DEPTH 10
-#else
-#define BIT_DEPTH 8
-#endif
-#define PIXEL_MAX ((1 << BIT_DEPTH) - 1)
+#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
 #define PIXEL_MIN 0
 #define SHORT_MAX  32767
 #define SHORT_MIN -32767
 #define UNSIGNED_SHORT_MAX 65535
 
-using namespace x265;
+using namespace X265_NS;
 
 extern const char* lumaPartStr[NUM_PU_SIZES];
 extern const char* const* chromaPartStr[X265_CSP_COUNT];
@@ -123,14 +118,14 @@
 
 extern "C" {
 #if X265_ARCH_X86
-int x265_stack_pagealign(int (*func)(), int align);
+int PFX(stack_pagealign)(int (*func)(), int align);
 
 /* detect when callee-saved regs aren't saved
  * needs an explicit asm check because it only sometimes crashes in normal use. */
-intptr_t x265_checkasm_call(intptr_t (*func)(), int *ok, ...);
-float x265_checkasm_call_float(float (*func)(), int *ok, ...);
+intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
+float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
 #else
-#define x265_stack_pagealign(func, align) func()
+#define PFX(stack_pagealign)(func, align) func()
 #endif
 
 #if X86_64
@@ -144,24 +139,24 @@
  * overwrite the junk written to the stack so there's no guarantee that it will always
  * detect all functions that assumes zero-extension.
  */
-void x265_checkasm_stack_clobber(uint64_t clobber, ...);
+void PFX(checkasm_stack_clobber)(uint64_t clobber, ...);
 #define checked(func, ...) ( \
         m_ok = 1, m_rand = (rand() & 0xffff) * 0x0001000100010001ULL, \
-        x265_checkasm_stack_clobber(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
+        PFX(checkasm_stack_clobber)(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
                                     m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
                                     m_rand, m_rand, m_rand, m_rand, m_rand), /* max_args+6 */ \
-        x265_checkasm_call((intptr_t(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
+        PFX(checkasm_call)((intptr_t(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
 
 #define checked_float(func, ...) ( \
         m_ok = 1, m_rand = (rand() & 0xffff) * 0x0001000100010001ULL, \
-        x265_checkasm_stack_clobber(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
+        PFX(checkasm_stack_clobber)(m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
                                     m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
                                     m_rand, m_rand, m_rand, m_rand, m_rand), /* max_args+6 */ \
-        x265_checkasm_call_float((float(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
+        PFX(checkasm_call_float)((float(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
 #define reportfail() if (!m_ok) { fflush(stdout); fprintf(stderr, "stack clobber check failed at %s:%d", __FILE__, __LINE__); abort(); }
 #elif ARCH_X86
-#define checked(func, ...) x265_checkasm_call((intptr_t(*)())func, &m_ok, __VA_ARGS__);
-#define checked_float(func, ...) x265_checkasm_call_float((float(*)())func, &m_ok, __VA_ARGS__);
+#define checked(func, ...) PFX(checkasm_call)((intptr_t(*)())func, &m_ok, __VA_ARGS__);
+#define checked_float(func, ...) PFX(checkasm_call_float)((float(*)())func, &m_ok, __VA_ARGS__);
 
 #else // if X86_64
 #define checked(func, ...) func(__VA_ARGS__)

x265_1.8.tar.gz/source/x265-extras.cpp Added

@@ -0,0 +1,341 @@
+/*****************************************************************************
+ * Copyright (C) 2015 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Selvakumar Nithiyaruban <selvakumar@multicorewareinc.com>
+ *          Divya Manivannan <divya@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "x265.h"
+#include "x265-extras.h"
+
+#include "common.h"
+
+using namespace X265_NS;
+
+static const char* summaryCSVHeader =
+    "Command, Date/Time, Elapsed Time, FPS, Bitrate, "
+    "Y PSNR, U PSNR, V PSNR, Global PSNR, SSIM, SSIM (dB), "
+    "I count, I ave-QP, I kbps, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
+    "P count, P ave-QP, P kbps, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), "
+    "B count, B ave-QP, B kbps, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
+    "Version\n";
+
+/* Open (or create) the CSV log file 'fname'.
+ *
+ * Returns NULL when the caller's x265_stats / x265_picture sizes disagree with
+ * the sizes reported by 'api', when 'fname' is NULL, or when the file cannot
+ * be opened. A file that can already be opened for reading is re-opened for
+ * append and no header is written. A new file receives the per-frame header
+ * when 'level' is non-zero (plus the per-depth CU columns when 'level' >= 2),
+ * or the summary header when 'level' is zero. */
+FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* fname, int level)
+{
+    if (sizeof(x265_stats) != api.sizeof_stats || sizeof(x265_picture) != api.sizeof_picture)
+    {
+        fprintf(stderr, "extras [error]: structure size skew, unable to create CSV logfile\n");
+        return NULL;
+    }
+
+    /* passing NULL to fopen() is undefined; treat it as "unable to open" */
+    if (!fname)
+        return NULL;
+
+    FILE *csvfp = fopen(fname, "r");
+    if (csvfp)
+    {
+        /* file already exists, re-open for append */
+        fclose(csvfp);
+        return fopen(fname, "ab");
+    }
+    else
+    {
+        /* new CSV file, write header */
+        csvfp = fopen(fname, "wb");
+        if (csvfp)
+        {
+            if (level)
+            {
+                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, ");
+                if (param.rc.rateControlMode == X265_RC_CRF)
+                    fprintf(csvfp, "RateFactor, ");
+                fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB),  List 0, List 1");
+                /* detailed performance statistics */
+                fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
+                if (level >= 2)
+                {
+                    /* one column group per CU depth; the block size halves at
+                     * each depth. 'size' is unsigned, so it is printed with %u */
+                    uint32_t size = param.maxCUSize;
+                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                    {
+                        fprintf(csvfp, ", Intra %ux%u DC, Intra %ux%u Planar, Intra %ux%u Ang", size, size, size, size, size, size);
+                        size /= 2;
+                    }
+                    fprintf(csvfp, ", 4x4");
+                    size = param.maxCUSize;
+                    if (param.bEnableRectInter)
+                    {
+                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                        {
+                            fprintf(csvfp, ", Inter %ux%u, Inter %ux%u (Rect)", size, size, size, size);
+                            if (param.bEnableAMP)
+                                fprintf(csvfp, ", Inter %ux%u (Amp)", size, size);
+                            size /= 2;
+                        }
+                    }
+                    else
+                    {
+                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                        {
+                            fprintf(csvfp, ", Inter %ux%u", size, size);
+                            size /= 2;
+                        }
+                    }
+                    size = param.maxCUSize;
+                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                    {
+                        fprintf(csvfp, ", Skip %ux%u", size, size);
+                        size /= 2;
+                    }
+                    size = param.maxCUSize;
+                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                    {
+                        fprintf(csvfp, ", Merge %ux%u", size, size);
+                        size /= 2;
+                    }
+                    fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level");
+                }
+                fprintf(csvfp, "\n");
+            }
+            else
+                fputs(summaryCSVHeader, csvfp);
+        }
+        return csvfp;
+    }
+}
+
+/* Per frame CSV logging.
+ *
+ * Appends one row built from pic.frameData to 'csvfp' and flushes it. Does
+ * nothing when 'csvfp' is NULL. The RateFactor column is written only for CRF
+ * rate control; PSNR and SSIM columns are written as "-" placeholders when the
+ * corresponding measurement is disabled in 'param'. The per-depth CU
+ * distribution columns are written only when 'level' >= 2, in the same order
+ * as the header emitted by x265_csvlog_open(). */
+void x265_csvlog_frame(FILE* csvfp, const x265_param& param, const x265_picture& pic, int level)
+{
+    if (!csvfp)
+        return;
+
+    const x265_frame_stats* frameStats = &pic.frameData;
+    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits);
+    if (param.rc.rateControlMode == X265_RC_CRF)
+        fprintf(csvfp, "%.3lf,", frameStats->rateFactor);
+    if (param.bEnablePsnr)
+        fprintf(csvfp, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
+    else
+        fputs(" -, -, -, -,", csvfp);
+    if (param.bEnableSsim)
+        fprintf(csvfp, " %.6f, %6.3f,", frameStats->ssim, x265_ssim2dB(frameStats->ssim));
+    else
+        fputs(" -, -,", csvfp);
+    if (frameStats->sliceType == 'I')
+        fputs(" -, -,", csvfp);
+    else
+    {
+        /* the reference lists are -1 terminated, but a completely full list has
+         * no terminator, so the scan is also bounded by the array length */
+        const int maxList0 = (int)(sizeof(frameStats->list0POC) / sizeof(frameStats->list0POC[0]));
+        const int maxList1 = (int)(sizeof(frameStats->list1POC) / sizeof(frameStats->list1POC[0]));
+        int i = 0;
+        while (i < maxList0 && frameStats->list0POC[i] != -1)
+            fprintf(csvfp, "%d ", frameStats->list0POC[i++]);
+        fprintf(csvfp, ",");
+        if (frameStats->sliceType != 'P')
+        {
+            i = 0;
+            while (i < maxList1 && frameStats->list1POC[i] != -1)
+                fprintf(csvfp, "%d ", frameStats->list1POC[i++]);
+            fprintf(csvfp, ",");
+        }
+        else
+            fputs(" -,", csvfp);
+    }
+    fprintf(csvfp, " %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
+    fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
+    if (level >= 2)
+    {
+        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+            fprintf(csvfp, ", %5.2lf%%, %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
+        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentIntraNxN);
+        if (param.bEnableRectInter)
+        {
+            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+            {
+                fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
+                if (param.bEnableAMP)
+                    fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
+            }
+        }
+        else
+        {
+            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
+        }
+        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
+        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
+        fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel);
+    }
+    fprintf(csvfp, "\n");
+    /* flush the stream that was just written, so the row reaches the file even
+     * if the encode is interrupted */
+    fflush(csvfp);
+}
+
+void x265_csvlog_encode(FILE* csvfp, const x265_api& api, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv)
+{
+    if (!csvfp)
+        return;
+
+    if (level)
+    {
+        // adding summary to a per-frame csv log file, so it needs a summary header
+        fprintf(csvfp, "\nSummary\n");
+        fputs(summaryCSVHeader, csvfp);
+    }
+

x265_1.8.tar.gz/source/x265-extras.h Added

@@ -0,0 +1,66 @@
+/*****************************************************************************
+ * Copyright (C) 2015 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_EXTRAS_H
+#define X265_EXTRAS_H 1
+
+#include "x265.h"
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if _WIN32
+#define LIBAPI __declspec(dllexport)
+#else
+#define LIBAPI
+#endif
+
+/* Open a CSV log file. On success it returns a file handle which must be passed
+ * to x265_csvlog_frame() and/or x265_csvlog_encode(). The file handle must be
+ * closed by the caller using fclose(). If level is 0, then no frame logging
+ * header is written to the file. This function will return NULL if it is unable
+ * to open the file for write or if it detects a structure size skew */
+LIBAPI FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* fname, int level);
+
+/* Log frame statistics to the CSV file handle. level should have been non-zero
+ * in the call to x265_csvlog_open() if this function is called. */
+LIBAPI void x265_csvlog_frame(FILE* csvfp, const x265_param& param, const x265_picture& pic, int level);
+
+/* Log final encode statistics to the CSV file handle. 'argc' and 'argv' are
+ * intended to be command line arguments passed to the encoder. Encode
+ * statistics should be queried from the encoder just prior to closing it. */
+LIBAPI void x265_csvlog_encode(FILE* csvfp, const x265_api& api, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv);
+
+/* In-place downshift from a bit-depth greater than 8 to a bit-depth of 8, using
+ * the residual bits to dither each row. */
+LIBAPI void x265_dither_image(const x265_api& api, x265_picture&, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

x265_1.7.tar.gz/source/x265.cpp -> x265_1.8.tar.gz/source/x265.cpp Changed

@@ -25,15 +25,17 @@
 #pragma warning(disable: 4127) // conditional expression is constant, yes I know
 #endif
 
+#include "x265.h"
+#include "x265-extras.h"
+#include "x265cli.h"
+
+#include "common.h"
 #include "input/input.h"
 #include "output/output.h"
 #include "output/reconplay.h"
-#include "filters/filters.h"
-#include "common.h"
+
 #include "param.h"
 #include "cpu.h"
-#include "x265.h"
-#include "x265cli.h"
 
 #if HAVE_VLD
 /* Visual Leak Detector */
@@ -59,7 +61,7 @@
 #define SetThreadExecutionState(es)
 #endif
 
-using namespace x265;
+using namespace X265_NS;
 
 /* Ctrl-C handler */
 static volatile sig_atomic_t b_ctrl_c /* = 0 */;
@@ -74,12 +76,15 @@
     ReconFile* recon;
     OutputFile* output;
     FILE*       qpfile;
+    FILE*       csvfpt;
+    const char* csvfn;
     const char* reconPlayCmd;
     const x265_api* api;
     x265_param* param;
     bool bProgress;
     bool bForceY4m;
     bool bDither;
+    int csvLogLevel;
     uint32_t seek;              // number of frames to skip from the beginning
     uint32_t framesToBeEncoded; // number of frames to encode
     uint64_t totalbytes;
@@ -95,6 +100,8 @@
         recon = NULL;
         output = NULL;
         qpfile = NULL;
+        csvfpt = NULL;
+        csvfn = NULL;
         reconPlayCmd = NULL;
         api = NULL;
         param = NULL;
@@ -105,6 +112,7 @@
         startTime = x265_mdate();
         prevUpdateTime = 0;
         bDither = false;
+        csvLogLevel = 0;
     }
 
     void destroy();
@@ -124,6 +132,9 @@
     if (qpfile)
         fclose(qpfile);
     qpfile = NULL;
+    if (csvfpt)
+        fclose(csvfpt);
+    csvfpt = NULL;
     if (output)
         output->release();
     output = NULL;
@@ -158,8 +169,8 @@
 
 bool CLIOptions::parse(int argc, char **argv)
 {
-    bool bError = 0;
-    int help = 0;
+    bool bError = false;
+    int bShowHelp = false;
     int inputBitDepth = 8;
     int outputBitDepth = 0;
     int reconFileBitDepth = 0;
@@ -188,8 +199,21 @@
             tune = optarg;
         else if (c == 'D')
             outputBitDepth = atoi(optarg);
+        else if (c == 'P')
+            profile = optarg;
         else if (c == '?')
-            showHelp(param);
+            bShowHelp = true;
+    }
+
+    if (!outputBitDepth && profile)
+    {
+        /* try to derive the output bit depth from the requested profile */
+        if (strstr(profile, "10"))
+            outputBitDepth = 10;
+        else if (strstr(profile, "12"))
+            outputBitDepth = 12;
+        else
+            outputBitDepth = 8;
     }
 
     api = x265_api_get(outputBitDepth);
@@ -212,6 +236,12 @@
         return true;
     }
 
+    if (bShowHelp)
+    {
+        printVersion(param, api);
+        showHelp(param);
+    }
+
     for (optind = 0;; )
     {
         int long_options_index = -1;
@@ -222,12 +252,13 @@
         switch (c)
         {
         case 'h':
+            printVersion(param, api);
             showHelp(param);
             break;
 
         case 'V':
-            printVersion(param);
-            x265_setup_primitives(param, -1);
+            printVersion(param, api);
+            x265_report_simd(param);
             exit(0);
 
         default:
@@ -264,6 +295,8 @@
             if (0) ;
             OPT2("frame-skip", "seek") this->seek = (uint32_t)x265_atoi(optarg, bError);
             OPT("frames") this->framesToBeEncoded = (uint32_t)x265_atoi(optarg, bError);
+            OPT("csv") this->csvfn = optarg;
+            OPT("csv-log-level") this->csvLogLevel = x265_atoi(optarg, bError);
             OPT("no-progress") this->bProgress = false;
             OPT("output") outputfn = optarg;
             OPT("input") inputfn = optarg;
@@ -272,9 +305,9 @@
             OPT("dither") this->bDither = true;
             OPT("recon-depth") reconFileBitDepth = (uint32_t)x265_atoi(optarg, bError);
             OPT("y4m") this->bForceY4m = true;
-            OPT("profile") profile = optarg; /* handled last */
-            OPT("preset") /* handled above */;
-            OPT("tune")   /* handled above */;
+            OPT("profile") /* handled above */;
+            OPT("preset")  /* handled above */;
+            OPT("tune")    /* handled above */;
             OPT("output-depth")   /* handled above */;
             OPT("recon-y4m-exec") reconPlayCmd = optarg;
             OPT("qpfile")
@@ -309,18 +342,22 @@
         return true;
     }
 
-    if (argc <= 1 || help)
+    if (argc <= 1)
+    {
+        api->param_default(param);
+        printVersion(param, api);
         showHelp(param);
+    }
 
-    if (inputfn == NULL || outputfn == NULL)
+    if (!inputfn || !outputfn)
     {
-        x265_log(param, X265_LOG_ERROR, "input or output file not specified, try -V for help\n");
+        x265_log(param, X265_LOG_ERROR, "input or output file not specified, try --help for help\n");
         return true;
     }
 
-    if (param->internalBitDepth != api->max_bit_depth)
+    if (param->internalBitDepth != api->bit_depth)
     {
-        x265_log(param, X265_LOG_ERROR, "Only bit depths of %d are supported in this build\n", api->max_bit_depth);
+        x265_log(param, X265_LOG_ERROR, "Only bit depths of %d are supported in this build\n", api->bit_depth);
         return true;
     }
 
@@ -465,7 +502,8 @@
  * 1 - unable to parse command line
  * 2 - unable to open encoder
  * 3 - unable to generate stream headers
- * 4 - encoder abort */
+ * 4 - encoder abort
+ * 5 - unable to open csv file */
 
 int main(int argc, char **argv)
 {
@@ -516,6 +554,19 @@
     /* get the encoder parameters post-initialization */
     api->encoder_parameters(encoder, param);

x265_1.7.tar.gz/source/x265.def.in -> x265_1.8.tar.gz/source/x265.def.in Changed

x265_1.7.tar.gz/source/x265.h -> x265_1.8.tar.gz/source/x265.h Changed

@@ -100,6 +100,50 @@
     uint32_t         numPartitions;
 } x265_analysis_data;
 
+/* cu statistics */
+typedef struct x265_cu_stats
+{
+    double      percentSkipCu[4];                // Percentage of skip cu in all depths
+    double      percentMergeCu[4];               // Percentage of merge cu in all depths
+    double      percentIntraDistribution[4][3];  // Percentage of DC, Planar, Angular intra modes in all depths
+    double      percentInterDistribution[4][3];  // Percentage of 2Nx2N inter, rect and amp in all depths
+    double      percentIntraNxN;                 // Percentage of 4x4 cu
+
+    /* All the above values will add up to 100%. */
+} x265_cu_stats;
+
+/* Frame level statistics. Carried in x265_picture::frameData; the column names
+ * quoted below are the ones x265_csvlog_open() writes for per-frame logs. */
+typedef struct x265_frame_stats
+{
+    double           qp;                   // "QP" column
+    double           rateFactor;           // "RateFactor" column, logged only for X265_RC_CRF
+    double           psnrY;                // "Y PSNR", logged only when bEnablePsnr is set
+    double           psnrU;                // "U PSNR", logged only when bEnablePsnr is set
+    double           psnrV;                // "V PSNR", logged only when bEnablePsnr is set
+    double           psnr;                 // "YUV PSNR", logged only when bEnablePsnr is set
+    double           ssim;                 // "SSIM", logged only when bEnableSsim is set
+    double           decideWaitTime;       // "DecideWait (ms)"
+    double           row0WaitTime;         // "Row0Wait (ms)"
+    double           wallTime;             // "Wall time (ms)"
+    double           refWaitWallTime;      // "Ref Wait Wall (ms)"
+    double           totalCTUTime;         // "Total CTU time (ms)"
+    double           stallTime;            // "Stall Time (ms)"
+    double           avgWPP;               // "Avg WPP"
+    double           avgLumaDistortion;    // "Avg Luma Distortion"
+    double           avgChromaDistortion;  // "Avg Chroma Distortion"
+    double           avgPsyEnergy;         // "Avg psyEnergy"
+    double           avgLumaLevel;         // "Avg Luma Level"
+    uint64_t         bits;                 // "Bits" column (cast to int when logged)
+    int              encoderOrder;         // "Encode Order" column
+    int              poc;                  // "POC" column
+    int              countRowBlocks;       // "Row Blocks" column
+    int              list0POC[16];         // "List 0" POCs; readers stop at the first -1 entry
+    int              list1POC[16];         // "List 1" POCs; readers stop at the first -1 entry
+    uint16_t         maxLumaLevel;         // "Max Luma Level"
+    char             sliceType;            // slice type letter; 'I' and 'P' are tested by the CSV logger
+    x265_cu_stats    cuStats;              // per-depth CU mode percentages
+} x265_frame_stats;
+
 /* Used to pass pictures into the encoder, and to get picture data back out of
  * the encoder.  The input and output semantics are different */
 typedef struct x265_picture
@@ -161,6 +205,9 @@
      * this data structure */
     x265_analysis_data analysisData;
 
+    /* Frame level statistics */
+    x265_frame_stats frameData;
+
 } x265_picture;
 
 typedef enum
@@ -221,9 +268,8 @@
 #define X265_LOG_ERROR          0
 #define X265_LOG_WARNING        1
 #define X265_LOG_INFO           2
-#define X265_LOG_FRAME          3
-#define X265_LOG_DEBUG          4
-#define X265_LOG_FULL           5
+#define X265_LOG_DEBUG          3
+#define X265_LOG_FULL           4
 
 #define X265_B_ADAPT_NONE       0
 #define X265_B_ADAPT_FAST       1
@@ -249,6 +295,7 @@
 #define X265_AQ_NONE                 0
 #define X265_AQ_VARIANCE             1
 #define X265_AQ_AUTO_VARIANCE        2
+#define X265_AQ_AUTO_VARIANCE_BIASED 3
 
 /* NOTE! For this release only X265_CSP_I420 and X265_CSP_I444 are supported */
 
@@ -302,20 +349,35 @@
     X265_RC_CRF
 } X265_RC_METHODS;
 
+/* slice type statistics */
+typedef struct x265_sliceType_stats
+{
+    double        avgQp;
+    double        bitrate;
+    double        psnrY;
+    double        psnrU;
+    double        psnrV;
+    double        ssim;
+    uint32_t      numPics;
+} x265_sliceType_stats;
+
 /* Output statistics from encoder */
 typedef struct x265_stats
 {
-    double    globalPsnrY;
-    double    globalPsnrU;
-    double    globalPsnrV;
-    double    globalPsnr;
-    double    globalSsim;
-    double    elapsedEncodeTime;    /* wall time since encoder was opened */
-    double    elapsedVideoTime;     /* encoded picture count / frame rate */
-    double    bitrate;              /* accBits / elapsed video time */
-    uint64_t  accBits;              /* total bits output thus far */
-    uint32_t  encodedPictureCount;  /* number of output pictures thus far */
-    uint32_t  totalWPFrames;        /* number of uni-directional weighted frames used */
+    double                globalPsnrY;
+    double                globalPsnrU;
+    double                globalPsnrV;
+    double                globalPsnr;
+    double                globalSsim;
+    double                elapsedEncodeTime;    /* wall time since encoder was opened */
+    double                elapsedVideoTime;     /* encoded picture count / frame rate */
+    double                bitrate;              /* accBits / elapsed video time */
+    uint64_t              accBits;              /* total bits output thus far */
+    uint32_t              encodedPictureCount;  /* number of output pictures thus far */
+    uint32_t              totalWPFrames;        /* number of uni-directional weighted frames used */
+    x265_sliceType_stats  statsI;               /* statistics of I slice */
+    x265_sliceType_stats  statsP;               /* statistics of P slice */
+    x265_sliceType_stats  statsB;               /* statistics of B slice */
 } x265_stats;
 
 /* String values accepted by x265_param_parse() (and CLI) for various parameters */
@@ -326,7 +388,7 @@
 static const char * const x265_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", 0 };
 static const char * const x265_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100",
                                                     "log316", "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12",
-                                                    "smpte-st-2084", "smpte-st-428", 0 };
+                                                    "smpte-st-2084", "smpte-st-428", "arib-std-b67", 0 };
 static const char * const x265_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m",
                                                      "YCgCo", "bt2020nc", "bt2020c", 0 };
 static const char * const x265_sar_names[] = { "undef", "1:1", "12:11", "10:11", "16:11", "40:33", "24:11", "20:11",
@@ -439,8 +501,7 @@
 
     /*== Logging Features ==*/
 
-    /* Enable analysis and logging distribution of CUs encoded across various
-     * modes during mode decision. Default disabled */
+    /* Enable analysis and logging distribution of CUs. Now deprecated */
     int       bLogCuStats;
 
     /* Enable the measurement and reporting of PSNR. Default is enabled */
@@ -453,11 +514,7 @@
      * X265_LOG_FULL, default is X265_LOG_INFO */
     int       logLevel;
 
-    /* filename of CSV log. If logLevel greater than or equal to X265_LOG_FRAME,
-     * the encoder will emit per-slice statistics to this log file in encode
-     * order. Otherwise the encoder will emit per-stream statistics into the log
-     * file when x265_encoder_log is called (presumably at the end of the
-     * encode) */
+    /* Filename of CSV log. Now deprecated */
     const char* csvfn;
 
     /*== Internal Picture Specification ==*/
@@ -1143,11 +1200,31 @@
 #define X265_PARAM_BAD_VALUE (-2)
 int x265_param_parse(x265_param *p, const char *name, const char *value);
 
-/* x265_param_apply_profile:
- *      Applies the restrictions of the given profile. (one of below) */
-static const char * const x265_profile_names[] = { "main", "main10", "mainstillpicture", 0 };
+static const char * const x265_profile_names[] = {
+    /* HEVC v1 */
+    "main", "main10", "mainstillpicture", /* alias */ "msp",
+
+    /* HEVC v2 (Range Extensions) */
+    "main-intra", "main10-intra",
+    "main444-8",  "main444-intra", "main444-stillpicture",
 
-/*      (can be NULL, in which case the function will do nothing)
+    "main422-10", "main422-10-intra",
+    "main444-10", "main444-10-intra",
+
+    "main12",     "main12-intra",                  /* Highly Experimental */
+    "main422-12", "main422-12-intra",
+    "main444-12", "main444-12-intra",
+
+    "main444-16-intra", "main444-16-stillpicture", /* Not Supported! */
+    0
+};
+
+/* x265_param_apply_profile:
+ *      Applies the restrictions of the given profile. (one of x265_profile_names)
+ *      (can be NULL, in which case the function will do nothing)
+ *      Note: the detected profile can be lower than the one specified to this
+ *      function. This function will force the encoder parameters to fit within
+ *      the specified profile, or fail if that is impossible.
  *      returns 0 on success, negative on failure (e.g. invalid profile name). */
 int x265_param_apply_profile(x265_param *, const char *profile);
 
@@ -1263,9 +1340,7 @@
 void x265_encoder_get_stats(x265_encoder *encoder, x265_stats *, uint32_t statsSizeBytes);
 
 /* x265_encoder_log:

x265_1.7.tar.gz/source/x265cli.h -> x265_1.8.tar.gz/source/x265cli.h Changed

@@ -24,10 +24,13 @@
 #ifndef X265CLI_H
 #define X265CLI_H 1
 
+#include "common.h"
+#include "param.h"
+
 #include <getopt.h>
 
 #ifdef __cplusplus
-namespace x265 {
+namespace X265_NS {
 #endif
 
 static const char short_options[] = "o:D:P:p:f:F:r:I:i:b:s:t:q:m:hwV?";
@@ -54,6 +57,7 @@
     { "allow-non-conformance",no_argument, NULL, 0 },
     { "no-allow-non-conformance",no_argument, NULL, 0 },
     { "csv",            required_argument, NULL, 0 },
+    { "csv-log-level",  required_argument, NULL, 0 },
     { "no-cu-stats",          no_argument, NULL, 0 },
     { "cu-stats",             no_argument, NULL, 0 },
     { "y4m",                  no_argument, NULL, 0 },
@@ -121,6 +125,7 @@
     { "no-b-pyramid",         no_argument, NULL, 0 },
     { "b-pyramid",            no_argument, NULL, 0 },
     { "ref",            required_argument, NULL, 0 },
+    { "limit-refs",     required_argument, NULL, 0 },
     { "no-weightp",           no_argument, NULL, 0 },
     { "weightp",              no_argument, NULL, 'w' },
     { "no-weightb",           no_argument, NULL, 0 },
@@ -183,7 +188,8 @@
     { "transfer",       required_argument, NULL, 0 },
     { "colormatrix",    required_argument, NULL, 0 },
     { "chromaloc",      required_argument, NULL, 0 },
-    { "crop-rect",      required_argument, NULL, 0 },
+    { "display-window", required_argument, NULL, 0 },
+    { "crop-rect",      required_argument, NULL, 0 }, /* DEPRECATED */
     { "master-display", required_argument, NULL, 0 },
     { "max-cll",        required_argument, NULL, 0 },
     { "no-dither",            no_argument, NULL, 0 },
@@ -219,17 +225,15 @@
     { 0, 0, 0, 0 }
 };
 
-static void printVersion(x265_param *param)
+static void printVersion(x265_param *param, const x265_api* api)
 {
-    x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", x265_version_str);
-    x265_log(param, X265_LOG_INFO, "build info %s\n", x265_build_info_str);
+    x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", api->version_str);
+    x265_log(param, X265_LOG_INFO, "build info %s\n", api->build_info_str);
 }
 
 static void showHelp(x265_param *param)
 {
     int level = param->logLevel;
-    x265_param_default(param);
-    printVersion(param);
 
 #define OPT(value) (value ? "enabled" : "disabled")
 #define H0 printf
@@ -243,11 +247,11 @@
     H0("-V/--version                     Show version info and exit\n");
     H0("\nOutput Options:\n");
     H0("-o/--output <filename>           Bitstream output file name\n");
-    H0("-D/--output-depth 8|10           Output bit depth (also internal bit depth). Default %d\n", param->internalBitDepth);
-    H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", x265::logLevelNames[param->logLevel + 1]);
+    H0("-D/--output-depth 8|10|12        Output bit depth (also internal bit depth). Default %d\n", param->internalBitDepth);
+    H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
     H0("   --no-progress                 Disable CLI progress reports\n");
-    H0("   --[no-]cu-stats               Enable logging stats about distribution of cu across all modes. Default %s\n",OPT(param->bLogCuStats));
-    H1("   --csv <filename>              Comma separated log file, log level >= 3 frame log, else one line per run\n");
+    H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
+    H0("   --csv-log-level               Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
     H0("\nInput Options:\n");
     H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
     H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
@@ -302,10 +306,12 @@
     H0("   --[no-]signhide               Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
     H1("   --[no-]tskip                  Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
     H0("\nTemporal / motion search options:\n");
+    H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
+    H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
+    H0("   --limit-refs <0|1|2|3>        limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
     H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
     H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
     H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
-    H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
     H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
     H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
     H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
@@ -327,13 +333,6 @@
     H1("   --bframe-bias <integer>       Bias towards B frame decisions. Default %d\n", param->bFrameBias);
     H0("   --b-adapt <0..2>              0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
     H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
-    H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
-    H1("   --zones <zone0>/<zone1>/...   Tweak the bitrate of regions of the video\n");
-    H1("                                 Each zone is of the form\n");
-    H1("                                   <start frame>,<end frame>,<option>\n");
-    H1("                                   where <option> is either\n");
-    H1("                                       q=<integer> (force QP)\n");
-    H1("                                   or  b=<float> (bitrate multiplier)\n");
     H1("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
     H1("                                 Format of each line: framenumber frametype QP\n");
     H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
@@ -359,7 +358,7 @@
     H0("   --[no-]strict-cbr             Enable stricter conditions and tolerance for bitrate deviations in CBR mode. Default %s\n", OPT(param->rc.bStrictCbr));
     H0("   --analysis-mode <string|int>  save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisMode);
     H0("   --analysis-file <filename>    Specify file name used for either dumping or reading analysis data.\n");
-    H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance. Default %d\n", param->rc.aqMode);
+    H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes. Default %d\n", param->rc.aqMode);
     H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
     H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16). Default %d\n", param->rc.qgSize);
     H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
@@ -370,6 +369,12 @@
     H1("   --cbqpoffs <integer>          Chroma Cb QP Offset [-12..12]. Default %d\n", param->cbQpOffset);
     H1("   --crqpoffs <integer>          Chroma Cr QP Offset [-12..12]. Default %d\n", param->crQpOffset);
     H1("   --scaling-list <string>       Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n");
+    H1("   --zones <zone0>/<zone1>/...   Tweak the bitrate of regions of the video\n");
+    H1("                                 Each zone is of the form\n");
+    H1("                                   <start frame>,<end frame>,<option>\n");
+    H1("                                   where <option> is either\n");
+    H1("                                       q=<integer> (force QP)\n");
+    H1("                                   or  b=<float> (bitrate multiplier)\n");
     H1("   --lambda-file <string>        Specify a file containing replacement values for the lambda tables\n");
     H1("                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
     H1("                                 Blank lines and lines starting with hash(#) are ignored\n");
@@ -383,7 +388,7 @@
     H0("                                 Choose from 0=undef, 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n");
     H0("                                 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,\n");
     H0("                                 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of <int:int>. Default %d\n", param->vui.aspectRatioIdc);
-    H1("   --crop-rect <string>          Add 'left,top,right,bottom' to the bitstream-level cropping rectangle\n");
+    H1("   --display-window <string>     Describe overscan cropping region as 'left,top,right,bottom' in pixels\n");
     H1("   --overscan <string>           Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n");
     H0("   --videoformat <string>        Specify video format from undef, component, pal, ntsc, secam, mac. Default undef\n");
     H0("   --range <string>              Specify black level and range of luma and chroma signals as full or limited Default limited\n");
@@ -391,7 +396,7 @@
     H0("                                 smpte240m, film, bt2020. Default undef\n");
     H0("   --transfer <string>           Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,\n");
     H0("                                 smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,\n");
-    H0("                                 bt2020-10, bt2020-12. Default undef\n");
+    H0("                                 bt2020-10, bt2020-12, smpte-st-2084, smpte-st-428, arib-std-b67. Default undef\n");
     H1("   --colormatrix <string>        Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n");
     H1("                                 smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n");
     H1("   --chromaloc <integer>         Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField);