Packman Build Service PMBS

We truncated the diff of some files because they were too big. If you want to see the full diff for every file, click here.

Changes of Revision 13

x265.changes Changed

@@ -1,4 +1,40 @@
 -------------------------------------------------------------------
+Sun Aug 28 11:51:23 UTC 2016 - joerg.lorenzen@ki.tng.de
+
+- Update to version 2.0
+  API and Key Behavior Changes
+  * x265_rc_stats added to x265_picture, containing all RC decision
+    points for that frame.
+  * PTL: high tier is now allowed by default, chosen only if
+    necessary.
+  * multi-pass: First pass now uses slow-firstpass by default,
+    enabling better RC decisions in future passes.
+  * pools: fix behaviour on multi-socketed Windows systems, provide
+    more flexibility in determining thread and pool counts.
+  * ABR: improve bits allocation in the first few frames, abr reset,
+    vbv and cutree improved.
+  New Features
+  * uhd-bd: Enforce Ultra-HD Blu-ray Disc parameters
+    (overrides any other settings).
+  * rskip: Enables skipping recursion to analyze lower CU sizes
+    using heuristics at different rd-levels. Provides good visual
+    quality gains at the highest quality presets.
+  * rc-grain: Enables a new rate control mode specifically for
+    grainy content. Strictly prevents QP oscillations within and
+    between frames to avoid grain fluctuations.
+  * tune grain: A fully refactored and improved option to encode
+    film grain content including QP control as well as analysis
+    options.
+  * asm: ARM assembly is now enabled by default, native or cross
+    compiled builds supported on armv6 and later systems.
+  Misc
+  * An SSIM calculation bug was corrected
+- soname bump to 87.
+- Fixed arm.patch.
+- Added libnuma-devel as buildrequires for arch x86_64 (except
+  for openSUSE 13.1 because libnuma-devel >= 2.0.9 is required).
+
+-------------------------------------------------------------------
 Wed Feb  3 13:22:42 UTC 2016 - idonmez@suse.com
 
 - Update to version 1.9

x265.spec Changed

arm.patch Changed

@@ -1,19 +1,25 @@
-Index: x265_11047/source/CMakeLists.txt
+Index: x265_2.0/source/CMakeLists.txt
 ===================================================================
---- x265_11047.orig/source/CMakeLists.txt
-+++ x265_11047/source/CMakeLists.txt
-@@ -56,10 +56,22 @@ elseif(POWERMATCH GREATER "-1")
+--- x265_2.0.orig/source/CMakeLists.txt
++++ x265_2.0/source/CMakeLists.txt
+@@ -60,15 +60,22 @@
      message(STATUS "Detected POWER target processor")
      set(POWER 1)
      add_definitions(-DX265_ARCH_POWER=1)
+-elseif(ARMMATCH GREATER "-1")
+-    if(CROSS_COMPILE_ARM)
+-        message(STATUS "Cross compiling for ARM arch")
+-    else()
+-        set(CROSS_COMPILE_ARM 0)
+-    endif()
+-    message(STATUS "Detected ARM target processor")
+-    set(ARM 1)
+-    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
 +elseif(${SYSPROC} MATCHES "armv5.*")
 +    message(STATUS "Detected ARMV5 system processor")
 +    set(ARMV5 1)
 +    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
- elseif(${SYSPROC} STREQUAL "armv6l")
--    message(STATUS "Detected ARM target processor")
--    set(ARM 1)
--    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
++elseif(${SYSPROC} STREQUAL "armv6l")
 +    message(STATUS "Detected ARMV6 system processor")
 +    set(ARMV6 1)
 +    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
@@ -28,21 +34,32 @@
  else()
      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
-@@ -169,8 +181,8 @@ if(GCC)
-     elseif(X86 AND NOT X64)
-         add_definitions(-march=i686)
+@@ -186,18 +193,9 @@
+             add_definitions(-march=i686)
+         endif()
      endif()
--    if(ARM)
--        add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
+-    if(ARM AND CROSS_COMPILE_ARM)
+-        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+-    elseif(ARM)
+-        find_package(Neon)
+-        if(CPU_HAS_NEON)
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+-            add_definitions(-DHAVE_NEON)
+-        else()
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+-        endif()
+-    endif()
+-    add_definitions(${ARM_ARGS})
 +    if(ARMV7)
 +        add_definitions(-fPIC)
-     endif()
++    endif()
      if(FPROFILE_GENERATE)
          if(INTEL_CXX)
-Index: x265_11047/source/common/cpu.cpp
+             add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
+Index: x265_2.0/source/common/cpu.cpp
 ===================================================================
---- x265_11047.orig/source/common/cpu.cpp
-+++ x265_11047/source/common/cpu.cpp
+--- x265_2.0.orig/source/common/cpu.cpp
++++ x265_2.0/source/common/cpu.cpp
 @@ -37,7 +37,7 @@
  #include <machine/cpu.h>
  #endif
@@ -52,3 +69,20 @@
  #include <signal.h>
  #include <setjmp.h>
  static sigjmp_buf jmpbuf;
+@@ -340,7 +340,6 @@
+     }
+ 
+     canjump = 1;
+-    PFX(cpu_neon_test)();
+     canjump = 0;
+     signal(SIGILL, oldsig);
+ #endif // if !HAVE_NEON
+@@ -356,7 +355,7 @@
+     // which may result in incorrect detection and the counters stuck enabled.
+     // right now Apple does not seem to support performance counters for this test
+ #ifndef __MACH__
+-    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
++    //flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
+ #endif
+     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
+ #endif // if HAVE_ARMV6

x265_1.9.tar.gz/.hg_archival.txt -> x265_2.0.tar.gz/.hg_archival.txt Changed

x265_1.9.tar.gz/.hgtags -> x265_2.0.tar.gz/.hgtags Changed

x265_2.0.tar.gz/build/arm-linux/crosscompile.cmake Added

x265_2.0.tar.gz/build/arm-linux/make-Makefiles.bash Added

x265_1.9.tar.gz/doc/reST/api.rst -> x265_2.0.tar.gz/doc/reST/api.rst Changed

x265_1.9.tar.gz/doc/reST/cli.rst -> x265_2.0.tar.gz/doc/reST/cli.rst Changed

@@ -376,10 +376,10 @@
 
 .. option:: --dither
 
-	Enable high quality downscaling. Dithering is based on the diffusion
-	of errors from one row of pixels to the next row of pixels in a
-	picture. Only applicable when the input bit depth is larger than
-	8bits and internal bit depth is 8bits. Default disabled
+	Enable high quality downscaling to the encoder's internal bitdepth. 
+	Dithering is based on the diffusion of errors from one row of pixels 
+	to the next row of pixels in a picture. Only applicable when the 
+	input bit depth is larger than 8bits. Default disabled
 
 	**CLI ONLY**
 
@@ -522,16 +522,14 @@
 
 .. option:: --high-tier, --no-high-tier
 
-	If :option:`--level-idc` has been specified, the option adds the
-	intention to support the High tier of that level. If your specified
-	level does not support a High tier, a warning is issued and this
-	modifier flag is ignored. If :option:`--level-idc` has been specified,
-	but not --high-tier, then the encoder will attempt to encode at the 
-	specified level, main tier first, turning on high tier only if 
-	necessary and available at that level.
+	If :option:`--level-idc` has been specified, --high-tier allows the
+	support of high tier at that level. The encoder will first attempt to encode 
+	at the specified level, main tier first, turning on high tier only if 
+	necessary and available at that level. If your requested level does not 
+	support a High tier, high tier will not be supported. If --no-high-tier 
+	has been specified, then the encoder will attempt to encode only at the main tier.
 
-	If :option:`--level-idc` has not been specified, this argument is
-	ignored.
+	Default: enabled
 
 .. option:: --ref <1..16>
 
@@ -564,6 +562,15 @@
 
 	Default: disabled
 
+.. option:: --uhd-bd
+
+    Enable Ultra HD Blu-ray format support. If specified with incompatible
+    encoding options, the encoder will attempt to modify/set the right 
+    encode specifications. If the encoder is unable to do so, this option
+    will be turned OFF. Highly experimental.
+	
+    Default: disabled
+	
 .. note::
 
 	:option:`--profile`, :option:`--level-idc`, and
@@ -600,7 +607,7 @@
 Mode decision / Analysis
 ========================
 
-.. option:: --rd <0..6>
+.. option:: --rd <1..6>
 
 	Level of RDO in mode decision. The higher the value, the more
 	exhaustive the analysis and the more rate distortion optimization is
@@ -629,7 +636,7 @@
 	| 6     | Currently same as 5                                           |
 	+-------+---------------------------------------------------------------+
 
-	**Range of values:** 0: least .. 6: full RDO analysis
+	**Range of values:** 1: least .. 6: full RDO analysis
 
 Options which affect the coding unit quad-tree, sometimes referred to as
 the prediction quad-tree.
@@ -722,8 +729,18 @@
 
 .. option:: --early-skip, --no-early-skip
 
-	Measure full CU size (2Nx2N) merge candidates first; if no residual
-	is found the analysis is short circuited. Default disabled
+	Measure 2Nx2N merge candidates first; if no residual is found, 
+	additional modes at that depth are not analysed. Default disabled
+
+.. option:: --rskip, --no-rskip
+
+	This option determines early exit from CU depth recursion. When a skip CU is
+	found, additional heuristics (depending on rd-level) are used to decide whether
+	to terminate recursion. In rdlevels 5 and 6, comparison with inter2Nx2N is used, 
+	while at rdlevels 4 and below, neighbour costs are used to skip recursion.
+	Provides minimal quality degradation at good performance gains when enabled. 
+
+	Default: enabled, disabled for :option:`--tune grain`
 
 .. option:: --fast-intra, --no-fast-intra
 
@@ -756,6 +773,14 @@
 	evaluate if luma used tskip. Inter block tskip analysis is
 	unmodified. Default disabled
 
+.. option:: --rd-refine, --no-rd-refine
+
+	For each analysed CU, calculate R-D cost on the best partition mode
+	for a range of QP values, to find the optimal rounding effect.
+	Default disabled.
+
+	Only effective at RD levels 5 and 6
+
 Analysis re-use options, to improve performance when encoding the same
 sequence multiple times (presumably at varying bitrates). The encoder
 will not reuse analysis if the resolution and slice type parameters do
@@ -1039,7 +1064,7 @@
 cause ringing artifacts. psy-rdoq is less accurate than psy-rd, it is
 biasing towards energy in general while psy-rd biases towards the energy
 of the source image. But very large psy-rdoq values can sometimes be
-beneficial, preserving film grain for instance.
+beneficial.
 
 As a general rule, when both psycho-visual features are disabled, the
 encoder will tend to blur blocks in areas of difficult motion. Turning
@@ -1076,8 +1101,8 @@
 	energy in the reconstructed image. This generally improves perceived
 	visual quality at the cost of lower quality metric scores.  It only
 	has effect when :option:`--rdoq-level` is 1 or 2. High values can
-	be beneficial in preserving high-frequency detail like film grain.
-	Default: 1.0
+	be beneficial in preserving high-frequency detail.
+	Default: 0.0 (1.0 for presets slow, slower, veryslow)
 
 	**Range of values:** 0 .. 50.0
 
@@ -1336,13 +1361,13 @@
 
 .. option:: --slow-firstpass, --no-slow-firstpass
 
-	Enable a slow and more detailed first pass encode in multi-pass rate
-	control mode.  Speed of the first pass encode is slightly lesser and
-	quality midly improved when compared to the default settings in a
-	multi-pass encode. Default disabled (turbo mode enabled)
+	Enable first pass encode with the exact settings specified. 
+	The quality in subsequent multi-pass encodes is better
+	(compared to first pass) when the settings match across each pass. 
+	Default enabled.
 
-	When **turbo** first pass is not disabled, these options are
-	set on the first pass to improve performance:
+	When slow first pass is disabled, a **turbo** encode with the following
+	go-fast options is used to improve performance:
 	
 	* :option:`--fast-intra`
 	* :option:`--no-rect`
@@ -1408,7 +1433,16 @@
 
 	The maximum single adjustment in QP allowed to rate control. Default
 	4
-
+	
+.. option:: --rc-grain, --no-rc-grain
+
+   Enables a specialised ratecontrol algorithm for film grain content. This 
+   parameter strictly minimises QP fluctuations within and across frames 
+   and removes pulsing of grain. Default disabled. 
+   Enabled when :option:`--tune` *grain* is applied. It is highly recommended 
+   that this option is used through the tune grain feature where a combination 
+   of param options are used to improve visual quality.
+   
 .. option:: --qblur <float>
 
 	Temporally blur quants. Default 0.5
@@ -1660,10 +1694,13 @@
 	a string which is parsed when the stream header SEI are emitted. The
 	string format is "G(%hu,%hu)B(%hu,%hu)R(%hu,%hu)WP(%hu,%hu)L(%u,%u)"
 	where %hu are unsigned 16bit integers and %u are unsigned 32bit
-	integers. The SEI includes X,Y display primaries for RGB channels,
-	white point X,Y and max,min luminance values. (HDR)
+	integers. The SEI includes X,Y display primaries for RGB channels
+	and white point (WP) in units of 0.00002 and max,min luminance (L)
+	values in units of 0.0001 candela per meter square. (HDR)
 
-	Example for D65P3 1000-nits:
+	Example for a P3D65 1000-nits monitor, where G(x=0.265, y=0.690),
+	B(x=0.150, y=0.060), R(x=0.680, y=0.320), WP(x=0.3127, y=0.3290),
+	L(max=1000, min=0.0001):
 
 		G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,1)
 
@@ -1672,8 +1709,9 @@
 
 .. option:: --max-cll <string>
 
-	Maximum content light level and maximum frame average light level as
-	required by the Consumer Electronics Association 861.3 specification.
+	Maximum content light level (MaxCLL) and maximum frame average light
+	level (MaxFALL) as required by the Consumer Electronics Association
+	861.3 specification.
 
 	Specified as a string which is parsed when the stream header SEI are
 	emitted. The string format is "%hu,%hu" where %hu are unsigned 16bit
@@ -1681,6 +1719,11 @@
 	maximum is indicated), the second value is the maximum picture
 	average light level (or 0). (HDR)

x265_1.9.tar.gz/doc/reST/presets.rst -> x265_2.0.tar.gz/doc/reST/presets.rst Changed

@@ -21,68 +21,80 @@
 The presets adjust encoder parameters as shown in the following table.
 Any parameters below that are specified in your command-line will be 
 changed from the value specified by the preset.
-
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-|                 |ultrafast |superfast |veryfast |faster |fast |medium |slow |slower |veryslow |placebo |
-+=================+==========+==========+=========+=======+=====+=======+=====+=======+=========+========+
-| ctu             |    32    |    32    |   64    |  64   | 64  |  64   | 64  |  64   |   64    |  64    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| min-cu-size     |    16    |     8    |    8    |   8   |  8  |   8   |  8  |   8   |    8    |   8    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| bframes         |     3    |     3    |    4    |   4   |  4  |   4   |  4  |   8   |    8    |   8    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| b-adapt         |     0    |     0    |    0    |   0   |  0  |   2   |  2  |   2   |    2    |   2    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| rc-lookahead    |     5    |    10    |   15    |  15   | 15  |  20   | 25  |  30   |   40    |  60    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| lookahead-slices|     8    |     8    |    8    |   8   |  8  |   8   |  4  |   4   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| scenecut        |     0    |    40    |   40    |  40   | 40  |  40   | 40  |  40   |   40    |  40    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| ref             |     1    |     1    |    2    |   2   |  3  |   3   |  4  |   4   |    5    |   5    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| limit-refs      |     0    |     0    |    3    |   3   |  3  |   3   |  3  |   2   |    1    |   0    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| me              |    dia   |   hex    |   hex   |  hex  |hex  |  hex  |star | star  |   star  |  star  |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| merange         |    57    |    57    |   57    |  57   | 57  |  57   | 57  |  57   |   57    |  92    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| subme           |     0    |     1    |    1    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| rect            |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| amp             |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| limit-modes     |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   0    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| max-merge       |     2    |     2    |    2    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| early-skip      |     1    |     1    |    1    |   1   |  0  |   0   |  0  |   0   |    0    |   0    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| fast-intra      |     1    |     1    |    1    |   1   |  1  |   0   |  0  |   0   |    0    |   0    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| b-intra         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| sao             |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| signhide        |     0    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| weightp         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| weightb         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| aq-mode         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| cuTree          |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| rdLevel         |     2    |     2    |    2    |   2   |  2  |   3   |  4  |   6   |    6    |   6    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| rdoq-level      |     0    |     0    |    0    |   0   |  0  |   0   |  2  |   2   |    2    |   2    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| tu-intra        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| tu-inter        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+	0. ultrafast
+	1. superfast
+	2. veryfast
+	3. faster
+	4. fast
+	5. medium **(default)**
+	6. slow
+	7. slower
+	8. veryslow
+	9. placebo
+
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| preset          |  0  |  1  |  2  |   3 |   4 |   5 |   6  |   7  |   8  |  9   |
++=================+=====+=====+=====+=====+=====+=====+======+======+======+======+
+| ctu             | 32  | 32  | 64  |  64 |  64 |  64 |  64  |  64  |  64  | 64   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| min-cu-size     | 16  |  8  |  8  |   8 |   8 |   8 |   8  |   8  |   8  |  8   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| bframes         |  3  |  3  |  4  |   4 |   4 |   4 |   4  |   8  |   8  |  8   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| b-adapt         |  0  |  0  |  0  |   0 |   0 |   2 |   2  |   2  |   2  |  2   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| rc-lookahead    |  5  | 10  | 15  |  15 |  15 |  20 |  25  |  30  |  40  | 60   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| lookahead-slices|  8  |  8  |  8  |   8 |   8 |   8 |   4  |   4  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| scenecut        |  0  | 40  | 40  |  40 |  40 |  40 |  40  |  40  |  40  | 40   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| ref             |  1  |  1  |  2  |   2 |   3 |   3 |   4  |   4  |   5  |  5   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| limit-refs      |  0  |  0  |  3  |   3 |   3 |   3 |   3  |   2  |   1  |  0   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| me              | dia | hex | hex | hex | hex | hex | star | star | star | star |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| merange         | 57  | 57  | 57  |  57 |  57 |  57 |  57  |  57  |  57  | 92   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| subme           |  0  |  1  |  1  |   2 |   2 |   2 |   3  |   3  |   4  |  5   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| rect            |  0  |  0  |  0  |   0 |   0 |   0 |   1  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| amp             |  0  |  0  |  0  |   0 |   0 |   0 |   0  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| limit-modes     |  0  |  0  |  0  |   0 |   0 |   0 |   1  |   1  |   1  |  0   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| max-merge       |  2  |  2  |  2  |   2 |   2 |   2 |   3  |   3  |   4  |  5   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| early-skip      |  1  |  1  |  1  |   1 |   0 |   0 |   0  |   0  |   0  |  0   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| recursion-skip  |  1  |  1  |  1  |   1 |   1 |   1 |   1  |   1  |   0  |  0   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| fast-intra      |  1  |  1  |  1  |   1 |   1 |   0 |   0  |   0  |   0  |  0   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| b-intra         |  0  |  0  |  0  |   0 |   0 |   0 |   0  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| sao             |  0  |  0  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| signhide        |  0  |  1  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| weightp         |  0  |  0  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| weightb         |  0  |  0  |  0  |   0 |   0 |   0 |   0  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| aq-mode         |  0  |  0  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| cuTree          |  1  |  1  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| rdLevel         |  2  |  2  |  2  |   2 |   2 |   3 |   4  |   6  |   6  |  6   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| rdoq-level      |  0  |  0  |  0  |   0 |   0 |   0 |   2  |   2  |   2  |  2   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| tu-intra        |  1  |  1  |  1  |   1 |   1 |   1 |   1  |   2  |   3  |  4   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| tu-inter        |  1  |  1  |  1  |   1 |   1 |   1 |   1  |   2  |   3  |  4   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
 
 .. _tunings:
 
@@ -117,33 +129,32 @@
 
 
 
-Film Grain Retention
-~~~~~~~~~~~~~~~~~~~~
-
-:option:`--tune` *grain* tries to improve the retention of film grain in
-the reconstructed output. It disables rate distortion optimizations in
-quantization, and increases the default psy-rd.
-
-    * :option:`--psy-rd` 0.5
-    * :option:`--rdoq-level` 0
-    * :option:`--psy-rdoq` 0
-
-It lowers the strength of adaptive quantization, so residual energy can
-be more evenly distributed across the (noisy) picture:
+Film Grain
+~~~~~~~~~~
 
-    * :option:`--aq-strength` 0.3
-
-And it similarly tunes rate control to prevent the slice QP from
-swinging too wildly from frame to frame:
+:option:`--tune` *grain* aims to encode grainy content with the best 
+visual quality. The purpose of this option is neither to retain nor 
+eliminate grain, but prevent noticeable artifacts caused by uneven 
+distribution of grain. :option:`--tune` *grain* strongly restricts 
+algorithms that vary the quantization parameter within and across frames.
+Tune grain also biases towards decisions that retain more high frequency
+components.
 
+    * :option:`--aq-mode` 0
+    * :option:`--cutree` 0
     * :option:`--ipratio` 1.1
-    * :option:`--pbratio` 1.1
-    * :option:`--qcomp` 0.8
-
-And lastly it reduces the strength of deblocking to prevent grain being
-blurred on block boundaries:
-
-    * :option:`--deblock` -2
+    * :option:`--pbratio` 1.0
+    * :option:`--qpstep` 1
+    * :option:`--sao` 0
+    * :option:`--psy-rd` 4.0
+    * :option:`--psy-rdoq` 10.0
+    * :option:`--rskip` 0
+    
+It also enables a specialised ratecontrol algorithm :option:`--rc-grain` 
+that strictly minimises QP fluctuations across frames, while still allowing 
+the encoder to hit bitrate targets and VBV buffer limits (with a slightly 
+higher margin of error than normal). It is highly recommended that this 
+algorithm is used only through the :option:`--tune` *grain* feature.
 
 Fast Decode
 ~~~~~~~~~~~

x265_1.9.tar.gz/source/CMakeLists.txt -> x265_2.0.tar.gz/source/CMakeLists.txt Changed

@@ -30,7 +30,7 @@
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 79)
+set(X265_BUILD 87)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -41,7 +41,9 @@
 # System architecture detection
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
+set(ARM_ALIASES armv6l armv7l)
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
+list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
 set(POWER_ALIASES ppc64 ppc64le)
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
@@ -58,7 +60,12 @@
     message(STATUS "Detected POWER target processor")
     set(POWER 1)
     add_definitions(-DX265_ARCH_POWER=1)
-elseif(${SYSPROC} STREQUAL "armv6l")
+elseif(ARMMATCH GREATER "-1")
+    if(CROSS_COMPILE_ARM)
+        message(STATUS "Cross compiling for ARM arch")
+    else()
+        set(CROSS_COMPILE_ARM 0)
+    endif()
     message(STATUS "Detected ARM target processor")
     set(ARM 1)
     add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
@@ -174,11 +181,23 @@
             add_definitions(-march=native)
         endif()
     elseif(X86 AND NOT X64)
-        add_definitions(-march=i686)
+        string(FIND "${CMAKE_CXX_FLAGS}" "-march" marchPos)
+        if(marchPos LESS "0")
+            add_definitions(-march=i686)
+        endif()
     endif()
-    if(ARM)
-        add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
+    if(ARM AND CROSS_COMPILE_ARM)
+        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+    elseif(ARM)
+        find_package(Neon)
+        if(CPU_HAS_NEON)
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+            add_definitions(-DHAVE_NEON)
+        else()
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+        endif()
     endif()
+    add_definitions(${ARM_ARGS})
     if(FPROFILE_GENERATE)
         if(INTEL_CXX)
             add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
@@ -269,7 +288,9 @@
 endif(GCC)
 
 find_package(Yasm)
-if(YASM_FOUND AND X86)
+if(ARM OR CROSS_COMPILE_ARM)
+    option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
+elseif(YASM_FOUND AND X86)
     if (YASM_VERSION_STRING VERSION_LESS "1.2.0")
         message(STATUS "Yasm version ${YASM_VERSION_STRING} is too old. 1.2.0 or later required")
         option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF)
@@ -409,7 +430,7 @@
 add_subdirectory(encoder)
 add_subdirectory(common)
 
-if((MSVC_IDE OR XCODE) AND ENABLE_ASSEMBLY)
+if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
     # this is required because of this cmake bug
     # http://www.cmake.org/Bug/print_bug_page.php?bug_id=8170
     if(WIN32)
@@ -417,19 +438,36 @@
     else()
         set(SUFFIX o)
     endif()
-    foreach(ASM ${MSVC_ASMS})
-        set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
-        list(APPEND YASM_SRCS ${YASM_SRC})
-        list(APPEND YASM_OBJS ${ASM}.${SUFFIX})
-        add_custom_command(
-            OUTPUT ${ASM}.${SUFFIX}
-            COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o ${ASM}.${SUFFIX}
-            DEPENDS ${YASM_SRC})
-    endforeach()
+
+    # Create one custom command per assembly source. Object file names are
+    # collected in ASM_OBJS and the source paths in ASM_SRCS.
+    if(ARM OR CROSS_COMPILE_ARM)
+        # ARM: every file named in ARM_ASMS (an internal cache entry written by
+        # common/CMakeLists.txt) is compiled from common/arm with the C++
+        # compiler driver and the flags stored in ARM_ARGS.
+        enable_language(ASM)
+        foreach(ASM ${ARM_ASMS})
+            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
+            list(APPEND ASM_SRCS ${ASM_SRC})
+            list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
+            add_custom_command(
+                OUTPUT ${ASM}.${SUFFIX}
+                COMMAND ${CMAKE_CXX_COMPILER}
+                ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+                DEPENDS ${ASM_SRC})
+        endforeach()
+    elseif(X86)
+        # x86: every file named in MSVC_ASMS is assembled from common/x86 with
+        # yasm using YASM_FLAGS.
+        foreach(ASM ${MSVC_ASMS})
+            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
+            list(APPEND ASM_SRCS ${ASM_SRC})
+            list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
+            add_custom_command(
+                OUTPUT ${ASM}.${SUFFIX}
+                COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${ASM_SRC} -o ${ASM}.${SUFFIX}
+                DEPENDS ${ASM_SRC})
+        endforeach()
+    endif()
 endif()
 
-source_group(ASM FILES ${YASM_SRCS})
-add_library(x265-static STATIC $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
+source_group(ASM FILES ${ASM_SRCS})
+add_library(x265-static STATIC $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${ASM_OBJS} ${ASM_SRCS})
 if(NOT MSVC)
     set_target_properties(x265-static PROPERTIES OUTPUT_NAME x265)
 endif()
@@ -463,7 +501,7 @@
 
 option(ENABLE_SHARED "Build shared library" ON)
 if(ENABLE_SHARED)
-    add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${YASM_OBJS}
+    add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${ASM_OBJS}
                 ${X265_RC_FILE} $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>)
     if(EXTRA_LIB)
         target_link_libraries(x265-shared ${EXTRA_LIB})
@@ -559,7 +597,7 @@
         # Xcode seems unable to link the CLI with libs, so link as one target
         add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
                        x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp
-                       $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
+                       $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${ASM_OBJS} ${ASM_SRCS})
     else()
         add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} ${X265_RC_FILE}
                        ${ExportDefs} x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp)
@@ -587,3 +625,11 @@
         add_subdirectory(test)
     endif()
 endif()
+
+# When this directory was added by a parent project, pass the de-duplicated
+# PLATFORM_LIBS list up to the parent scope. Nothing happens when there is no
+# parent directory or PLATFORM_LIBS is empty/false.
+get_directory_property(hasParent PARENT_DIRECTORY)
+if(hasParent AND PLATFORM_LIBS)
+    list(REMOVE_DUPLICATES PLATFORM_LIBS)
+    set(PLATFORM_LIBS ${PLATFORM_LIBS} PARENT_SCOPE)
+endif()

x265_2.0.tar.gz/source/cmake/FindNeon.cmake Added

x265_1.9.tar.gz/source/cmake/version.cmake -> x265_2.0.tar.gz/source/cmake/version.cmake Changed

@@ -52,39 +52,55 @@
         )
     execute_process(
         COMMAND
-        ${HG_EXECUTABLE} log -r. --template "{node|short}"
+        ${HG_EXECUTABLE} log -r. --template "{node}"
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE HG_REVISION_ID
+        OUTPUT_VARIABLE X265_REVISION_ID
         ERROR_QUIET
         OUTPUT_STRIP_TRAILING_WHITESPACE
         )
+    string(SUBSTRING "${X265_REVISION_ID}" 0 12 X265_REVISION_ID)
 
     if(X265_LATEST_TAG MATCHES "^r")
         string(SUBSTRING ${X265_LATEST_TAG} 1 -1 X265_LATEST_TAG)
     endif()
-    if(X265_TAG_DISTANCE STREQUAL "0")
-        set(X265_VERSION "${X265_LATEST_TAG}")
-    else()
-        set(X265_VERSION "${X265_LATEST_TAG}+${X265_TAG_DISTANCE}-${HG_REVISION_ID}")
-    endif()
 elseif(GIT_EXECUTABLE AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../.git)
     execute_process(
         COMMAND
-        ${GIT_EXECUTABLE} describe --tags --abbrev=0
+        ${GIT_EXECUTABLE} rev-list --tags --max-count=1
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE X265_LATEST_TAG_COMMIT
+        ERROR_QUIET
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        )
+    execute_process(
+        COMMAND
+        ${GIT_EXECUTABLE} describe --tags ${X265_LATEST_TAG_COMMIT}
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
         OUTPUT_VARIABLE X265_LATEST_TAG
         ERROR_QUIET
         OUTPUT_STRIP_TRAILING_WHITESPACE
         )
-
     execute_process(
         COMMAND
-        ${GIT_EXECUTABLE} describe --tags
+        ${GIT_EXECUTABLE} rev-list ${X265_LATEST_TAG}.. --count --first-parent
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE X265_VERSION
+        OUTPUT_VARIABLE X265_TAG_DISTANCE
         ERROR_QUIET
         OUTPUT_STRIP_TRAILING_WHITESPACE
         )
+    execute_process(
+        COMMAND
+        ${GIT_EXECUTABLE} log -1 --format=g%h
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE X265_REVISION_ID
+        ERROR_QUIET
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        )
+endif()
+# Compose the version string from the values gathered above: with a tag
+# distance of "0" the latest tag alone is the version, otherwise the form is
+# "<tag>+<distance>-<revision id>".
+if(X265_TAG_DISTANCE STREQUAL "0")
+    set(X265_VERSION "${X265_LATEST_TAG}")
+else()
+    set(X265_VERSION "${X265_LATEST_TAG}+${X265_TAG_DISTANCE}-${X265_REVISION_ID}")
 endif()
 
 message(STATUS "x265 version ${X265_VERSION}")

x265_1.9.tar.gz/source/common/CMakeLists.txt -> x265_2.0.tar.gz/source/common/CMakeLists.txt Changed

@@ -16,12 +16,14 @@
 if(ENABLE_ASSEMBLY)
     set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
     list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
+endif(ENABLE_ASSEMBLY)
 
+if(ENABLE_ASSEMBLY AND X86)
     set(SSE3  vec/dct-sse3.cpp)
     set(SSSE3 vec/dct-ssse3.cpp)
     set(SSE41 vec/dct-sse41.cpp)
 
-    if(MSVC AND X86)
+    if(MSVC)
         set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
         set(WARNDISABLE "/wd4100") # unreferenced formal parameter
         if(INTEL_CXX)
@@ -38,7 +40,7 @@
             set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2")
         endif()
     endif()
-    if(GCC AND X86)
+    if(GCC)
         if(CLANG)
             # llvm intrinsic headers cause shadow warnings
             set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter")
@@ -81,7 +83,21 @@
         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} x86/${SRC})
     endforeach()
     source_group(Assembly FILES ${ASM_PRIMITIVES})
-endif(ENABLE_ASSEMBLY)
+endif(ENABLE_ASSEMBLY AND X86)
+
+# ARM assembly primitives: register the C++ glue and headers that live under
+# arm/ and publish the list of .S files for use outside this directory.
+if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
+    # C++ glue and headers, named relative to the arm/ subdirectory
+    set(C_SRCS
+        asm-primitives.cpp
+        pixel.h
+        mc.h
+        ipfilter8.h
+        blockcopy8.h
+        dct8.h
+        loopfilter.h)
+
+    # add ARM assembly/intrinsic files here
+    set(A_SRCS
+        asm.S
+        cpu-a.S
+        mc-a.S
+        sad-a.S
+        pixel-util.S
+        ssd-a.S
+        blockcopy8.S
+        ipfilter8.S
+        dct-a.S)
+    # no vectorized C++ primitives are listed for ARM
+    unset(VEC_PRIMITIVES)
+
+    # internal cache entry, so the value can be read from other directories
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+    foreach(SRC ${C_SRCS})
+        list(APPEND ASM_PRIMITIVES arm/${SRC})
+    endforeach()
+    source_group(Assembly FILES ${ASM_PRIMITIVES})
+endif()
 
 # set_target_properties can't do list expansion
 string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}")

x265_2.0.tar.gz/source/common/arm/asm-primitives.cpp Added

@@ -0,0 +1,1022 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
+ *          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+#include "x265.h"
+#include "cpu.h"
+
+extern "C" {
+#include "blockcopy8.h"
+#include "pixel.h"
+#include "pixel-util.h"
+#include "ipfilter8.h"
+#include "dct8.h"
+}
+
+namespace X265_NS {
+// private x265 namespace
+
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
+{
+    if (cpuMask & X265_CPU_NEON)
+    {
+        // ssim_4x4x2_core
+        p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
+
+        // addAvg
+         p.pu[LUMA_4x4].addAvg   = PFX(addAvg_4x4_neon);
+         p.pu[LUMA_4x8].addAvg   = PFX(addAvg_4x8_neon);
+         p.pu[LUMA_4x16].addAvg  = PFX(addAvg_4x16_neon);
+         p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_neon);
+         p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_neon);
+         p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_neon);
+         p.pu[LUMA_8x32].addAvg  = PFX(addAvg_8x32_neon);
+         p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_neon);
+         p.pu[LUMA_16x4].addAvg  = PFX(addAvg_16x4_neon);
+         p.pu[LUMA_16x8].addAvg  = PFX(addAvg_16x8_neon);
+         p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_neon);
+         p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_neon);
+         p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_neon);
+         p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_neon);
+         p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_neon);
+         p.pu[LUMA_32x8].addAvg  = PFX(addAvg_32x8_neon);
+         p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_neon);
+         p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_neon);
+         p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_neon);
+         p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_neon);
+         p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_neon);
+         p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_neon);
+         p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_neon);
+         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_neon);
+         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_neon);
+
+        // chroma addAvg
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg   = PFX(addAvg_4x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg   = PFX(addAvg_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg   = PFX(addAvg_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg  = PFX(addAvg_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg   = PFX(addAvg_6x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   = PFX(addAvg_8x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   = PFX(addAvg_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   = PFX(addAvg_8x6_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   = PFX(addAvg_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  = PFX(addAvg_8x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  = PFX(addAvg_8x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  = PFX(addAvg_16x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  = PFX(addAvg_16x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg = PFX(addAvg_24x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg  = PFX(addAvg_32x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_neon);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg   = PFX(addAvg_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg  = PFX(addAvg_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg  = PFX(addAvg_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg  = PFX(addAvg_6x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg   = PFX(addAvg_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg   = PFX(addAvg_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg  = PFX(addAvg_8x12_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg  = PFX(addAvg_8x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg  = PFX(addAvg_8x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg  = PFX(addAvg_8x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg  = PFX(addAvg_16x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_neon);
+
+        // quant
+         p.quant = PFX(quant_neon);
+         p.nquant = PFX(nquant_neon);
+
+        // dequant_scaling
+         p.dequant_scaling = PFX(dequant_scaling_neon);
+         p.dequant_normal  = PFX(dequant_normal_neon);
+
+        // luma satd
+         p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_neon);
+         p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_neon);
+         p.pu[LUMA_4x16].satd  = PFX(pixel_satd_4x16_neon);
+         p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_neon);
+         p.pu[LUMA_8x8].satd   = PFX(pixel_satd_8x8_neon);
+         p.pu[LUMA_8x16].satd  = PFX(pixel_satd_8x16_neon);
+         p.pu[LUMA_8x32].satd  = PFX(pixel_satd_8x32_neon);
+         p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
+         p.pu[LUMA_16x4].satd  = PFX(pixel_satd_16x4_neon);
+         p.pu[LUMA_16x8].satd  = PFX(pixel_satd_16x8_neon);
+         p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_neon);
+         p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_neon);
+         p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_neon);
+         p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_neon);
+         p.pu[LUMA_32x8].satd  = PFX(pixel_satd_32x8_neon);
+         p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_neon);
+         p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_neon);
+         p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_neon);
+         p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_neon);
+         p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_neon);
+         p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_neon);
+         p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_neon);
+         p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_neon);
+         p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_neon);
+
+        // chroma satd
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd    = PFX(pixel_satd_4x4_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd    = PFX(pixel_satd_4x8_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd   = PFX(pixel_satd_4x16_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd    = PFX(pixel_satd_8x4_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd    = PFX(pixel_satd_8x8_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd   = PFX(pixel_satd_8x16_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd   = PFX(pixel_satd_8x32_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd  = PFX(pixel_satd_12x16_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd   = PFX(pixel_satd_16x4_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd   = PFX(pixel_satd_16x8_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd  = PFX(pixel_satd_16x12_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd  = PFX(pixel_satd_16x16_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd  = PFX(pixel_satd_16x32_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd  = PFX(pixel_satd_24x32_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd   = PFX(pixel_satd_32x8_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd  = PFX(pixel_satd_32x16_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd  = PFX(pixel_satd_32x24_neon);
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd  = PFX(pixel_satd_32x32_neon);
+
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd    = PFX(pixel_satd_4x4_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd    = PFX(pixel_satd_4x8_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd   = PFX(pixel_satd_4x16_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd   = PFX(pixel_satd_4x32_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd    = PFX(pixel_satd_8x4_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd    = PFX(pixel_satd_8x8_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd   = PFX(pixel_satd_8x12_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd   = PFX(pixel_satd_8x16_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd   = PFX(pixel_satd_8x32_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd   = PFX(pixel_satd_8x64_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd  = PFX(pixel_satd_12x32_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd   = PFX(pixel_satd_16x8_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd  = PFX(pixel_satd_16x16_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd  = PFX(pixel_satd_16x24_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd  = PFX(pixel_satd_16x32_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd  = PFX(pixel_satd_16x64_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd  = PFX(pixel_satd_24x64_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd  = PFX(pixel_satd_32x16_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd  = PFX(pixel_satd_32x32_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd  = PFX(pixel_satd_32x48_neon);
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd  = PFX(pixel_satd_32x64_neon);
+
+        // chroma_hpp

x265_2.0.tar.gz/source/common/arm/asm.S Added

@@ -0,0 +1,194 @@
+/*****************************************************************************
+ * asm.S: arm utility macros
+ *****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Mans Rullgard <mans@mansr.com>
+ *          David Conrad <lessen42@gmail.com>
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+.syntax unified
+
+#if   HAVE_NEON
+        .arch           armv7-a
+#elif HAVE_ARMV6T2
+        .arch           armv6t2
+#elif HAVE_ARMV6
+        .arch           armv6
+#endif
+
+.fpu neon
+
+#ifdef PREFIX
+#   define EXTERN_ASM _
+#else
+#   define EXTERN_ASM
+#endif
+
+#ifdef __ELF__
+#   define ELF
+#else
+#   define ELF @
+#endif
+
+#if HAVE_AS_FUNC
+#   define FUNC
+#else
+#   define FUNC @
+#endif
+
+// Emits EABI build attribute 24 (Tag_ABI_align_needed) with the given value,
+// default 1. The directive is only active when the ELF prefix expands to
+// nothing, i.e. when __ELF__ is defined.
+.macro require8, val=1
+ELF     .eabi_attribute 24, \val
+.endm
+
+// Emits EABI build attribute 25 (Tag_ABI_align_preserved) with the given
+// value, default 1. Only active when __ELF__ is defined.
+.macro preserve8, val=1
+ELF     .eabi_attribute 25, \val
+.endm
+
+// Opens an assembly function called "name".
+//   export=1 (default): the symbol gets the EXTERN_ASM prefix and is declared
+//                       .global; on ELF it is additionally marked .hidden.
+//   export=0:           the label is emitted without prefix and without .global.
+// Lines starting with ELF / FUNC are only assembled when those prefixes expand
+// to nothing; otherwise they expand to "@", the ARM GAS comment character.
+// A one-shot "endfunc" macro is defined as a side effect: it emits the symbol
+// size, closes the .func scope and then purges itself so that the next
+// "function" can define it again.
+// NOTE(review): endfunc sizes the un-prefixed name even for exported symbols;
+// this only matters when EXTERN_ASM is non-empty - confirm.
+.macro function name, export=1
+    .macro endfunc
+ELF     .size   \name, . - \name
+FUNC    .endfunc
+        .purgem endfunc
+    .endm
+        .align  2
+.if \export == 1
+        .global EXTERN_ASM\name
+ELF     .hidden EXTERN_ASM\name
+ELF     .type   EXTERN_ASM\name, %function
+FUNC    .func   EXTERN_ASM\name
+EXTERN_ASM\name:
+.else
+ELF     .hidden \name
+ELF     .type   \name, %function
+FUNC    .func   \name
+\name:
+.endif
+.endm
+
+// Loads the address of symbol "val" into register "rd". With HAVE_ARMV6T2 and
+// no PIC a movw/movt pair builds the 32-bit value from its lower and upper
+// halves; otherwise the "ldr rd, =val" pseudo-instruction is used.
+.macro movrel rd, val
+#if HAVE_ARMV6T2 && !defined(PIC)
+        movw            \rd, #:lower16:\val
+        movt            \rd, #:upper16:\val
+#else
+        ldr             \rd, =\val
+#endif
+.endm
+
+// Loads the constant "val" into register "rd". With HAVE_ARMV6T2 the lower
+// half is set by movw and movt is only emitted when the upper 16 bits of the
+// constant are non-zero; otherwise the "ldr rd, =val" pseudo-instruction is used.
+.macro movconst rd, val
+#if HAVE_ARMV6T2
+    movw        \rd, #:lower16:\val
+.if \val >> 16
+    movt        \rd, #:upper16:\val
+.endif
+#else
+    ldr         \rd, =\val
+#endif
+.endm
+
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
+
+#define FENC_STRIDE 64
+#define FDEC_STRIDE 32
+
+// Horizontal sum of unsigned 16-bit lanes. If the optional "b" is given it is
+// first added lane-wise into "a"; then two pairwise widening adds reduce the
+// 16-bit lanes to 32-bit and finally to 64-bit sums written to "dest".
+// "a" is overwritten.
+.macro HORIZ_ADD dest, a, b
+.ifnb \b
+    vadd.u16    \a, \a, \b
+.endif
+    vpaddl.u16  \a, \a
+    vpaddl.u32  \dest, \a
+.endm
+
+// Butterfly on signed 16-bit lanes: sum = a + b, diff = a - b.
+// "sum" is written first, so it must not alias "a" or "b".
+.macro SUMSUB_AB sum, diff, a, b
+    vadd.s16    \sum,  \a, \b
+    vsub.s16    \diff, \a, \b
+.endm
+
+// Two butterflies in a row: (s1, d1) = (a + b, a - b) followed by
+// (s2, d2) = (c + d, c - d).
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
+    SUMSUB_AB   \s1, \d1, \a, \b
+    SUMSUB_AB   \s2, \d2, \c, \d
+.endm
+
+// Replaces both registers, in place, with the absolute value of their signed
+// 16-bit lanes.
+.macro ABS2 a b
+    vabs.s16 \a, \a
+    vabs.s16 \b, \b
+.endm
+
+// One Hadamard step on signed 16-bit lanes.
+// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
+// op = sumsub/amax (sum and diff / maximum of absolutes)
+// d1/2 = destination registers
+// s1/2 = source registers
+// The sources are interleaved in place first (vtrn.16 when dist is 1, vtrn.32
+// for every other value). With op "sumsub" d1/d2 receive sum and difference;
+// any other op takes the amax path, which overwrites s1/s2 with their absolute
+// values and writes only d1.
+// NOTE(review): dist 0 also goes through the vtrn.32 path - confirm callers
+// never pass 0 expecting no transpose.
+.macro HADAMARD dist, op, d1, d2, s1, s2
+.if \dist == 1
+    vtrn.16     \s1, \s2
+.else
+    vtrn.32     \s1, \s2
+.endif
+.ifc \op, sumsub
+    SUMSUB_AB   \d1, \d2, \s1, \s2
+.else
+    vabs.s16    \s1, \s1
+    vabs.s16    \s2, \s2
+    vmax.s16    \d1, \s1, \s2
+.endif
+.endm
+
+// In-place transpose across eight registers in three vtrn stages of
+// decreasing element size (32-bit, 16-bit, 8-bit). Presumably each register
+// holds one row of eight bytes, as the name suggests - confirm at call sites.
+.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
+    vtrn.32         \r0, \r4
+    vtrn.32         \r1, \r5
+    vtrn.32         \r2, \r6
+    vtrn.32         \r3, \r7
+    vtrn.16         \r0, \r2
+    vtrn.16         \r1, \r3
+    vtrn.16         \r4, \r6
+    vtrn.16         \r5, \r7
+    vtrn.8          \r0, \r1
+    vtrn.8          \r2, \r3
+    vtrn.8          \r4, \r5
+    vtrn.8          \r6, \r7
+.endm
+
+// In-place transpose across four registers in two vtrn stages (16-bit, then
+// 8-bit elements). Presumably used for 4x4 byte blocks - confirm at call sites.
+.macro TRANSPOSE4x4 r0 r1 r2 r3
+    vtrn.16         \r0, \r2
+    vtrn.16         \r1, \r3
+    vtrn.8          \r0, \r1
+    vtrn.8          \r2, \r3
+.endm
+
+// In-place transpose of a 4x4 block of 16-bit elements held in four
+// registers: a 32-bit vtrn stage followed by a 16-bit vtrn stage. The
+// trailing comments give the register contents after each step as
+// [row,column] digit pairs, highest lane first.
+.macro TRANSPOSE4x4_16  r0, r1, r2, r3
+    vtrn.32     \r0, \r2            // r0 = [21 20 01 00], r2 = [23 22 03 02]
+    vtrn.32     \r1, \r3            // r1 = [31 30 11 10], r3 = [33 32 13 12]
+    vtrn.16     \r0, \r1            // r0 = [30 20 10 00], r1 = [31 21 11 01]
+    vtrn.16     \r2, \r3            // r2 = [32 22 12 02], r3 = [33 23 13 03]
+.endm
+
+// Applies the same two-stage 16-bit transpose as TRANSPOSE4x4_16 to two
+// independent register groups (rA0-rA3 and rB0-rB3), with the instructions of
+// both groups interleaved. The trailing comments describe group A; group B
+// goes through the identical steps.
+.macro TRANSPOSE4x4x2_16  rA0, rA1, rA2, rA3, rB0, rB1, rB2, rB3
+    vtrn.32     \rA0, \rA2          // r0 = [21 20 01 00], r2 = [23 22 03 02]
+    vtrn.32     \rA1, \rA3          // r1 = [31 30 11 10], r3 = [33 32 13 12]
+    vtrn.32     \rB0, \rB2
+    vtrn.32     \rB1, \rB3
+    vtrn.16     \rA0, \rA1          // r0 = [30 20 10 00], r1 = [31 21 11 01]
+    vtrn.16     \rA2, \rA3          // r2 = [32 22 12 02], r3 = [33 23 13 03]
+    vtrn.16     \rB0, \rB1
+    vtrn.16     \rB2, \rB3
+.endm

x265_2.0.tar.gz/source/common/arm/blockcopy8.S Added

@@ -0,0 +1,838 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Radhakrishnan VR <radhakrishnan@multicorewareinc.com>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
+ *
+ * r0   - a
+ * r1   - stridea
+ * r2   - b
+ * r3   - strideb */
+// 4x4 block: narrows int16 samples from b to 8-bit pixels in a.
+// Each of the 2 repetitions handles two rows. Only the four int16 values that
+// belong to the row are loaded (one D register, 8 bytes), so nothing is read
+// past the end of a source row. vmovn keeps the low byte of every 16-bit
+// element (no saturation); the upper D register of each Q source is not
+// loaded, and the bytes derived from it are never stored because only the
+// first 32-bit lane of each result is written.
+function x265_blockcopy_sp_4x4_neon
+    // strideb counts int16 elements: convert to bytes
+    lsl             r3, #1
+.rept 2
+    vld1.u16        {d0}, [r2], r3
+    vld1.u16        {d2}, [r2], r3
+    vmovn.u16       d0, q0
+    vmovn.u16       d1, q1
+    vst1.u32        {d0[0]}, [r0], r1
+    vst1.u32        {d1[0]}, [r0], r1
+.endr
+    bx              lr
+endfunc
+
+// 8x8 block: narrows int16 samples from b to 8-bit pixels in a.
+// Each of the 4 repetitions handles two rows: 8 int16 values are loaded per
+// row, vmovn keeps the low byte of each element (no saturation) and 8 bytes
+// are stored per row.
+function x265_blockcopy_sp_8x8_neon
+    // strideb counts int16 elements: convert to bytes
+    lsl             r3, #1
+.rept 4
+    vld1.u16        {q0}, [r2], r3
+    vld1.u16        {q1}, [r2], r3
+    vmovn.u16       d0, q0
+    vmovn.u16       d1, q1
+    vst1.u8         {d0}, [r0], r1
+    vst1.u8         {d1}, [r0], r1
+.endr
+    bx              lr
+endfunc
+
+// 16x16 block: narrows int16 samples from b to 8-bit pixels in a.
+// Each of the 8 repetitions handles two rows: 16 int16 values (two Q
+// registers) are loaded per row and narrowed with vmovn into q0 (first row)
+// and q1 (second row), which are stored as 16 bytes each.
+function x265_blockcopy_sp_16x16_neon
+    // strideb counts int16 elements: convert to bytes
+    lsl             r3, #1
+.rept 8
+    vld1.u16        {q0, q1}, [r2], r3
+    vld1.u16        {q2, q3}, [r2], r3
+    vmovn.u16       d0, q0
+    vmovn.u16       d1, q1
+    vmovn.u16       d2, q2
+    vmovn.u16       d3, q3
+    vst1.u8         {q0}, [r0], r1
+    vst1.u8         {q1}, [r0], r1
+.endr
+    bx              lr
+endfunc
+
+// 32x32 block: narrows int16 samples from b to 8-bit pixels in a.
+// r12 counts 4 loop iterations; each iteration runs 4 repetitions of two rows
+// (4 * 4 * 2 = 32 rows). A source row is 64 bytes and is read with two
+// 32-byte loads: the first post-increments r2 by 32, so r3 is reduced by 32
+// to make the second load land on the start of the next row.
+// The flags tested by bne come from the subs at the top of the loop; the NEON
+// instructions in between do not modify them.
+function x265_blockcopy_sp_32x32_neon
+    mov             r12, #4
+    // strideb in bytes, minus the 32 bytes consumed by the first load of a row
+    lsl             r3, #1
+    sub             r3, #32
+loop_csp32:
+    subs            r12, #1
+.rept 4
+    vld1.u16        {q0, q1}, [r2]!
+    vld1.u16        {q2, q3}, [r2], r3
+    vld1.u16        {q8, q9}, [r2]!
+    vld1.u16        {q10, q11}, [r2], r3
+
+    vmovn.u16       d0, q0
+    vmovn.u16       d1, q1
+    vmovn.u16       d2, q2
+    vmovn.u16       d3, q3
+
+    vmovn.u16       d4, q8
+    vmovn.u16       d5, q9
+    vmovn.u16       d6, q10
+    vmovn.u16       d7, q11
+
+    vst1.u8         {q0, q1}, [r0], r1
+    vst1.u8         {q2, q3}, [r0], r1
+.endr
+    bne             loop_csp32
+    bx              lr
+endfunc
+
+// 64x64 block: narrows int16 samples from b to 8-bit pixels in a.
+// r12 counts 16 loop iterations; each iteration runs 4 repetitions of one row
+// (16 * 4 = 64 rows). A source row is 128 bytes read with four 32-byte loads,
+// three of which post-increment r2 by 32, hence r3 is reduced by 96. A
+// destination row is 64 bytes written with two 32-byte stores, the first of
+// which post-increments r0 by 32, hence r1 is reduced by 32.
+// The flags tested by bne come from the subs at the top of the loop; the NEON
+// instructions in between do not modify them.
+function x265_blockcopy_sp_64x64_neon
+    mov             r12, #16
+    // strideb in bytes, minus the 96 bytes consumed by the first three loads
+    lsl             r3, #1
+    sub             r3, #96
+    // stridea minus the 32 bytes consumed by the first store
+    sub             r1, #32
+loop_csp64:
+    subs            r12, #1
+.rept 4
+    vld1.u16        {q0, q1}, [r2]!
+    vld1.u16        {q2, q3}, [r2]!
+    vld1.u16        {q8, q9}, [r2]!
+    vld1.u16        {q10, q11}, [r2], r3
+
+    vmovn.u16       d0, q0
+    vmovn.u16       d1, q1
+    vmovn.u16       d2, q2
+    vmovn.u16       d3, q3
+
+    vmovn.u16       d4, q8
+    vmovn.u16       d5, q9
+    vmovn.u16       d6, q10
+    vmovn.u16       d7, q11
+
+    vst1.u8         {q0, q1}, [r0]!
+    vst1.u8         {q2, q3}, [r0], r1
+.endr
+    bne             loop_csp64
+    bx              lr
+endfunc
+
+// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
+// 4x4 block: widens 8-bit pixels from b to int16 samples in a.
+// Each of the 2 repetitions handles two rows. Only the four pixels of a row
+// are loaded (one 32-bit lane), so nothing is read past the end of a source
+// row; this mirrors the 32-bit lane stores used by blockcopy_sp_4x4.
+// vmovl zero-extends every byte to 16 bits; the low D register of each result
+// holds the four samples of the row and is the only part stored. The upper
+// lane of d0/d1 is left untouched by the loads and never reaches memory.
+function x265_blockcopy_ps_4x4_neon
+    // stridea counts int16 elements: convert to bytes
+    lsl             r1, #1
+.rept 2
+    vld1.u32        {d0[0]}, [r2], r3
+    vld1.u32        {d1[0]}, [r2], r3
+    vmovl.u8        q1, d0
+    vmovl.u8        q2, d1
+    vst1.u16        {d2}, [r0], r1
+    vst1.u16        {d4}, [r0], r1
+.endr
+    bx              lr
+endfunc
+
+// 8x8 block: widens 8-bit pixels from b to int16 samples in a.
+// Each of the 4 repetitions handles two rows: 8 pixels are loaded per row,
+// zero-extended to 16 bits with vmovl and stored as 16 bytes per row.
+function x265_blockcopy_ps_8x8_neon
+    // stridea counts int16 elements: convert to bytes
+    lsl             r1, #1
+.rept 4
+    vld1.u8         {d0}, [r2], r3
+    vld1.u8         {d1}, [r2], r3
+    vmovl.u8        q1, d0
+    vmovl.u8        q2, d1
+    vst1.u16        {q1}, [r0], r1
+    vst1.u16        {q2}, [r0], r1
+.endr
+    bx              lr
+endfunc
+
+// 16x16 block: widens 8-bit pixels from b to int16 samples in a.
+// Each of the 8 repetitions handles two rows: 16 pixels are loaded per row
+// (q0 and q1), each half is zero-extended with vmovl into q8-q11, and 32
+// bytes are stored per row.
+function x265_blockcopy_ps_16x16_neon
+    // stridea counts int16 elements: convert to bytes
+    lsl             r1, #1
+.rept 8
+    vld1.u8         {q0}, [r2], r3
+    vld1.u8         {q1}, [r2], r3
+    vmovl.u8        q8, d0
+    vmovl.u8        q9, d1
+    vmovl.u8        q10, d2
+    vmovl.u8        q11, d3
+    vst1.u16        {q8, q9}, [r0], r1
+    vst1.u16        {q10, q11}, [r0], r1
+.endr
+    bx              lr
+endfunc
+
+function x265_blockcopy_ps_32x32_neon
+    lsl             r1, #1
+    sub             r1, #32
+    mov             r12, #4
+loop_cps32:
+    subs            r12, #1
+.rept 4
+    vld1.u8         {q0, q1}, [r2], r3
+    vld1.u8         {q2, q3}, [r2], r3
+    vmovl.u8        q8, d0
+    vmovl.u8        q9, d1
+    vmovl.u8        q10, d2
+    vmovl.u8        q11, d3
+
+    vmovl.u8        q12, d4
+    vmovl.u8        q13, d5
+    vmovl.u8        q14, d6
+    vmovl.u8        q15, d7
+
+    vst1.u16        {q8, q9}, [r0]!

x265_2.0.tar.gz/source/common/arm/blockcopy8.h Added

@@ -0,0 +1,123 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_BLOCKCOPY8_ARM_H
+#define X265_BLOCKCOPY8_ARM_H
+
+void x265_blockcopy_pp_16x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_12x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_4x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_4x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_4x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_16x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_16x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_16x12_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_16x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_16x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_24x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_32x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_32x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_32x24_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_32x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_32x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_48x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_64x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_64x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_64x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_64x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_2x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_2x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_2x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_6x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_6x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x6_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x12_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_12x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_4x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_4x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_16x24_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_24x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_32x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+
+void x265_cpy2Dto1D_shr_4x4_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_8x8_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_16x16_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_32x32_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+
+void x265_blockcopy_sp_4x4_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_8x8_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_16x16_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_32x32_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_64x64_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+
+void x265_blockcopy_ps_4x4_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_8x8_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_16x16_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_32x32_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_64x64_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+
+void x265_blockcopy_ss_4x4_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_8x8_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_16x16_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_32x32_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_64x64_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+
+// chroma blockcopy
+void x265_blockcopy_ss_4x8_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_8x16_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_16x32_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_32x64_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+
+void x265_blockcopy_sp_4x8_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_8x16_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_16x32_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_32x64_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+
+void x265_blockcopy_ps_4x8_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_8x16_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_16x32_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_32x64_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+
+void x265_blockfill_s_4x4_neon(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_8x8_neon(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_16x16_neon(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_32x32_neon(int16_t* dst, intptr_t dstride, int16_t val);
+
+uint32_t x265_copy_cnt_4_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
+uint32_t x265_copy_cnt_8_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
+uint32_t x265_copy_cnt_16_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
+uint32_t x265_copy_cnt_32_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
+
+int x265_count_nonzero_4_neon(const int16_t* quantCoeff);
+int x265_count_nonzero_8_neon(const int16_t* quantCoeff);
+int x265_count_nonzero_16_neon(const int16_t* quantCoeff);
+int x265_count_nonzero_32_neon(const int16_t* quantCoeff);
+#endif // ifndef X265_BLOCKCOPY8_ARM_H

x265_2.0.tar.gz/source/common/arm/cpu-a.S Added

@@ -0,0 +1,109 @@
+/*****************************************************************************
+ * cpu-a.S: arm cpu detection
+ *****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.align 2
+
+// done in gas because .fpu neon overrides the refusal to assemble
+// instructions the selected -march/-mcpu doesn't support
+//
+// The body is one NEON integer add followed by a return; it produces no
+// meaningful result. Presumably the caller arranges to catch the fault this
+// raises on CPUs without NEON -- confirm at the call site.
+function x265_cpu_neon_test
+    vadd.i16    q0, q0, q0
+    bx          lr
+endfunc
+
+// Enable the performance counters in full-resolution mode and turn on the
+// cycle counter. File-local (export=0). Clobbers r0 and r2.
+// return: 0 on success
+//         1 if counters were already enabled
+//         9 if lo-res counters were already enabled
+function x265_cpu_enable_armv7_counter, export=0
+    mrc         p15, 0, r2, c9, c12, 0      // read PMNC
+    ands        r0, r2, #1                  // r0 = enable bit; Z set if counters were off
+    andne       r0, r2, #9                  // already on: also report bit 3 (lo-res)
+
+    // orr/bic/mov below do not set flags, so the eq condition on mcreq still
+    // reflects the ands above: PMNC is only rewritten if counters were off.
+    orr         r2, r2, #1                  // enable counters
+    bic         r2, r2, #8                  // full resolution
+    mcreq       p15, 0, r2, c9, c12, 0      // write PMNC
+    mov         r2, #1 << 31                // enable cycle counter
+    mcr         p15, 0, r2, c9, c12, 1      // write CNTENS
+    bx          lr
+endfunc
+
+// Clear the enable bit (bit 0) of PMNC, leaving its other bits unchanged.
+// File-local (export=0). Clobbers r0; returns nothing meaningful.
+function x265_cpu_disable_armv7_counter, export=0
+    mrc         p15, 0, r0, c9, c12, 0      // read PMNC
+    bic         r0, r0, #1                  // disable counters
+    mcr         p15, 0, r0, c9, c12, 0      // write PMNC
+    bx          lr
+endfunc
+
+
+// READ_TIME r: read coprocessor register p15, c9, c13, 0 into \r.
+// Used below as the timestamp source (the cycle counter enabled by
+// x265_cpu_enable_armv7_counter).
+.macro READ_TIME r
+    mrc         p15, 0, \r, c9, c13, 0
+.endm
+
+// Time NEON -> ARM register transfers (vmov from d0[0] to a core register)
+// using the performance counters.
+// return: 0 if neon -> arm transfers take more than 10 cycles, or if the
+//           user-access register read at entry is 0
+//         nonzero otherwise
+// Register use: r3 = accumulated elapsed count, ip = accepted samples still
+// wanted (4), r6 = attempt budget (4), r5 = inner iterations per sample,
+// lr = scratch target of the timed transfers (saved/restored via the stack).
+function x265_cpu_fast_neon_mrc_test
+    // check for user access to performance counters
+    mrc         p15, 0, r0, c9, c14, 0
+    cmp         r0, #0
+    bxeq        lr                          // r0 is 0 here, so the result is 0
+
+    push        {r4-r6,lr}
+    bl          x265_cpu_enable_armv7_counter
+    ands        r1, r0, #8                  // were lo-res counters already enabled?
+    mov         r3, #0
+    mov         ip, #4
+    mov         r6, #4
+    // the movs above leave the flags from the ands intact
+    moveq       r5, #1                      // full resolution: one pass per sample
+    movne       r5, #64                     // lo-res: 64 passes per sample
+
+average_loop:
+    mov         r4, r5
+    READ_TIME   r1
+1:  subs        r4, r4, #1
+.rept 8                                     // 8 timed transfers per pass
+    vmov.u32    lr, d0[0]
+    add         lr, lr, lr                  // consume the transferred value
+.endr
+    bgt         1b                          // flags still from the subs at 1:
+    READ_TIME   r2
+
+    subs        r6, r6, #1
+    sub         r2, r2, r1                  // elapsed count for this sample
+    cmpgt       r2, #30 << 3    // assume context switch if it took over 30 cycles
+    addle       r3, r3, r2                  // accept the sample
+    subsle      ip, ip, #1
+    bgt         average_loop
+
+    // disable counters if we enabled them
+    ands        r0, r0, #1
+    bleq        x265_cpu_disable_armv7_counter
+
+    lsr         r0, r3, #5                  // accumulated count / 32 (4 samples x 8 transfers)
+    cmp         r0, #10
+    movgt       r0, #0
+    pop         {r4-r6,pc}
+endfunc

x265_2.0.tar.gz/source/common/arm/dct-a.S Added

@@ -0,0 +1,900 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Min Chen <chenm003@163.com>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+.align 4
+
+//        dst[0 * line] = ((64 * E[0] + 64 * E[1] + add) >> shift);
+//        dst[2 * line] = ((64 * E[0] - 64 * E[1] + add) >> shift);
+//        dst[1 * line] = ((83 * O[0] + 36 * O[1] + add) >> shift);
+//        dst[3 * line] = ((36 * O[0] - 83 * O[1] + add) >> shift);
+
+/* void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride) */
+// 4x4 forward DCT in two 1-D passes.
+//   r0 = src, r1 = dst, r2 = srcStride (doubled below to a byte step)
+// Source rows are loaded with a :64 alignment qualifier. The 16 results are
+// written contiguously at dst. r0 and r2 are reused for constants after the
+// loads. Clobbers q0-q3 and q8-q13; q4-q7 are not touched.
+function x265_dct_4x4_neon
+    mov             r2, r2, lsl #1
+    vld1.16         {d0}, [r0, :64], r2                     // d0  = [03 02 01 00]
+    vld1.16         {d1}, [r0, :64], r2                     // d1  = [13 12 11 10]
+    vld1.16         {d2}, [r0, :64], r2                     // d2  = [23 22 21 20]
+    vld1.16         {d3}, [r0, :64]                         // d3  = [33 32 31 30]
+
+    vtrn.32         q0, q1                                  // q0  = [31 30 11 10 21 20 01 00], q1 = [33 32 13 12 23 22 03 02]
+    vrev32.16       q1, q1                                  // q1  = [32 33 12 13 22 23 02 03]
+
+    movconst        r0, 0x00240053                          // halfwords: 0x0024 = 36, 0x0053 = 83
+    movconst        r2, 0xFFAD0024                          // halfwords: 0xFFAD = -83, 0x0024 = 36
+
+    // DCT-1D
+    vadd.s16        q2, q0, q1                              // q2  = [E31 E30 E11 E10 E21 E20 E01 E00]
+    vsub.s16        q3, q0, q1                              // q3  = [O31 O30 O11 O10 O21 O20 O01 O00]
+    vdup.32         d16, r0                                 // d16 = [ 36  83]
+    vdup.32         d17, r2                                 // d17 = [-83  36]
+    vtrn.16         d4, d5                                  // d4  = [E30 E20 E10 E00], d5 = [E31 E21 E11 E01]
+    vtrn.32         d6, d7                                  // q3  = [O31 O30 O21 O20 O11 O10 O01 O00]
+
+    vmull.s16       q9, d6, d16
+    vmull.s16       q10, d7, d16                            // [q9, q10] = [ 36*O1 83*O0] -> [1]
+    vmull.s16       q11, d6, d17
+    vmull.s16       q12, d7, d17                            // [q11,q12] = [-83*O1 36*O0] -> [3]
+
+    vadd.s16        d0, d4, d5                              // d0 = [E0 + E1]
+    vsub.s16        d1, d4, d5                              // d1 = [E0 - E1]
+
+    vpadd.s32       d18, d18, d19                           // q9  = [1]
+    vpadd.s32       d19, d20, d21
+    vpadd.s32       d20, d22, d23                           // q10 = [3]
+    vpadd.s32       d21, d24, d25
+
+    vshll.s16       q1, d0, #6                              // q1  = 64 * [0]
+    vshll.s16       q2, d1, #6                              // q2  = 64 * [2]
+
+    // TODO: Dynamic Range is 11+6-1 bits
+    // first pass: rounding, saturating narrow with a right shift of 1
+    vqrshrn.s32     d25, q9, 1                              // d25 = R[13 12 11 10]
+    vqrshrn.s32     d24, q1, 1                              // d24 = R[03 02 01 00]
+    vqrshrn.s32     d26, q2, 1                              // d26 = R[23 22 21 20]
+    vqrshrn.s32     d27, q10, 1                             // d27 = R[33 32 31 30]
+
+
+    // DCT-2D
+    vmovl.s16       q0, d16                                // q0  = [ 36  83  36  83] as s32: d0[0] = 83, d0[1] = 36
+
+    vtrn.32         q12, q13                                // q12 = [31 30 11 10 21 20 01 00], q13 = [33 32 13 12 23 22 03 02]
+    vrev32.16       q13, q13                                // q13 = [32 33 12 13 22 23 02 03]
+
+    vaddl.s16       q1, d24, d26                            // q1  = [E21 E20 E01 E00]
+    vaddl.s16       q2, d25, d27                            // q2  = [E31 E30 E11 E10]
+    vsubl.s16       q3, d24, d26                            // q3  = [O21 O20 O01 O00]
+    vsubl.s16       q8, d25, d27                            // q8  = [O31 O30 O11 O10]
+
+    vtrn.32         q1, q2                                  // q1  = [E30 E20 E10 E00], q2  = [E31 E21 E11 E01]
+    vtrn.32         q3, q8                                  // q3  = [O30 O20 O10 O00], q8  = [O31 O21 O11 O01]
+
+    vmul.s32        q9, q3, d0[0]                           // q9  = [83*O30 83*O20 83*O10 83*O00]
+    vmul.s32        q10, q8, d0[1]                          // q10 = [36*O31 36*O21 36*O11 36*O01]
+    vmul.s32        q11, q3, d0[1]                          // q11 = [36*O30 36*O20 36*O10 36*O00]
+    vmul.s32        q12, q8, d0[0]                          // q12 = [83*O31 83*O21 83*O11 83*O01]
+
+    vadd.s32        q0, q1, q2                              // q0 = [E0 + E1]
+    vsub.s32        q1, q1, q2                              // q1 = [E0 - E1]
+
+    vadd.s32        q9, q9, q10                             // q9  = 83*O0 + 36*O1 -> [1]
+    vsub.s32        q10, q11, q12                           // q10 = 36*O0 - 83*O1 -> [3]
+
+    vshl.s32        q0, q0, #6                              // q0  = 64 * [0]
+    vshl.s32        q1, q1, #6                              // q1  = 64 * [2]
+
+    // second pass: rounding, saturating narrow with a right shift of 8
+    vqrshrn.s32     d25, q9, 8                              // d25 = R[13 12 11 10]
+    vqrshrn.s32     d27, q10, 8                             // d27 = R[33 32 31 30]
+
+    vqrshrn.s32     d24, q0, 8                              // d24 = R[03 02 01 00]
+    vqrshrn.s32     d26, q1, 8                              // d26 = R[23 22 21 20]
+
+    vst1.16         {d24-d27}, [r1]
+
+    bx              lr
+endfunc
+
+/* tr4 r0, r1, r2, r3: 4-point butterfly on four s32 vectors, in place.
+ * Temporaries are q8 - q11 (clobbered); \r0 - \r3 are overwritten with the
+ * results. Reads the multipliers from d0[0] (83) and d0[1] (36), which the
+ * caller must have loaded as 32-bit lanes.
+ * NOTE(review): callers presumably pass q4 - q7 as \r0 - \r3 -- confirm at
+ * the call sites. */
+.macro tr4 r0, r1, r2, r3
+    vsub.s32    q8, \r0, \r3    // EO0
+    vadd.s32    q9, \r0, \r3    // EE0
+    vadd.s32    q10, \r1, \r2   // EE1
+    vsub.s32    q11, \r1, \r2   // EO1
+
+    vmul.s32    \r1, q8, d0[0]  // 83 * EO0
+    vmul.s32    \r3, q8, d0[1]  // 36 * EO0
+    vshl.s32    q9, q9, #6      // 64 * EE0
+    vshl.s32    q10, q10, #6    // 64 * EE1
+    vmla.s32    \r1, q11, d0[1] // 83 * EO0 + 36 * EO1
+    vmls.s32    \r3, q11, d0[0] // 36 * EO0 - 83 * EO1
+    vadd.s32    \r0, q9, q10    // 64 * (EE0 + EE1)
+    vsub.s32    \r2, q9, q10    // 64 * (EE0 - EE1)
+.endm
+
+
+/* tr8 r0, r1, r2, r3: odd-part multiply-accumulate for the 8-point transform.
+ * \r0 - \r3 are s32 vectors (labelled src1, src3, src5, src7 below) and are
+ * only read. The four weighted sums are produced in q12 - q15 (overwritten).
+ * Per the inline comments the multipliers are expected in d1[1] = 89,
+ * d1[0] = 75, d2[1] = 50 and d2[0] = 18; the caller must load them first. */
+.macro tr8 r0, r1, r2, r3
+    vmul.s32  q12, \r0, d1[1]   //  89 * src1
+    vmul.s32  q13, \r0, d1[0]   //  75 * src1
+    vmul.s32  q14, \r0, d2[1]   //  50 * src1
+    vmul.s32  q15, \r0, d2[0]   //  18 * src1
+
+    vmla.s32  q12, \r1, d1[0]   //  75 * src3
+    vmls.s32  q13, \r1, d2[0]   // -18 * src3
+    vmls.s32  q14, \r1, d1[1]   // -89 * src3
+    vmls.s32  q15, \r1, d2[1]   // -50 * src3
+
+    vmla.s32  q12, \r2, d2[1]   //  50 * src5
+    vmls.s32  q13, \r2, d1[1]   // -89 * src5
+    vmla.s32  q14, \r2, d2[0]   //  18 * src5
+    vmla.s32  q15, \r2, d1[0]   //  75 * src5
+
+    vmla.s32  q12, \r3, d2[0]   //  18 * src7
+    vmls.s32  q13, \r3, d2[1]   // -50 * src7
+    vmla.s32  q14, \r3, d1[0]   //  75 * src7
+    vmls.s32  q15, \r3, d1[1]   // -89 * src7
+.endm
+
+
+// TODO: the DCT-2D stage spends 4x8=32 load/store operations because there is no temporary buffer
+/* void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride) */
+function x265_dct_8x8_neon
+    vpush {q4-q7}
+
+    mov r2, r2, lsl #1
+
+    adr r3, ctr4
+    vld1.16 {d0-d2}, [r3]
+    mov r3, r1
+
+    // DCT-1D
+    // top half
+    vld1.16 {q12}, [r0], r2
+    vld1.16 {q13}, [r0], r2
+    vld1.16 {q14}, [r0], r2
+    vld1.16 {q15}, [r0], r2
+
+    TRANSPOSE4x4x2_16 d24, d26, d28, d30,  d25, d27, d29, d31
+
+    // |--|
+    // |24|
+    // |26|
+    // |28|
+    // |30|
+    // |25|
+    // |27|
+    // |29|
+    // |31|
+    // |--|
+
+    vaddl.s16 q4, d28, d27
+    vaddl.s16 q5, d30, d25
+    vaddl.s16 q2, d24, d31
+    vaddl.s16 q3, d26, d29
+

x265_2.0.tar.gz/source/common/arm/dct8.h Added

@@ -0,0 +1,32 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Min Chen <chenm003@163.com>
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_DCT8_ARM_H
+#define X265_DCT8_ARM_H
+
+void PFX(dct_4x4_neon)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void PFX(dct_8x8_neon)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void PFX(dct_16x16_neon)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+
+#endif // ifndef X265_DCT8_ARM_H

x265_2.0.tar.gz/source/common/arm/intrapred.h Added

@@ -0,0 +1,31 @@
+/*****************************************************************************
+ * intrapred.h: Intra Prediction metrics
+ *****************************************************************************
+ * Copyright (C) 2003-2013 x264 project
+ *
+ * Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_INTRAPRED_ARM_H
+#define X265_INTRAPRED_ARM_H
+
+#endif // ifndef X265_INTRAPRED_ARM_H

x265_2.0.tar.gz/source/common/arm/ipfilter8.S Added

@@ -0,0 +1,3341 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
+ *          Radhakrishnan VR <radhakrishnan@multicorewareinc.com>
+ *          Min Chen <min.chen@multicorewareinc.com>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+.align 4
+
+// Luma interpolation coefficients: 4 rows of 16 32-bit words.
+// Each row holds 8 filter taps, every tap stored twice in succession.
+g_lumaFilter:
+.word 0,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0
+.word -1,-1,4,4,-10,-10,58,58,17,17,-5,-5,1,1,0,0
+.word -1,-1,4,4,-11,-11,40,40,40,40,-11,-11,4,4,-1,-1
+.word 0,0,1,1,-5,-5,17,17,58,58,-10,-10,4,4,-1,-1 
+// Chroma interpolation coefficients: 8 rows of 8 32-bit words.
+// Each row holds 4 filter taps, every tap stored twice in succession.
+g_chromaFilter:
+.word 0, 0, 64, 64, 0, 0, 0, 0
+.word -2, -2, 58, 58, 10, 10, -2, -2
+.word -4, -4, 54, 54, 16, 16, -2, -2
+.word -6, -6, 46, 46, 28, 28, -4, -4
+.word -4, -4, 36, 36, 36, 36, -4 ,-4
+.word -4, -4, 28, 28, 46, 46, -6, -6
+.word -2, -2, 16, 16, 54, 54, -4 ,-4
+.word -2, -2, 10, 10, 58, 58, -2, -2
+
+
+.text
+
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
+// 4x4 variant: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled
+// below to a byte step). Each output is (pixel << 6) + 0xE000 in 16-bit
+// arithmetic, i.e. pixel * 64 - 8192. Clobbers q0-q3.
+function x265_filterPixelToShort_4x4_neon
+    vld1.u32    {d0[]}, [r0], r1        // row 0 -> both 32-bit lanes of d0
+    vld1.u32    {d0[1]}, [r0], r1       // row 1 -> upper lane of d0
+    vld1.u32    {d1[]}, [r0], r1        // row 2 -> both 32-bit lanes of d1
+    vld1.u32    {d1[1]}, [r0], r1       // row 3 -> upper lane of d1
+
+    // avoid load pipeline stall
+    vmov.i16    q1, #0xE000             // -8192 in every 16-bit lane
+
+    vshll.u8    q2, d0, #6              // d4 = row 0 * 64, d5 = row 1 * 64
+    vshll.u8    q3, d1, #6              // d6 = row 2 * 64, d7 = row 3 * 64
+    vadd.i16    q2, q1
+    vadd.i16    q3, q1
+
+    add         r3, r3                  // 2 bytes per int16_t element
+    vst1.16     {d4}, [r2], r3
+    vst1.16     {d5}, [r2], r3
+    vst1.16     {d6}, [r2], r3
+    vst1.16     {d7}, [r2], r3
+
+    bx          lr
+endfunc
+
+// filterPixelToShort, 4 wide x 8 rows.
+//   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled below to a byte step)
+// Each output is (pixel << 6) + 0xE000 in 16-bit arithmetic, i.e.
+// pixel * 64 - 8192. Rows are fetched with 32-bit lane loads, as in the 4x4
+// variant, so only the four pixels of each row are read. Clobbers q0, q2, q9.
+function x265_filterPixelToShort_4x8_neon
+    add         r3, r3                  // 2 bytes per int16_t element
+    vmov.i16    q9, #0xE000             // -8192 in every 16-bit lane
+.rept 4                                 // 4 iterations x 2 rows
+    vld1.u32    {d0[]}, [r0], r1        // row n   -> both 32-bit lanes of d0
+    vld1.u32    {d0[1]}, [r0], r1       // row n+1 -> upper lane of d0
+    vshll.u8    q2, d0, #6              // d4 = row n * 64, d5 = row n+1 * 64
+    vadd.i16    q2, q9
+    vst1.16     {d4}, [r2], r3
+    vst1.16     {d5}, [r2], r3
+.endr
+    bx          lr
+endfunc
+
+// filterPixelToShort, 4 wide x 16 rows.
+//   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled below to a byte step)
+// Each output is (pixel << 6) + 0xE000 in 16-bit arithmetic, i.e.
+// pixel * 64 - 8192. Rows are fetched with 32-bit lane loads, as in the 4x4
+// variant, so only the four pixels of each row are read. Clobbers q0, q2, q9.
+function x265_filterPixelToShort_4x16_neon
+    add         r3, r3                  // 2 bytes per int16_t element
+    vmov.i16    q9, #0xE000             // -8192 in every 16-bit lane
+.rept 8                                 // 8 iterations x 2 rows
+    vld1.u32    {d0[]}, [r0], r1        // row n   -> both 32-bit lanes of d0
+    vld1.u32    {d0[1]}, [r0], r1       // row n+1 -> upper lane of d0
+    vshll.u8    q2, d0, #6              // d4 = row n * 64, d5 = row n+1 * 64
+    vadd.i16    q2, q9
+    vst1.16     {d4}, [r2], r3
+    vst1.16     {d5}, [r2], r3
+.endr
+    bx          lr
+endfunc
+
+// filterPixelToShort, 8 wide x 4 rows.
+//   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled below to a byte step)
+// Each output is (pixel << 6) + 0xE000 in 16-bit arithmetic, i.e.
+// pixel * 64 - 8192, computed with the same widening shift + add as the 4x4
+// variant. Clobbers d0, d2, q2, q3 and q9.
+function x265_filterPixelToShort_8x4_neon
+    add         r3, r3                  // 2 bytes per int16_t element
+    vmov.i16    q9, #0xE000             // -8192 in every 16-bit lane
+.rept 2                                 // 2 iterations x 2 rows
+    vld1.u8     {d0}, [r0], r1          // 8 pixels of row n
+    vld1.u8     {d2}, [r0], r1          // 8 pixels of row n+1
+    vshll.u8    q2, d0, #6              // row n   * 64
+    vshll.u8    q3, d2, #6              // row n+1 * 64
+    vadd.i16    q2, q9
+    vadd.i16    q3, q9
+    vst1.16     {q2}, [r2], r3
+    vst1.16     {q3}, [r2], r3
+.endr
+    bx          lr
+endfunc
+
+// filterPixelToShort, 8 wide x 8 rows.
+//   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled below to a byte step)
+// Each output is (pixel << 6) + 0xE000 in 16-bit arithmetic, i.e.
+// pixel * 64 - 8192, computed with the same widening shift + add as the 4x4
+// variant. Clobbers d0, d2, q2, q3 and q9.
+function x265_filterPixelToShort_8x8_neon
+    add         r3, r3                  // 2 bytes per int16_t element
+    vmov.i16    q9, #0xE000             // -8192 in every 16-bit lane
+.rept 4                                 // 4 iterations x 2 rows
+    vld1.u8     {d0}, [r0], r1          // 8 pixels of row n
+    vld1.u8     {d2}, [r0], r1          // 8 pixels of row n+1
+    vshll.u8    q2, d0, #6              // row n   * 64
+    vshll.u8    q3, d2, #6              // row n+1 * 64
+    vadd.i16    q2, q9
+    vadd.i16    q3, q9
+    vst1.16     {q2}, [r2], r3
+    vst1.16     {q3}, [r2], r3
+.endr
+    bx          lr
+endfunc
+
+// filterPixelToShort, 8 wide x 16 rows.
+//   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled below to a byte step)
+// Each output is (pixel << 6) + 0xE000 in 16-bit arithmetic, i.e.
+// pixel * 64 - 8192, computed with the same widening shift + add as the 4x4
+// variant. Clobbers d0, d2, q2, q3 and q9.
+function x265_filterPixelToShort_8x16_neon
+    add         r3, r3                  // 2 bytes per int16_t element
+    vmov.i16    q9, #0xE000             // -8192 in every 16-bit lane
+.rept 8                                 // 8 iterations x 2 rows
+    vld1.u8     {d0}, [r0], r1          // 8 pixels of row n
+    vld1.u8     {d2}, [r0], r1          // 8 pixels of row n+1
+    vshll.u8    q2, d0, #6              // row n   * 64
+    vshll.u8    q3, d2, #6              // row n+1 * 64
+    vadd.i16    q2, q9
+    vadd.i16    q3, q9
+    vst1.16     {q2}, [r2], r3
+    vst1.16     {q3}, [r2], r3
+.endr
+    bx          lr
+endfunc
+
+// filterPixelToShort, 8 wide x 32 rows.
+//   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (doubled below to a byte step)
+// Each output is (pixel << 6) + 0xE000 in 16-bit arithmetic, i.e.
+// pixel * 64 - 8192, computed with the same widening shift + add as the 4x4
+// variant. Clobbers d0, d2, q2, q3 and q9.
+function x265_filterPixelToShort_8x32_neon
+    add         r3, r3                  // 2 bytes per int16_t element
+    vmov.i16    q9, #0xE000             // -8192 in every 16-bit lane
+.rept 16                                // 16 iterations x 2 rows
+    vld1.u8     {d0}, [r0], r1          // 8 pixels of row n
+    vld1.u8     {d2}, [r0], r1          // 8 pixels of row n+1
+    vshll.u8    q2, d0, #6              // row n   * 64
+    vshll.u8    q3, d2, #6              // row n+1 * 64
+    vadd.i16    q2, q9
+    vadd.i16    q3, q9
+    vst1.16     {q2}, [r2], r3
+    vst1.16     {q3}, [r2], r3
+.endr
+    bx          lr
+endfunc
+
+function x265_filterPixelToShort_12x16_neon
+    add         r3, r3
+    vmov.u16    q8, #64
+    vmov.u16    q9, #8192
+    vneg.s16    q9, q9
+.rept 16
+    vld1.u8     {d2-d3}, [r0], r1

x265_2.0.tar.gz/source/common/arm/ipfilter8.h Added

@@ -0,0 +1,342 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_IPFILTER8_ARM_H
+#define X265_IPFILTER8_ARM_H
+
+void x265_filterPixelToShort_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+
+void x265_interp_8tap_vert_pp_4x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_4x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_4x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_8x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_8x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_8x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_8x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_64x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_64x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_64x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_64x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_24x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_48x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_12x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_8tap_vert_sp_4x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_4x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_4x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_8x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_8x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_8x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_8x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_16x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_16x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_16x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_16x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_16x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_16x12_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_32x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_32x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_32x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_32x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_32x24_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_64x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_64x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_64x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_64x48_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_24x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_48x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_12x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_8tap_vert_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_4tap_vert_pp_8x2_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x6_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_32x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_32x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_32x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_32x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_32x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_32x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_24x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_24x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_48x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_64x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_64x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_64x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_64x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_4tap_vert_ps_8x2_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x6_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_32x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_24x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_4tap_vert_sp_8x2_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x6_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x12_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_16x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_16x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);

x265_2.0.tar.gz/source/common/arm/loopfilter.h Added

@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
+ *          Min Chen <chenm003@163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_LOOPFILTER_ARM_H
+#define X265_LOOPFILTER_ARM_H
+
+#endif // ifndef X265_LOOPFILTER_ARM_H

x265_2.0.tar.gz/source/common/arm/mc-a.S Added

@@ -0,0 +1,1172 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *          Radhakrishnan <radhakrishnan@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+/* blockcopy_pp_16x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
+ *
+ * Copies a 16x16 block, one 16-byte row per load/store pair. The row
+ * pointers are advanced with post-indexed register write-back, matching the
+ * other blockcopy_pp routines in this file, instead of separate add
+ * instructions.
+ *
+ * r0   - dst
+ * r1   - dstStride
+ * r2   - src
+ * r3   - srcStride */
+function x265_blockcopy_pp_16x16_neon
+.rept 16
+    vld1.8          {q0}, [r2], r3      @ load one row, then src += srcStride
+    vst1.8          {q0}, [r0], r1      @ store it, then dst += dstStride
+.endr
+    bx              lr
+endfunc
+
+/* Emits x265_blockcopy_pp_4x<h>_neon: copies <h> rows of 4 bytes each.
+ * Each row is moved as a single 32-bit word through r12.
+ *
+ * h    - number of rows (also used in the generated function name)
+ *
+ * r0   - dst
+ * r1   - dstStride
+ * r2   - src
+ * r3   - srcStride */
+.macro blockcopy_pp_4xN_neon h
+function x265_blockcopy_pp_4x\h\()_neon
+.rept \h
+    ldr             r12, [r2], r3       @ read one row, then src += srcStride
+    str             r12, [r0], r1       @ write it, then dst += dstStride
+.endr
+    bx              lr
+endfunc
+.endm
+
+blockcopy_pp_4xN_neon 4
+blockcopy_pp_4xN_neon 8
+blockcopy_pp_4xN_neon 16
+blockcopy_pp_4xN_neon 2
+blockcopy_pp_4xN_neon 32
+
+/* Emits x265_blockcopy_pp_16x<h>_neon: copies <h> rows of 16 bytes each,
+ * fully unrolled (no loop counter).
+ *
+ * h    - number of rows (also used in the generated function name)
+ *
+ * r0   - dst
+ * r1   - dstStride
+ * r2   - src
+ * r3   - srcStride */
+.macro blockcopy_pp_16xN_neon h
+function x265_blockcopy_pp_16x\h\()_neon
+.rept \h    
+    vld1.8          {q0}, [r2], r3      @ load one row, then src += srcStride
+    vst1.8          {q0}, [r0], r1      @ store it, then dst += dstStride
+.endr
+    bx              lr
+endfunc
+.endm
+
+blockcopy_pp_16xN_neon 4
+blockcopy_pp_16xN_neon 8
+blockcopy_pp_16xN_neon 12
+blockcopy_pp_16xN_neon 24
+
+/* Emits x265_blockcopy_pp_16x<h>_neon as a counted loop: <i> iterations,
+ * each copying 8 rows of 16 bytes. The instantiations below pass h = 8 * i.
+ *
+ * h    - block height, used in the function name and the loop label
+ * i    - number of loop iterations (8 rows per iteration)
+ *
+ * r0   - dst
+ * r1   - dstStride
+ * r2   - src
+ * r3   - srcStride
+ * r12  - loop counter */
+.macro blockcopy_pp_16xN1_neon h i
+function x265_blockcopy_pp_16x\h\()_neon
+    mov             r12, #\i
+loop_16x\h\():
+.rept 8
+    vld1.8          {q0}, [r2], r3      @ load one row, then src += srcStride
+    vst1.8          {q0}, [r0], r1      @ store it, then dst += dstStride
+.endr
+    subs            r12, r12, #1        @ one group of 8 rows done
+    bne             loop_16x\h
+    bx              lr
+endfunc
+.endm
+
+blockcopy_pp_16xN1_neon 32 4
+blockcopy_pp_16xN1_neon 64 8
+
+/* Emits x265_blockcopy_pp_8x<h>_neon: copies <h> rows of 8 bytes each,
+ * fully unrolled (no loop counter).
+ *
+ * h    - number of rows (also used in the generated function name)
+ *
+ * r0   - dst
+ * r1   - dstStride
+ * r2   - src
+ * r3   - srcStride */
+.macro blockcopy_pp_8xN_neon h
+function x265_blockcopy_pp_8x\h\()_neon
+.rept \h    
+    vld1.8          {d0}, [r2], r3      @ load one row, then src += srcStride
+    vst1.8          {d0}, [r0], r1      @ store it, then dst += dstStride
+.endr
+    bx              lr
+endfunc
+.endm
+
+blockcopy_pp_8xN_neon 4
+blockcopy_pp_8xN_neon 8
+blockcopy_pp_8xN_neon 16
+blockcopy_pp_8xN_neon 32
+blockcopy_pp_8xN_neon 2
+blockcopy_pp_8xN_neon 6
+blockcopy_pp_8xN_neon 12
+
+/* Copies 16 rows of 12 bytes each. Every row is split into an 8-byte NEON
+ * transfer followed by a 4-byte word transfer through r12.
+ *
+ * r0   - dst
+ * r1   - dstStride
+ * r2   - src
+ * r3   - srcStride */
+function x265_blockcopy_pp_12x16_neon
+    sub             r3, #8              @ the 8-byte transfer below already advances
+    sub             r1, #8              @ each pointer by 8, so step by stride - 8 after it
+.rept 16
+    vld1.8          {d0}, [r2]!         @ first 8 bytes of the row, src += 8
+    ldr             r12, [r2], r3       @ last 4 bytes, then src moves to the next row
+    vst1.8          {d0}, [r0]!         @ first 8 bytes of the row, dst += 8
+    str             r12, [r0], r1       @ last 4 bytes, then dst moves to the next row
+.endr
+    bx              lr
+endfunc
+
+/* Copies 32 rows of 24 bytes each: 4 loop iterations of 8 unrolled rows.
+ *
+ * r0   - dst
+ * r1   - dstStride
+ * r2   - src
+ * r3   - srcStride
+ * r12  - loop counter */
+function x265_blockcopy_pp_24x32_neon
+    mov             r12, #4
+loop_24x32:
+.rept 8
+    vld1.8          {d0, d1, d2}, [r2], r3  @ load one 24-byte row, then src += srcStride
+    vst1.8          {d0, d1, d2}, [r0], r1  @ store it, then dst += dstStride
+.endr
+    subs            r12, r12, #1            @ one group of 8 rows done
+    bne             loop_24x32
+    bx              lr
+endfunc
+
+/* Copies 8 rows of 32 bytes each, fully unrolled.
+ *
+ * r0   - dst
+ * r1   - dstStride
+ * r2   - src
+ * r3   - srcStride */
+function x265_blockcopy_pp_32x8_neon
+.rept 8
+    vld1.8          {q0, q1}, [r2], r3  @ load one 32-byte row, then src += srcStride
+    vst1.8          {q0, q1}, [r0], r1  @ store it, then dst += dstStride
+.endr 
+    bx              lr
+endfunc
+
+/* Emits x265_blockcopy_pp_32x<h>_neon as a counted loop: <i> iterations,
+ * each copying 8 rows of 32 bytes. The instantiations below pass h = 8 * i.
+ *
+ * h    - block height, used in the function name and the loop label
+ * i    - number of loop iterations (8 rows per iteration)
+ *
+ * r0   - dst
+ * r1   - dstStride
+ * r2   - src
+ * r3   - srcStride
+ * r12  - loop counter */
+.macro blockcopy_pp_32xN_neon h i
+function x265_blockcopy_pp_32x\h\()_neon
+    mov             r12, #\i
+loop_32x\h\():
+.rept 8
+    vld1.8          {q0, q1}, [r2], r3  @ load one 32-byte row, then src += srcStride
+    vst1.8          {q0, q1}, [r0], r1  @ store it, then dst += dstStride
+.endr
+    subs            r12, r12, #1        @ one group of 8 rows done
+    bne             loop_32x\h
+    bx              lr
+endfunc
+.endm
+
+blockcopy_pp_32xN_neon 16 2
+blockcopy_pp_32xN_neon 24 3
+blockcopy_pp_32xN_neon 32 4
+blockcopy_pp_32xN_neon 64 8
+blockcopy_pp_32xN_neon 48 6
+
+/* Copies 64 rows of 48 bytes each: 8 loop iterations of 8 unrolled rows.
+ * Every row is split into a 32-byte and a 16-byte transfer.
+ *
+ * r0   - dst
+ * r1   - dstStride
+ * r2   - src
+ * r3   - srcStride
+ * r12  - loop counter */
+function x265_blockcopy_pp_48x64_neon
+    mov             r12, #8
+    sub             r3, #32             @ the 32-byte transfer below already advances
+    sub             r1, #32             @ each pointer by 32, so step by stride - 32 after it
+loop_48x64:
+.rept 8
+    vld1.8          {q0, q1}, [r2]!     @ first 32 bytes of the row, src += 32
+    vld1.8          {q2}, [r2], r3      @ last 16 bytes, then src moves to the next row
+    vst1.8          {q0, q1}, [r0]!     @ first 32 bytes of the row, dst += 32
+    vst1.8          {q2}, [r0], r1      @ last 16 bytes, then dst moves to the next row
+.endr
+    subs            r12, r12, #1        @ one group of 8 rows done
+    bne             loop_48x64
+    bx              lr
+endfunc
+
+/* Emits x265_blockcopy_pp_64x<h>_neon as a counted loop: <i> iterations,
+ * each copying 4 rows of 64 bytes. Every row is split into two 32-byte
+ * transfers. Callers are expected to pass h = 4 * i (the instantiations are
+ * not visible here - TODO confirm).
+ *
+ * h    - block height, used in the function name and the loop label
+ * i    - number of loop iterations (4 rows per iteration)
+ *
+ * r0   - dst
+ * r1   - dstStride
+ * r2   - src
+ * r3   - srcStride
+ * r12  - loop counter */
+.macro blockcopy_pp_64xN_neon h i
+function x265_blockcopy_pp_64x\h\()_neon
+    mov             r12, #\i
+    sub             r3, #32             @ the first 32-byte transfer already advances
+    sub             r1, #32             @ each pointer by 32, so step by stride - 32 after it
+loop_64x\h\():
+.rept 4
+    vld1.8          {q0, q1}, [r2]!     @ first 32 bytes of the row, src += 32
+    vld1.8          {q2, q3}, [r2], r3  @ last 32 bytes, then src moves to the next row
+    vst1.8          {q0, q1}, [r0]!     @ first 32 bytes of the row, dst += 32
+    vst1.8          {q2, q3}, [r0], r1  @ last 32 bytes, then dst moves to the next row
+.endr
+    subs            r12, r12, #1        @ one group of 4 rows done
+    bne             loop_64x\h
+    bx              lr
+endfunc
+.endm

x265_2.0.tar.gz/source/common/arm/mc.h Added

@@ -0,0 +1,27 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_MC_ARM_H
+#define X265_MC_ARM_H
+
+#endif // ifndef X265_MC_ARM_H

x265_2.0.tar.gz/source/common/arm/pixel-util.S Added

@@ -0,0 +1,2451 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
+ *          Radhakrishnan VR <radhakrishnan@multicorewareinc.com>
+ *          Min Chen <min.chen@multicorewareinc.com>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+
+.text
+
+/* One software-pipelined step of a sum / sum-of-squares accumulation.
+ *
+ *   qsqr_sum  - q register updated from qsqr_last by the vpadal argument
+ *   qsqr_last - 16-bit squares produced by an earlier step
+ *   qsqr_temp - receives the 16-bit squares of dsrc (left for a later step)
+ *   dsrc      - d register holding eight source bytes
+ *   num       - index of the q register that accumulates the widened bytes
+ *               (default 0, i.e. q0)
+ *   vpadal    - instruction applied to (qsqr_sum, qsqr_last); the default
+ *               vpadal.u16 adds pairwise into qsqr_sum, while passing
+ *               vpaddl.u16 overwrites qsqr_sum with the pairwise sums instead */
+.macro VAR_SQR_SUM qsqr_sum, qsqr_last, qsqr_temp, dsrc, num=0, vpadal=vpadal.u16
+    vmull.u8        \qsqr_temp, \dsrc, \dsrc
+    vaddw.u8        q\num, q\num, \dsrc
+    \vpadal         \qsqr_sum, \qsqr_last
+.endm
+
+/* x265_pixel_var_8x8_neon
+ *
+ * r0 - pointer to the first row; r1 - added to r0 after each 8-byte row load.
+ * Eight rows are read in total.
+ *
+ * Result (moved out of d0 by the final vmov): r0 = sum of the 64 bytes,
+ * r1 = sum of their squares.
+ *
+ * Accumulators: q0 = 16-bit running byte sums, q1/q2 = 32-bit running sums of
+ * squares for alternating rows. */
+function x265_pixel_var_8x8_neon
+    /* Rows 0 and 1 initialise the accumulators directly. */
+    vld1.u8         {d16}, [r0], r1
+    vmull.u8        q1, d16, d16
+    vmovl.u8        q0, d16
+    vld1.u8         {d18}, [r0], r1
+    vmull.u8        q2, d18, d18
+    vaddw.u8        q0, q0, d18
+
+    /* Rows 2 and 3: vpaddl.u16 widens the row 0/1 squares already in q1/q2
+     * to 32 bits in place, while the new squares are parked in q3/q8. */
+    vld1.u8         {d20}, [r0], r1
+    VAR_SQR_SUM     q1, q1, q3, d20, 0, vpaddl.u16
+    vld1.u8         {d22}, [r0], r1
+    VAR_SQR_SUM     q2, q2, q8, d22, 0, vpaddl.u16
+
+    /* Rows 4-7: each step accumulates the squares parked two rows earlier. */
+    vld1.u8         {d24}, [r0], r1
+    VAR_SQR_SUM     q1, q3, q9, d24
+    vld1.u8         {d26}, [r0], r1
+    VAR_SQR_SUM     q2, q8, q10, d26
+    vld1.u8         {d24}, [r0], r1
+    VAR_SQR_SUM     q1, q9, q14, d24
+    vld1.u8         {d26}, [r0], r1
+    VAR_SQR_SUM     q2, q10, q15, d26
+
+    /* q14/q15 still hold the squares of the last two rows: fold them in,
+     * then reduce q0 (byte sum) and q1+q2 (squares) to one 32-bit lane each,
+     * leaving d0 = { sum, sum of squares }. */
+    vpaddl.u16      q8, q14
+    vpaddl.u16      q9, q15
+    vadd.u32        q1, q1, q8
+    vadd.u16        d0, d0, d1
+    vadd.u32        q1, q1, q9
+    vadd.u32        q1, q1, q2
+    vpaddl.u16      d0, d0
+    vadd.u32        d2, d2, d3
+    vpadd.u32       d0, d0, d2
+
+    vmov            r0, r1, d0
+    bx              lr
+endfunc
+
+/* x265_pixel_var_16x16_neon
+ *
+ * r0 - pointer to the first row; r1 - added to r0 after each 16-byte row load.
+ * The loop runs 4 times and reads 4 rows per pass (16 rows in total).
+ *
+ * Result (moved out of d0 by the final vmov): r0 = sum of the bytes read,
+ * r1 = sum of their squares.
+ *
+ * Accumulators: q0 = 16-bit running byte sums, q1/q2 = 32-bit running sums of
+ * squares; q12-q15 carry the not-yet-accumulated squares between steps. */
+function x265_pixel_var_16x16_neon
+    /* Clear the accumulators. q14/q15 are cleared as well because the first
+     * two VAR_SQR_SUM steps accumulate them as "previous squares". */
+    veor.u8         q0, q0
+    veor.u8         q1, q1
+    veor.u8         q2, q2
+    veor.u8         q14, q14
+    veor.u8         q15, q15
+    mov             ip, #4
+
+.var16_loop:
+    /* Flags set here are consumed by the bgt at the bottom of the loop. */
+    subs            ip, ip, #1
+    vld1.u8         {q8}, [r0], r1
+    VAR_SQR_SUM     q1, q14, q12, d16
+    VAR_SQR_SUM     q2, q15, q13, d17
+
+    vld1.u8         {q9}, [r0], r1
+    VAR_SQR_SUM     q1, q12, q14, d18
+    VAR_SQR_SUM     q2, q13, q15, d19
+
+    vld1.u8         {q8}, [r0], r1
+    VAR_SQR_SUM     q1, q14, q12, d16
+    VAR_SQR_SUM     q2, q15, q13, d17
+
+    vld1.u8         {q9}, [r0], r1
+    VAR_SQR_SUM     q1, q12, q14, d18
+    VAR_SQR_SUM     q2, q13, q15, d19
+    bgt             .var16_loop
+
+    /* q14/q15 still hold the squares of the last row: fold them in, then
+     * reduce q0 (byte sum) and q1+q2 (squares) to one 32-bit lane each,
+     * leaving d0 = { sum, sum of squares }. */
+    vpaddl.u16      q8, q14
+    vpaddl.u16      q9, q15
+    vadd.u32        q1, q1, q8
+    vadd.u16        d0, d0, d1
+    vadd.u32        q1, q1, q9
+    vadd.u32        q1, q1, q2
+    vpaddl.u16      d0, d0
+    vadd.u32        d2, d2, d3
+    vpadd.u32       d0, d0, d2
+
+    vmov            r0, r1, d0
+    bx              lr
+endfunc
+
+/* x265_pixel_var_32x32_neon
+ *
+ * r0 - pointer to the first row; r1 - added to r0 after each 32-byte row load.
+ * The loop runs 8 times and reads 4 rows per pass (32 rows in total).
+ *
+ * Result (moved out of d0 by the final vmov): r0 = sum of the bytes read,
+ * r1 = sum of their squares.
+ *
+ * Accumulators: q0 = 16-bit running byte sums, q1/q2 = 32-bit running sums of
+ * squares; q12-q15 carry the not-yet-accumulated squares between steps.
+ * Each 16-bit lane of q0 receives 128 bytes (at most 128 * 255 = 32640), and
+ * the vadd.u16 fold in the tail at most doubles that to 65280, so the byte
+ * sum still fits in 16 bits before it is widened. */
+function x265_pixel_var_32x32_neon
+    /* Clear the accumulators. q14/q15 are cleared as well because the first
+     * two VAR_SQR_SUM steps accumulate them as "previous squares". */
+    veor.u8         q0, q0
+    veor.u8         q1, q1
+    veor.u8         q2, q2
+    veor.u8         q14, q14
+    veor.u8         q15, q15
+    mov             ip, #8
+
+.var32_loop:
+    /* Flags set here are consumed by the bgt at the bottom of the loop. */
+    subs            ip, ip, #1
+    vld1.u8         {q8-q9}, [r0], r1
+    VAR_SQR_SUM     q1, q14, q12, d16
+    VAR_SQR_SUM     q2, q15, q13, d17
+    VAR_SQR_SUM     q1, q12, q14, d18
+    VAR_SQR_SUM     q2, q13, q15, d19
+
+    vld1.u8         {q8-q9}, [r0], r1
+    VAR_SQR_SUM     q1, q14, q12, d16
+    VAR_SQR_SUM     q2, q15, q13, d17
+    VAR_SQR_SUM     q1, q12, q14, d18
+    VAR_SQR_SUM     q2, q13, q15, d19
+
+    vld1.u8         {q8-q9}, [r0], r1
+    VAR_SQR_SUM     q1, q14, q12, d16
+    VAR_SQR_SUM     q2, q15, q13, d17
+    VAR_SQR_SUM     q1, q12, q14, d18
+    VAR_SQR_SUM     q2, q13, q15, d19
+
+    vld1.u8         {q8-q9}, [r0], r1
+    VAR_SQR_SUM     q1, q14, q12, d16
+    VAR_SQR_SUM     q2, q15, q13, d17
+    VAR_SQR_SUM     q1, q12, q14, d18
+    VAR_SQR_SUM     q2, q13, q15, d19
+    bgt             .var32_loop
+
+    /* q14/q15 still hold the squares of the last 16 bytes: fold them in,
+     * then reduce q0 (byte sum) and q1+q2 (squares) to one 32-bit lane each,
+     * leaving d0 = { sum, sum of squares }. */
+    vpaddl.u16      q8, q14
+    vpaddl.u16      q9, q15
+    vadd.u32        q1, q1, q8
+    vadd.u16        d0, d0, d1
+    vadd.u32        q1, q1, q9
+    vadd.u32        q1, q1, q2
+    vpaddl.u16      d0, d0
+    vadd.u32        d2, d2, d3
+    vpadd.u32       d0, d0, d2
+
+    vmov            r0, r1, d0
+    bx              lr
+endfunc
+
+function x265_pixel_var_64x64_neon
+    sub             r1, #32
+    veor.u8         q0, q0
+    veor.u8         q1, q1
+    veor.u8         q2, q2
+    veor.u8         q3, q3
+    veor.u8         q14, q14
+    veor.u8         q15, q15
+    mov             ip, #16
+
+.var64_loop:
+    subs            ip, ip, #1
+    vld1.u8         {q8-q9}, [r0]!
+    VAR_SQR_SUM     q1, q14, q12, d16
+    VAR_SQR_SUM     q2, q15, q13, d17
+    VAR_SQR_SUM     q1, q12, q14, d18
+    VAR_SQR_SUM     q2, q13, q15, d19
+
+    vld1.u8         {q8-q9}, [r0], r1
+    VAR_SQR_SUM     q1, q14, q12, d16, 3
+    VAR_SQR_SUM     q2, q15, q13, d17, 3
+    VAR_SQR_SUM     q1, q12, q14, d18, 3
+    VAR_SQR_SUM     q2, q13, q15, d19, 3
+
+    vld1.u8         {q8-q9}, [r0]!
+    VAR_SQR_SUM     q1, q14, q12, d16
+    VAR_SQR_SUM     q2, q15, q13, d17
+    VAR_SQR_SUM     q1, q12, q14, d18
+    VAR_SQR_SUM     q2, q13, q15, d19
+
+    vld1.u8         {q8-q9}, [r0], r1
+    VAR_SQR_SUM     q1, q14, q12, d16, 3
+    VAR_SQR_SUM     q2, q15, q13, d17, 3

x265_2.0.tar.gz/source/common/arm/pixel-util.h Added

@@ -0,0 +1,92 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_PIXEL_UTIL_ARM_H
+#define X265_PIXEL_UTIL_ARM_H
+
+/* NEON sum / sum-of-squares routines implemented in pixel-util.S.
+ * pix points at the first row and stride is added to the pointer after each row.
+ * The 8x8, 16x16 and 32x32 implementations return the sum of the block's
+ * bytes in r0 and the sum of their squares in r1, i.e. the low and high
+ * 32 bits of the uint64_t result when r0 carries the low word
+ * (NOTE(review): assumes little-endian AAPCS; the tail of the 64x64 version
+ * is presumably the same - confirm). */
+uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
+uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
+uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
+uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
+
+void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+
+void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
+void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
+
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+
+int x265_pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
+int x265_pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
+int x265_pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
+int x265_pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
+int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
+int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
+int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
+
+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+
+void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
+void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
+
+void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
+
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
+
+#endif // ifndef X265_PIXEL_UTIL_ARM_H

x265_2.0.tar.gz/source/common/arm/pixel.h Added

@@ -0,0 +1,215 @@
+/*****************************************************************************
+ * pixel.h: ARM pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2003-2013 x264 project
+ * Copyright (C) 2013-2016 x265 project
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *          Loren Merritt <lorenm@u.washington.edu>
+ *          Fiona Glaser <fiona@x264.com>
+ *          Min Chen <chenm003@163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_I386_PIXEL_ARM_H
+#define X265_I386_PIXEL_ARM_H
+
+int x265_pixel_sad_4x4_armv6(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_4x8_armv6(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_4x16_armv6(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_8x4_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_8x8_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_8x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_8x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_16x4_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_16x8_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_16x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_16x12_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_16x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_16x64_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_32x8_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_32x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_32x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_32x64_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_32x24_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_64x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_64x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_64x64_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_64x48_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_12x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_24x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+int x265_pixel_sad_48x64_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+
+void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+
+void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+
+void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+
+sse_t x265_pixel_sse_pp_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+
+sse_t x265_pixel_sse_ss_4x4_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_ss_8x8_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_ss_16x16_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_ss_32x32_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_ss_64x64_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
+
+sse_t x265_pixel_ssd_s_4x4_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_8x8_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_16x16_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_32x32_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_64x64_neon(const int16_t* a, intptr_t dstride);
+
+void x265_pixel_sub_ps_4x4_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_sub_ps_8x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_sub_ps_32x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_sub_ps_64x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_sub_ps_4x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_sub_ps_8x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_sub_ps_16x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_sub_ps_32x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+
+// pixel_add_ps_<W>x<H> NEON entry points: pixel destination `a` (stride
+// dstride), a pixel source b0 (stride sstride0) and an int16_t source b1
+// (stride sstride1). Same size list as the sub_ps group.
+// NOTE(review): presumably a[] = clip(b0[] + b1[]) per sample -- confirm in the assembly.
+void x265_pixel_add_ps_4x4_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_8x8_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_16x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_32x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_64x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_4x8_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_8x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_16x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_32x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+
+// Plane copy from a uint8_t source to a pixel destination over a width x height
+// region, with a `shift` argument.
+// NOTE(review): presumably each sample is left-shifted by `shift` on the way --
+// confirm in the assembly.
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+
+// addAvg_<W>x<H> NEON entry points, one per block size. All share one signature:
+// two int16_t sources (src0/src1 with their strides) and one pixel destination.
+// NOTE(review): presumably the bi-prediction average of src0 and src1 written
+// to dst -- rounding/offset details are not visible here; confirm in the assembly.
+void x265_addAvg_4x4_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_4x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_4x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_8x4_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_8x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_8x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_8x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_12x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_16x4_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_16x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_16x12_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_16x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_16x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_16x64_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_24x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_32x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_32x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_32x24_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_32x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_32x64_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_48x64_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_64x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
+void x265_addAvg_64x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);

x265_2.0.tar.gz/source/common/arm/sad-a.S Added

@@ -0,0 +1,1356 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *          Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+sad12_mask:
+.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0
+
+.text
+
+/* sad4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
+ *
+ * r0   - dst
+ * r1   - dstStride
+ * r2   - src
+ * r3   - srcStride */
+
+/* SAD4_ARMV6 h: emits x265_pixel_sad_4x<h>_armv6.
+ * Each row is fetched as one 32-bit word (four bytes) from both blocks and
+ * fed to the USAD8/USADA8 byte-wise sum-of-absolute-differences instructions.
+ * Two row pairs are kept in flight (r4/r5 and r6/lr): the next row is loaded
+ * before the previous one is accumulated. The running total lives in ip and
+ * the final USADA8 writes the result to r0.
+ * Rows processed: 2 + 2*((h-2)/2), which equals h for the even heights
+ * instantiated below. */
+.macro SAD4_ARMV6 h
+function x265_pixel_sad_4x\h\()_armv6
+    push        {r4-r6,lr}
+    ldr         r4, [r2], r3
+    ldr         r5, [r0], r1
+    ldr         r6, [r2], r3
+    ldr         lr, [r0], r1
+    // First row starts the accumulator (USAD8 has no addend operand).
+    usad8       ip, r4, r5
+.rept (\h - 2)/2
+    ldr         r4, [r2], r3
+    ldr         r5, [r0], r1
+    usada8      ip, r6, lr, ip
+    ldr         r6, [r2], r3
+    ldr         lr, [r0], r1
+    usada8      ip, r4, r5, ip
+.endr
+    // Last pending row; the total goes straight into the return register.
+    usada8      r0, r6, lr, ip
+    // Restore r4-r6 and return by popping the saved lr into pc.
+    pop         {r4-r6,pc}
+endfunc
+.endm
+
+SAD4_ARMV6 4
+SAD4_ARMV6 8
+SAD4_ARMV6 16
+
+/* SAD8_NEON h: emits x265_pixel_sad_8x<h>_neon.
+ * r0/r1 and r2/r3 are used as (pointer, stride) pairs for the two blocks.
+ * Every row contributes eight absolute byte differences, accumulated in the
+ * eight 16-bit lanes of q1; the lanes are summed at the end and the total is
+ * returned in r0. */
+.macro SAD8_NEON h
+function x265_pixel_sad_8x\h\()_neon
+    vld1.8          d0, [r0], r1        // first row, block at r0
+    vld1.8          d1, [r2], r3        // first row, block at r2
+    vabdl.u8        q1, d0, d1
+
+.rept \h-1
+    vld1.8          d0, [r0], r1        // next row, block at r0
+    vld1.8          d1, [r2], r3        // next row, block at r2
+    vabal.u8        q1, d0, d1
+.endr
+
+    // Horizontal reduction: 8 lanes -> 4 -> 2 (16-bit), then widen to 32-bit.
+    vadd.u16        d2, d2, d3
+    vpadd.u16       d0, d2, d2
+    vpaddl.u16      d0, d0
+    vmov.u32        r0, d0[0]
+    bx              lr
+endfunc
+.endm
+
+SAD8_NEON 4
+SAD8_NEON 8
+SAD8_NEON 16
+SAD8_NEON 32
+
+/* SAD16_NEON h: emits x265_pixel_sad_16x<h>_neon.
+ * r0/r1 and r2/r3 are used as (pointer, stride) pairs for the two blocks.
+ * The first two rows are handled before the loop (VABDL initialises the
+ * accumulators q8/q9); the loop then consumes two rows per iteration for
+ * (h-2)/2 iterations, using r12 as the counter.
+ * q8 accumulates bytes 0-7 of each row, q9 bytes 8-15, as 16-bit lanes. */
+.macro SAD16_NEON h
+function x265_pixel_sad_16x\h\()_neon
+    vld1.8          {q0}, [r0], r1      // row 0
+    vld1.8          {q1}, [r2], r3
+    vld1.8          {q2}, [r0], r1      // row 1
+    vld1.8          {q3}, [r2], r3
+
+    vabdl.u8        q8, d0, d2
+    vabdl.u8        q9, d1, d3
+    vabal.u8        q8, d4, d6
+    vabal.u8        q9, d5, d7
+    mov             r12, #(\h-2)/2
+
+.loop_16x\h:
+
+    // The flags set here are consumed by the bne at the bottom of the loop.
+    subs            r12, #1
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q1}, [r2], r3
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vabal.u8        q8, d0, d2
+    vabal.u8        q9, d1, d3
+    vabal.u8        q8, d4, d6
+    vabal.u8        q9, d5, d7
+    bne             .loop_16x\h
+
+    vadd.u16        q8, q8, q9
+    // For 64 rows each lane can now hold up to 2*64*255 = 32640, so the 16-bit
+    // pairwise reduction used for the smaller heights could exceed 65535;
+    // that case widens to 32-bit lanes before reducing.
+.if \h == 64
+    vaddl.u16       q0, d16, d17
+    vpadd.u32       d0, d0, d1
+    vpadd.u32       d0, d0
+.else
+    vadd.u16        d16, d16, d17
+    vpadd.u16       d0, d16, d16
+    vpaddl.u16      d0, d0
+.endif
+    vmov.u32        r0, d0[0]
+    bx              lr
+endfunc
+.endm
+
+SAD16_NEON 4
+SAD16_NEON 8
+SAD16_NEON 16
+SAD16_NEON 12
+SAD16_NEON 32
+SAD16_NEON 64
+
+/* SAD32_NEON h: emits x265_pixel_sad_32x<h>_neon.
+ * r0/r1 and r2/r3 are used as (pointer, stride) pairs for the two blocks.
+ * Four 16-bit accumulators (q8-q11) each cover one 8-byte column group of the
+ * 32-byte row. The loop runs h/8 times (counter in r12) and its body is
+ * unrolled four times with two rows per unrolled step, i.e. 8 rows per
+ * iteration. The total is returned in r0. */
+.macro SAD32_NEON h
+function x265_pixel_sad_32x\h\()_neon
+    // Clear the four accumulators (x eor x = 0).
+    veor.u8         q8, q8
+    veor.u8         q9, q9
+    veor.u8         q10, q10
+    veor.u8         q11, q11
+    mov             r12, #\h/8
+
+.loop_32x\h:
+
+    // The flags set here are consumed by the bne after the unrolled body.
+    subs            r12, #1
+.rept 4
+    vld1.8          {q0, q1}, [r0], r1           // row 0
+    vld1.8          {q2, q3}, [r2], r3           // row 0
+    vld1.8          {q12, q13}, [r0], r1         // row 1
+    vld1.8          {q14, q15}, [r2], r3         // row 1
+
+    vabal.u8        q8, d0, d4
+    vabal.u8        q9, d1, d5
+    vabal.u8        q10, d2, d6
+    vabal.u8        q11, d3, d7
+
+    vabal.u8        q8, d24, d28
+    vabal.u8        q9, d25, d29
+    vabal.u8        q10, d26, d30
+    vabal.u8        q11, d27, d31
+.endr
+    bne             .loop_32x\h
+
+    vadd.u16        q8, q8, q9
+    vadd.u16        q10, q10, q11
+    // For 64 rows each lane can now hold up to 2*64*255 = 32640, so the 16-bit
+    // pairwise reduction used for the smaller heights could exceed 65535;
+    // that case widens to 32-bit lanes before reducing.
+.if \h == 64
+    vaddl.u16       q0, d16, d17
+    vpadd.u32       d0, d0, d1
+    vpaddl.u32      d0, d0
+
+    vaddl.u16       q1, d20, d21
+    vpadd.u32       d2, d2, d3
+    vpaddl.u32      d2, d2
+
+    vadd.u32        d0,d0,d2
+.else
+    vadd.u16        d16, d16, d17
+    vpadd.u16       d0, d16, d16
+    vpaddl.u16      d0, d0
+
+    vadd.u16        d20, d20, d21
+    vpadd.u16       d1, d20, d20
+    vpaddl.u16      d1, d1
+
+    vadd.u32        d0,d0,d1
+.endif
+    vmov.u32        r0,  d0[0]
+    bx              lr
+endfunc
+.endm
+
+SAD32_NEON 8

x265_2.0.tar.gz/source/common/arm/ssd-a.S Added

@@ -0,0 +1,469 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+
+.text
+
+
+/* Sum of squared byte differences over a 4x4 block.
+ * r0/r1 and r2/r3 are used as (pointer, stride) pairs for the two blocks.
+ * Each row is loaded as one 32-bit word replicated to both halves of the
+ * d register; only the low four differences (d4) are squared, so the
+ * duplicate copy in d5 is never counted. Loads for the next row are
+ * interleaved with the arithmetic of the current one. Result in r0. */
+function x265_pixel_sse_pp_4x4_neon
+    vld1.32     {d16[]}, [r0], r1
+    vld1.32     {d17[]}, [r2], r3
+    vsubl.u8    q2, d16, d17
+    vld1.32     {d16[]}, [r0], r1
+    // First row: VMULL initialises the 32-bit accumulator q0.
+    vmull.s16   q0, d4, d4
+    vld1.32     {d17[]}, [r2], r3
+
+    vsubl.u8    q2, d16, d17
+    vld1.32     {d16[]}, [r0], r1
+    vmlal.s16   q0, d4, d4
+    vld1.32     {d17[]}, [r2], r3
+
+    vsubl.u8    q2, d16, d17
+    vld1.32     {d16[]}, [r0], r1
+    vmlal.s16   q0, d4, d4
+    vld1.32     {d17[]}, [r2], r3
+
+    vsubl.u8    q2, d16, d17
+    vmlal.s16   q0, d4, d4
+    // Horizontal reduction of the four 32-bit lanes.
+    vadd.s32    d0, d0, d1
+    vpadd.s32   d0, d0, d0
+    vmov.32     r0, d0[0]
+    bx          lr
+endfunc
+
+/* Sum of squared byte differences over an 8x8 block.
+ * r0/r1 and r2/r3 are used as (pointer, stride) pairs for the two blocks.
+ * Row layout: one peeled first row (VMULL initialises q0), six rows in the
+ * .rept body, and one final row that performs no further loads. The load of
+ * the next row is issued while the current differences are being squared.
+ * Result in r0. */
+function x265_pixel_sse_pp_8x8_neon
+    vld1.64     {d16}, [r0], r1
+    vld1.64     {d17}, [r2], r3
+    vsubl.u8    q2, d16, d17
+    vld1.64     {d16}, [r0], r1
+    vmull.s16   q0, d4, d4
+    vmlal.s16   q0, d5, d5
+    vld1.64     {d17}, [r2], r3
+
+.rept 6
+    vsubl.u8    q2, d16, d17
+    vld1.64     {d16}, [r0], r1
+    vmlal.s16   q0, d4, d4
+    vmlal.s16   q0, d5, d5
+    vld1.64     {d17}, [r2], r3
+.endr
+    vsubl.u8    q2, d16, d17
+    vmlal.s16   q0, d4, d4
+    vmlal.s16   q0, d5, d5
+    // Horizontal reduction of the four 32-bit lanes.
+    vadd.s32    d0, d0, d1
+    vpadd.s32   d0, d0, d0
+    vmov.32     r0, d0[0]
+    bx          lr
+endfunc
+
+/* Sum of squared byte differences over a 16x16 block.
+ * r0/r1 and r2/r3 are used as (pointer, stride) pairs for the two blocks.
+ * Row layout: one peeled first row (VMULL initialises q0), fourteen rows in
+ * the .rept body, and one final row that performs no further loads.
+ * q2/q3 hold the widened differences of the low and high 8 bytes of a row.
+ * Result in r0. */
+function x265_pixel_sse_pp_16x16_neon
+    vld1.64     {d16-d17}, [r0], r1
+    vld1.64     {d18-d19}, [r2], r3
+    vsubl.u8    q2, d16, d18
+    vsubl.u8    q3, d17, d19
+    vld1.64     {d16-d17}, [r0], r1
+    vmull.s16   q0, d4, d4
+    vmlal.s16   q0, d5, d5
+    vld1.64     {d18-d19}, [r2], r3
+    vmlal.s16   q0, d6, d6
+    vmlal.s16   q0, d7, d7
+
+.rept 14
+    vsubl.u8    q2, d16, d18
+    vsubl.u8    q3, d17, d19
+    vld1.64     {d16-d17}, [r0], r1
+    vmlal.s16   q0, d4, d4
+    vmlal.s16   q0, d5, d5
+    vld1.64     {d18-d19}, [r2], r3
+    vmlal.s16   q0, d6, d6
+    vmlal.s16   q0, d7, d7
+.endr
+    vsubl.u8    q2, d16, d18
+    vsubl.u8    q3, d17, d19
+    vmlal.s16   q0, d4, d4
+    vmlal.s16   q0, d5, d5
+    vmlal.s16   q0, d6, d6
+    vmlal.s16   q0, d7, d7
+    // Horizontal reduction of the four 32-bit lanes.
+    vadd.s32    d0, d0, d1
+    vpadd.s32   d0, d0, d0
+    vmov.32     r0, d0[0]
+    bx          lr
+endfunc
+
+/* Sum of squared byte differences over a 32x32 block.
+ * r0/r1 and r2/r3 are used as (pointer, stride) pairs for the two blocks.
+ * The loop runs 8 times (counter in r12) with a body unrolled over 4 rows.
+ * Two 32-bit accumulators (q0, q1) are used alternately and merged after
+ * the loop. Result in r0. */
+function x265_pixel_sse_pp_32x32_neon
+    mov         r12, #8
+    // Clear both accumulators (x eor x = 0).
+    veor.u8     q0, q0
+    veor.u8     q1, q1
+
+.loop_sse_pp_32:
+    // The flags set here are consumed by the bne after the unrolled body.
+    subs        r12, #1
+.rept 4
+    vld1.64     {q8-q9}, [r0], r1
+    vld1.64     {q10-q11}, [r2], r3
+    vsubl.u8    q2, d16, d20
+    vsubl.u8    q3, d17, d21
+    vsubl.u8    q12, d18, d22
+    vsubl.u8    q13, d19, d23
+    vmlal.s16   q0, d4, d4
+    vmlal.s16   q1, d5, d5
+    vmlal.s16   q0, d6, d6
+    vmlal.s16   q1, d7, d7
+    vmlal.s16   q0, d24, d24
+    vmlal.s16   q1, d25, d25
+    vmlal.s16   q0, d26, d26
+    vmlal.s16   q1, d27, d27
+.endr
+    bne         .loop_sse_pp_32
+    // Merge the two accumulators, then reduce the four lanes to one.
+    vadd.s32    q0, q1
+    vadd.s32    d0, d0, d1
+    vpadd.s32   d0, d0, d0
+    vmov.32     r0, d0[0]
+    bx          lr
+endfunc
+
+/* Sum of squared byte differences over a 64x64 block.
+ * r0/r1 and r2/r3 are used as (pointer, stride) pairs for the two blocks.
+ * Each 64-byte row is read as two 32-byte halves: the first load uses
+ * writeback (!), which advances the pointer by the 32 bytes just read, and
+ * the second load post-increments by the stride. Both strides are therefore
+ * reduced by 32 up front so that the net advance per row is the original
+ * stride. The loop runs 16 times (counter in r12) with a body unrolled over
+ * 4 rows. Result in r0. */
+function x265_pixel_sse_pp_64x64_neon
+    sub         r1, #32
+    sub         r3, #32
+    mov         r12, #16
+    // Clear both accumulators (x eor x = 0).
+    veor.u8     q0, q0
+    veor.u8     q1, q1
+
+.loop_sse_pp_64:
+    // The flags set here are consumed by the bne after the unrolled body.
+    subs        r12, #1
+.rept 4
+    // First 32 bytes of the row.
+    vld1.64     {q8-q9}, [r0]!
+    vld1.64     {q10-q11}, [r2]!
+    vsubl.u8    q2, d16, d20
+    vsubl.u8    q3, d17, d21
+    vsubl.u8    q12, d18, d22
+    vsubl.u8    q13, d19, d23
+    vmlal.s16   q0, d4, d4
+    vmlal.s16   q1, d5, d5
+    vmlal.s16   q0, d6, d6
+    vmlal.s16   q1, d7, d7
+    vmlal.s16   q0, d24, d24
+    vmlal.s16   q1, d25, d25
+    vmlal.s16   q0, d26, d26
+    vmlal.s16   q1, d27, d27
+
+    // Second 32 bytes of the row; the post-increment moves to the next row.
+    vld1.64     {q8-q9}, [r0], r1
+    vld1.64     {q10-q11}, [r2], r3
+    vsubl.u8    q2, d16, d20
+    vsubl.u8    q3, d17, d21
+    vsubl.u8    q12, d18, d22
+    vsubl.u8    q13, d19, d23
+    vmlal.s16   q0, d4, d4
+    vmlal.s16   q1, d5, d5
+    vmlal.s16   q0, d6, d6
+    vmlal.s16   q1, d7, d7
+    vmlal.s16   q0, d24, d24
+    vmlal.s16   q1, d25, d25
+    vmlal.s16   q0, d26, d26
+    vmlal.s16   q1, d27, d27
+.endr
+    bne         .loop_sse_pp_64
+    // Merge the two accumulators, then reduce the four lanes to one.
+    vadd.s32    q0, q1
+    vadd.s32    d0, d0, d1
+    vpadd.s32   d0, d0, d0
+    vmov.32     r0, d0[0]
+    bx          lr
+endfunc
+
+function x265_pixel_sse_ss_4x4_neon
+    add         r1, r1

x265_1.9.tar.gz/source/common/common.cpp -> x265_2.0.tar.gz/source/common/common.cpp Changed

@@ -29,6 +29,8 @@
 #if _WIN32
 #include <sys/types.h>
 #include <sys/timeb.h>
+#include <io.h>
+#include <fcntl.h>
 #else
 #include <sys/time.h>
 #endif
@@ -139,6 +141,94 @@
     fputs(buffer, stderr);
 }
 
+#if _WIN32
+/* For Unicode filenames in Windows we convert UTF-8 strings to UTF-16 and we use _w functions.
+ * For other OS we do not make any changes. */
+/* Formats a log message and writes it to stderr, handling UTF-8 text when
+ * stderr is a Windows console.
+ *
+ * param  - when non-NULL, messages whose level is above param->logLevel are dropped
+ * caller - optional tag; when non-NULL the message is prefixed with "<caller> [<level>]: "
+ * level  - one of the X265_LOG_* values; anything else is labelled "unknown"
+ * fmt    - printf-style format string followed by its arguments
+ *
+ * The formatted text is limited to a 4096-byte buffer. */
+void general_log_file(const x265_param* param, const char* caller, int level, const char* fmt, ...)
+{
+    if (param && level > param->logLevel)
+        return;
+    const int bufferSize = 4096;
+    char buffer[bufferSize];
+    int p = 0;
+    const char* log_level;
+    switch (level)
+    {
+    case X265_LOG_ERROR:
+        log_level = "error";
+        break;
+    case X265_LOG_WARNING:
+        log_level = "warning";
+        break;
+    case X265_LOG_INFO:
+        log_level = "info";
+        break;
+    case X265_LOG_DEBUG:
+        log_level = "debug";
+        break;
+    case X265_LOG_FULL:
+        log_level = "full";
+        break;
+    default:
+        log_level = "unknown";
+        break;
+    }
+
+    // NOTE(review): this sprintf is not bounded by bufferSize; it relies on
+    // `caller` being a short tag. Confirm that all callers pass short strings.
+    if (caller)
+        p += sprintf(buffer, "%-4s [%s]: ", caller, log_level);
+    va_list arg;
+    va_start(arg, fmt);
+    // The message body is appended after the prefix, bounded by the space left.
+    vsnprintf(buffer + p, bufferSize - p, fmt, arg);
+    va_end(arg);
+
+    HANDLE console = GetStdHandle(STD_ERROR_HANDLE);
+    DWORD mode;
+    // GetConsoleMode succeeds only for a console handle; a redirected stderr
+    // takes the fputs branch below and receives the raw bytes.
+    if (GetConsoleMode(console, &mode))
+    {
+        wchar_t buf_utf16[bufferSize];
+        // With a source length of -1 the returned count includes the terminating
+        // null, hence the "- 1". A failed conversion (return 0) yields -1 and
+        // nothing is written.
+        int length_utf16 = MultiByteToWideChar(CP_UTF8, 0, buffer, -1, buf_utf16, sizeof(buf_utf16)/sizeof(wchar_t)) - 1;
+        // `mode` is reused here as the "characters written" output argument.
+        if (length_utf16 > 0)
+            WriteConsoleW(console, buf_utf16, length_utf16, &mode, NULL);
+    }
+    else
+        fputs(buffer, stderr);
+}
+
+/* fopen() replacement taking a UTF-8 file name: both the name and the mode
+ * string are converted to UTF-16 and passed to _wfopen().
+ * Returns NULL if either conversion fails (MB_ERR_INVALID_CHARS rejects
+ * invalid UTF-8; names that do not fit in MAX_PATH * 2 wide characters also
+ * fail), otherwise whatever _wfopen() returns. */
+FILE* x265_fopen(const char* fileName, const char* mode)
+{
+    wchar_t buf_utf16[MAX_PATH * 2], mode_utf16[16];
+
+    if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, fileName, -1, buf_utf16, sizeof(buf_utf16)/sizeof(wchar_t)) &&
+        MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, mode, -1, mode_utf16, sizeof(mode_utf16)/sizeof(wchar_t)))
+    {
+        return _wfopen(buf_utf16, mode_utf16);
+    }
+    return NULL;
+}
+
+/* unlink() replacement taking a UTF-8 file name: the name is converted to
+ * UTF-16 and passed to _wunlink().
+ * Returns -1 if the conversion fails, otherwise the result of _wunlink(). */
+int x265_unlink(const char* fileName)
+{
+    wchar_t buf_utf16[MAX_PATH * 2];
+
+    if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, fileName, -1, buf_utf16, sizeof(buf_utf16)/sizeof(wchar_t)))
+        return _wunlink(buf_utf16);
+
+    return -1;
+}
+
+/* rename() replacement taking UTF-8 file names: both names are converted to
+ * UTF-16 and passed to _wrename().
+ * Returns -1 if either conversion fails, otherwise the result of _wrename(). */
+int x265_rename(const char* oldName, const char* newName)
+{
+    wchar_t old_utf16[MAX_PATH * 2], new_utf16[MAX_PATH * 2];
+
+    if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, oldName, -1, old_utf16, sizeof(old_utf16)/sizeof(wchar_t)) &&
+        MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, newName, -1, new_utf16, sizeof(new_utf16)/sizeof(wchar_t)))
+    {
+        return _wrename(old_utf16, new_utf16);
+    }
+    return -1;
+}
+#endif
+
 double x265_ssim2dB(double ssim)
 {
     double inv_ssim = 1 - ssim;
@@ -177,10 +267,10 @@
     size_t fSize;
     char *buf = NULL;
 
-    FILE *fh = fopen(filename, "rb");
+    FILE *fh = x265_fopen(filename, "rb");
     if (!fh)
     {
-        x265_log(NULL, X265_LOG_ERROR, "unable to open file %s\n", filename);
+        x265_log_file(NULL, X265_LOG_ERROR, "unable to open file %s\n", filename);
         return NULL;
     }

x265_1.9.tar.gz/source/common/common.h -> x265_2.0.tar.gz/source/common/common.h Changed

@@ -322,6 +322,8 @@
 #define MAX_NUM_TR_COEFFS           MAX_TR_SIZE * MAX_TR_SIZE // Maximum number of transform coefficients, for a 32x32 transform
 #define MAX_NUM_TR_CATEGORIES       16                        // 32, 16, 8, 4 transform categories each for luma and chroma
 
+#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
+
 namespace X265_NS {
 
 enum { SAO_NUM_OFFSET = 4 };
@@ -402,7 +404,19 @@
 /* located in common.cpp */
 int64_t  x265_mdate(void);
 #define  x265_log(param, ...) general_log(param, "x265", __VA_ARGS__)
+#define  x265_log_file(param, ...) general_log_file(param, "x265", __VA_ARGS__)
 void     general_log(const x265_param* param, const char* caller, int level, const char* fmt, ...);
+#if _WIN32
+/* Windows: UTF-8 aware implementations, defined in common.cpp. */
+void     general_log_file(const x265_param* param, const char* caller, int level, const char* fmt, ...);
+FILE*    x265_fopen(const char* fileName, const char* mode);
+int      x265_unlink(const char* fileName);
+int      x265_rename(const char* oldName, const char* newName);
+#else
+/* Other platforms: forward directly to general_log() and the C library.
+ * level and fmt are carried inside __VA_ARGS__ (as in the x265_log macro) so
+ * that a call with a format string and no further arguments does not expand
+ * to a trailing comma, which would fail to compile on these platforms only. */
+#define  general_log_file(param, caller, ...) general_log(param, caller, __VA_ARGS__)
+#define  x265_fopen(fileName, mode) fopen(fileName, mode)
+#define  x265_unlink(fileName) unlink(fileName)
+#define  x265_rename(oldName, newName) rename(oldName, newName)
+#endif
 int      x265_exp2fix8(double x);
 
 double   x265_ssim2dB(double ssim);

x265_1.9.tar.gz/source/common/constants.cpp -> x265_2.0.tar.gz/source/common/constants.cpp Changed

@@ -555,18 +555,6 @@
     0x38, 
 };
 
-/* Contains how much to increment shared depth buffer for different ctu sizes to get next best depth
- * here, depth 0 = 64x64, depth 1 = 32x32, depth 2 = 16x16 and depth 3 = 8x8
- * if ctu = 64, depth buffer size is 256 combination of depth values 0, 1, 2, 3
- * if ctu = 32, depth buffer size is 64 combination of depth values 1, 2, 3
- * if ctu = 16, depth buffer size is 16 combination of depth values 2, 3 */
-const uint32_t g_depthInc[3][4] =
-{
-    { 16,  4,  0, 0},
-    { 64, 16,  4, 1},
-    {256, 64, 16, 4}
-};
-
 /* g_depthScanIdx [y][x] */
 const uint32_t g_depthScanIdx[8][8] =
 {
@@ -580,4 +568,236 @@
     {  42,  43,  46,  47,  58,  59,  62,  63,  }
 };
 
+/* Rec.2020 YUV to RGB Non-constant luminance */
+/* NOTE(review): presumably rows are the R, G, B outputs and columns multiply
+ * Y, U (Cb), V (Cr), i.e. R = Y + 1.4746*V, G = Y - 0.16455*U - 0.57135*V,
+ * B = Y + 1.8814*U -- confirm the index order at the use site. */
+const double g_YUVtoRGB_BT2020[3][3] = 
+{
+    {   1.00,   0.00,      1.47460,   },
+    {   1.00,  -0.16455,  -0.57135,   },
+    {   1.00,   1.88140,   0.00,      }
+};
+
+const double g_ST2084_PQTable[MAX_HDR_LEGAL_RANGE - MIN_HDR_LEGAL_RANGE + 1] = 
+{
+    0,
+    5.25912035416561E-05, 0.000170826479250824, 0.000342874260206259, 0.000565730978088069,
+    0.000838361593599196, 0.0011605708550711, 0.00153261170332205, 0.00195500928122658,
+    0.00242846920816411, 0.00295382484798614, 0.00353200479131171, 0.00416401171798929,
+    0.00485090808272845, 0.00559380610060962, 0.00639386055422149, 0.00725226351560689,
+    0.0081702404049783, 0.00914904700558975, 0.010189967177051, 0.0112943110883226,
+    0.0124634138437419, 0.0136986344106386, 0.0150013547814312, 0.0163729793201926,
+    0.0178149342559234, 0.0193286672936668, 0.0209156473211494, 0.022577364193536,
+    0.0243153285825585, 0.0261310718791221, 0.0280261461406398, 0.0300021240760516,
+    0.0320605990628007, 0.0342031851910785, 0.036431517331512, 0.0387472512230819,
+    0.0411520635786705, 0.0436476522060052, 0.046235736142162, 0.0489180558000865,
+    0.0516963731258075, 0.0545724717652363, 0.0575481572396137, 0.0606252571287911,
+    0.0638056212616694, 0.0670911219131892, 0.0704836540073949, 0.0739851353261047,
+    0.0775975067228409, 0.0813227323416811, 0.0851627998407477, 0.0891197206201265,
+    0.0931955300539647, 0.0973922877266004, 0.101712077672541, 0.106157008620188,
+    0.110729214239187, 0.115430853391267, 0.120264110384523, 0.125231195231086,
+    0.130334343908053, 0.135575818621706, 0.140957908074883, 0.146482927737596,
+    0.152153220120717, 0.157971155052834, 0.163939129960184, 0.170059570149691,
+    0.176334929095073, 0.182767688726043, 0.189360359720598, 0.196115481800328,
+    0.203035624028883, 0.210123385113499, 0.21738139370961, 0.224812308728624,
+    0.232418819648774, 0.240203646829142, 0.248169541826838, 0.256319287717358,
+    0.264655699418179, 0.273181624015456, 0.281899941094164, 0.29081356307129,
+    0.299925435532481, 0.309238537571936, 0.318755882135647, 0.32848051636804,
+    0.338415521962, 0.34856401551231, 0.358929148872555, 0.369514109515577,
+    0.380322120897342, 0.391356442824469, 0.402620371825233, 0.414117241524302,
+    0.425850423021013, 0.437823325271459, 0.450039395474131, 0.4625021194595,
+    0.475215022083238, 0.488181667623337, 0.501405660181076, 0.514890644085913,
+    0.528640304304275, 0.542658366852319, 0.556948599212766, 0.571514810755682,
+    0.58636085316357, 0.601490620860234, 0.616908051444177, 0.632617126126042,
+    0.648621870170268, 0.664926353341107, 0.681534690353104, 0.6984510413256,
+    0.715679612242097, 0.733224655413817, 0.751090469947712, 0.769281402219399,
+    0.78780184635024, 0.806656244689427, 0.82584908830055, 0.84538491745295,
+    0.865268322117971, 0.885503942469945, 0.906096469391926, 0.927050644986733,
+    0.948371263092526, 0.970063169803824, 0.99213126399724, 1.01458049786256,
+    1.03741587743901, 1.06064246315667, 1.08426537038311, 1.10828976997558,
+    1.13272088883845, 1.1575640104859, 1.18282447561067, 1.20850768265765,
+    1.23461908840365, 1.26116420854251, 1.28814861827608, 1.31557795291099,
+    1.34345790846097, 1.37179424225547, 1.40059277355414, 1.42985938416685,
+    1.45960001908056, 1.48982068709166, 1.52052746144494, 1.55172648047831,
+    1.58342394827458, 1.61562613531883, 1.6483393791628, 1.68157008509547,
+    1.71532472682031, 1.74960984713914, 1.78443205864284, 1.81979804440872,
+    1.85571455870433, 1.8921884276992, 1.92922655018235, 1.9668358982877,
+    2.0050235182263, 2.04379653102551, 2.0831621332761, 2.12312759788576,
+    2.16370027484092, 2.20488759197549, 2.2466970557472, 2.28913625202187,
+    2.33221284686502, 2.37593458734142, 2.42030930232274, 2.46534490330251,
+    2.51104938521982, 2.55743082729067, 2.60449739384781, 2.65225733518805,
+    2.70071898842928, 2.74989077837451, 2.79978121838576, 2.85039891126499,
+    2.90175255014517, 2.95385091938954, 3.00670289549934, 3.06031744803115,
+    3.11470364052283, 3.16987063142876, 3.22582767506471, 3.2825841225609,
+    3.3401494228253, 3.39853312351689, 3.45774487202715, 3.51779441647257,
+    3.57869160669604, 3.64044639527875, 3.7030688385618, 3.76656909767725,
+    3.83095743959148, 3.89624423815599, 3.96243997517042, 4.02955524145598,
+    4.09760073793895, 4.16658727674518, 4.2365257823051, 4.30742729247016,
+    4.37930295964014, 4.45216405190141, 4.52602195417663, 4.60088816938553,
+    4.67677431961831, 4.75369214731843, 4.83165351647993, 4.91067041385396,
+    4.99075495016979, 5.07191936136577, 5.15417600983301, 5.23753738567282,
+    5.32201610796449, 5.40762492604782, 5.49437672081637, 5.58228450602463,
+    5.67136142960816, 5.76162077501684, 5.85307596256082, 5.94574055077076,
+    6.03962823777015, 6.13475286266291, 6.2311284069342, 6.32876899586396,
+    6.42768889995753, 6.5279025363866, 6.62942447044656, 6.73226941703026,
+    6.83645224211186, 6.94198796425035, 7.04889175610325, 7.15717894596024,
+    7.2668650192892, 7.37796562029657, 7.49049655350635, 7.60447378535363,
+    7.71991344579293, 7.83683182992318, 7.95524539963073, 8.07517078524564,
+    8.19662478721649, 8.31962437780235, 8.44418670277909, 8.57032908316786,
+    8.69806901697162, 8.82742418094208, 8.95841243235119, 9.09105181078918,
+    9.22536053997842, 9.36135702960081, 9.4990598771529, 9.63848786980913,
+    9.77965998631185, 9.92259539887546, 10.0673134751131, 10.2138337799773,
+    10.3621760777285, 10.5123603339148, 10.6644067173761, 10.8183356022682,
+    10.9741675701064, 11.1319234118292, 11.2916241298841, 11.4532909403319,
+    11.6169452749761, 11.782608783511, 11.9503033356888, 12.120051023515,
+    12.2918741634627, 12.4657952987048, 12.6418372013776, 12.8200228748588,
+    13.0003755560757, 13.1829187178276, 13.367676071144, 13.5546715676512,
+    13.7439294019804, 13.9354740141834, 14.1293300921851, 14.3255225742508,
+    14.5240766514895, 14.7250177703705, 14.9283716352778, 15.1341642110757,
+    15.3424217257167, 15.5531706728631, 15.7664378145379, 15.9822501838117,
+    16.2006350874992, 16.4216201089027, 16.6452331105667, 16.8715022370722,
+    17.1004559178516, 17.3321228700381, 17.5665321013393, 17.8037129129401,
+    18.0436949024415, 18.2865079668192, 18.5321823054235, 18.7807484229967,
+    19.0322371327346, 19.2866795593684, 19.5441071422852, 19.8045516386728,
+    20.068045126707, 20.3346200087623, 20.6043090146575, 20.8771452049349,
+    21.1531619741772, 21.4323930543496, 21.7148725181833, 22.0006347825899,
+    22.2897146121093, 22.5821471224015, 22.8779677837589, 23.1772124246723,
+    23.4799172354157, 23.7861187716811, 24.0958539582449, 24.4091600926726,
+    24.7260748490581, 25.0466362818137, 25.3708828294739, 25.6988533185695,
+    26.0305869675189, 26.3661233905639, 26.7055026017538, 27.0487650189598,
+    27.3959514679386, 27.7471031864343, 28.1022618283194, 28.4614694677879,
+    28.8247686035749, 29.1922021632471, 29.5638135074984, 29.9396464345297,
+    30.3197451844465, 30.7041544437129, 31.0929193496474, 31.4860854949729,
+    31.8836989324014, 32.2858061792735, 32.6924542222466, 33.1036905220286,
+    33.5195630181606, 33.9401201338504, 34.3654107808513, 34.7954843644001,
+    35.2303907882032, 35.6701804594619, 36.1149042939698, 36.5646137212482,
+    37.0193606897411, 37.4791976720634, 37.944177670299, 38.4143542213633,
+    38.8897814024065, 39.3705138362898, 39.8566066971106, 40.3481157157767,
+    40.8450971856484, 41.3476079682522, 41.8557054990105, 42.369447793091,
+    42.8888934512647, 43.4141016658423, 43.9451322266965, 44.4820455273072,
+    45.0249025708978, 45.57376497661, 46.128694985791, 46.6897554682848,
+    47.257009928828, 47.8305225135037, 48.4103580162663, 48.9965818855272,
+    49.589260230802, 50.1884598294566, 50.794248133489, 51.4066932764077,
+    52.0258640801652, 52.6518300621766, 53.2846614424041, 53.9244291505136,
+    54.5712048331156, 55.2250608610794, 55.8860703369173, 56.5543071022513,
+    57.2298457453516, 57.9127616087739, 58.6031307970611, 59.3010301845114,
+    60.0065374230609, 60.7197309502355, 61.4406899971675, 62.1694945967356,
+    62.9062255917496, 63.6509646432403, 64.4037942388625, 65.1647977013236,
+    65.9340591969731, 66.7116637444152, 67.4976972232724, 68.2922463830112,
+    69.0953988518382, 69.9072431457598, 70.7278686776501, 71.5573657664994,
+    72.3958256466906, 73.2433404774142, 74.1000033521872, 74.9659083084248,
+    75.8411503371909, 76.7258253929696, 77.6200304036002, 78.5238632802992,
+    79.4374229277768, 80.3608092544678, 81.2941231828966, 82.2374666600933,
+    83.1909426682048, 84.154655235138, 85.1287094453491, 86.1132114507694,
+    87.108268481825, 88.1139888585565, 89.1304820019001, 90.1578584450571,
+    91.1962298449948, 92.2457089940652, 93.3064098317639, 94.3784474565997,
+    95.4619381380949, 96.5569993289116, 97.6637496771184, 98.7823090385655,
+    99.9127984894415, 101.055340338899, 102.210058141845, 103.377076711919,
+    104.556522134513, 105.748521780005, 106.953204317117, 108.170699726403,
+    109.401139313892, 110.644655724874, 111.901382957862, 113.171456378648,
+    114.455012734562, 115.752190168864, 117.063128235285, 118.387967912751,
+    119.726851620228, 121.079923231788, 122.447328091724, 123.829213029981,
+    125.225726377642, 126.637017982633, 128.063239225529, 129.504543035659,
+    130.961083907258, 132.43301791588, 133.920502734926, 135.423697652396,
+    136.942763587828, 138.477863109372, 140.029160451099, 141.596821530472,
+    143.181013966024, 144.781907095212, 146.399671992475, 148.034481487503,
+    149.686510183665, 151.355934476676, 153.042932573466, 154.747684511235,
+    156.470372176717, 158.211179325695, 159.970291602654, 161.747896560765,
+    163.544183681914, 165.359344397174, 167.193572107279, 169.047062203492,
+    170.920012088617, 172.812621198221, 174.725091022243, 176.657625126586,
+    178.610429175187, 180.583710952171, 182.577680384379, 184.59254956399,
+    186.628532771569, 188.685846499193, 190.764709473972, 192.865342681753,
+    194.987969391112, 197.13281517763, 199.300107948348, 201.490077966701,
+    203.702957877374, 205.938982731875, 208.198390014006, 210.481419665809,
+    212.788314113849, 215.119318295558, 217.474679686168, 219.854648325694,
+    222.259476846381, 224.689420500319, 227.144737187562, 229.625687484264,
+    232.132534671514, 234.665544764103, 237.224986539876, 239.811131569336,
+    242.424254245529, 245.064631814346, 247.73254440507, 250.428275061399,
+    253.152109772633, 255.904337505438, 258.685250235678, 261.49514298094,
+    264.334313833161, 267.203063991664, 270.101697796781, 273.03052276345,
+    275.989849615675, 278.979992320954, 282.001268125309, 285.053997588697,
+    288.138504620796, 291.255116517118, 294.404163995707, 297.585981234071,
+    300.800905906628, 304.049279222569, 307.331445964095, 310.647754525259,
+    313.998556950887, 317.384208976364, 320.805070067649, 324.26150346164,
+    327.753876207298, 331.28255920701, 334.84792725845, 338.450359096983,
+    342.090237438443, 345.767949022632, 349.483884657022, 353.238439261111,
+    357.032011911288, 360.865005886229, 364.73782871259, 368.650892211681,
+    372.604612546163, 376.59941026756, 380.635710364328, 384.713942310386,
+    388.83454011424, 392.997942368521, 397.20459230049, 401.454937822634,
+    405.749431584178, 410.088531023082, 414.47269841859, 418.902400944533,
+    423.378110722949, 427.900304878816, 432.469465594816, 437.086080167171,
+    441.750641062068, 446.463645972511, 451.225597876033, 456.037005092914,
+    460.89838134554, 465.81024581748, 470.773123214509, 475.787543825096,
+    480.854043582649, 485.973164127686, 491.14545287122, 496.371463058725,
+    501.651753834779, 506.986890308486, 512.377443619739, 517.823991006384,
+    523.32711587159, 528.887407852831, 534.505462890955, 540.181883300517,
+    545.917277840779, 551.712261787277, 557.567457004939, 563.48349202123,
+    569.461002100643, 575.500629320033, 581.603022644652, 587.76883800521,
+    593.998738375827, 600.29339385279, 606.653481734616, 613.07968660232,
+    619.572700401503, 626.133222524762, 632.761959895347, 639.459627051767,
+    646.226946233466, 653.064647467273, 659.973468655012, 666.954155662449,
+    674.007462408703, 681.134150957274, 688.334991607664, 695.610762988527,
+    702.962252151562, 710.390254666907, 717.895574719168, 725.479025205175,
+    733.141427832198, 740.883613218127, 748.706420992262, 756.610699897378,
+    764.597307893424, 772.667112261926, 780.820989711908, 789.059826487117,
+    797.384518474445, 805.79597131351, 814.295100508111, 822.882831538009,
+    831.560099973222, 840.327851588798, 849.187042481472, 858.138639187298,
+    867.183618801265, 876.322969097945, 885.557688653527, 894.88878696958,
+    904.317284598324, 913.844213269149, 923.470616016881, 933.197547311661,
+    943.02607318998, 952.957271387842, 962.99223147528, 973.13205499233,
+    983.377855587028, 993.730759155025, 1004.19190398011, 1014.7624408779,
+    1025.44353334027, 1036.23635768138, 1047.14210318612, 1058.16197226031,

x265_1.9.tar.gz/source/common/constants.h -> x265_2.0.tar.gz/source/common/constants.h Changed

x265_1.9.tar.gz/source/common/contexts.h -> x265_2.0.tar.gz/source/common/contexts.h Changed

@@ -117,196 +117,8 @@
 #define sbacGetEntropyBits(S, V) (g_entropyBits[(S) ^ (V)])
 #define sbacGetEntropyBitsTrm(V) (g_entropyBits[126 ^ (V)])
 
-#define MAX_NUM_CHANNEL_TYPE     2
-
 static const uint32_t ctxCbf[3][5] = { { 1, 0, 0, 0, 0 }, { 2, 3, 4, 5, 6 }, { 2, 3, 4, 5, 6 } };
-static const uint32_t significanceMapContextSetStart[MAX_NUM_CHANNEL_TYPE][3] = { { 0,  9, 21 }, { 0,  9, 12 } };
-static const uint32_t significanceMapContextSetSize[MAX_NUM_CHANNEL_TYPE][3]  = { { 9, 12,  6 }, { 9,  3,  3 } };
-static const uint32_t nonDiagonalScan8x8ContextOffset[MAX_NUM_CHANNEL_TYPE]   = {  6, 0  };
-static const uint32_t notFirstGroupNeighbourhoodContextOffset[MAX_NUM_CHANNEL_TYPE] = { 3, 0 };
-
-// initial probability for cu_transquant_bypass flag
-static const uint8_t INIT_CU_TRANSQUANT_BYPASS_FLAG[3][NUM_TQUANT_BYPASS_FLAG_CTX] =
-{
-    { 154 },
-    { 154 },
-    { 154 },
-};
-
-// initial probability for split flag
-static const uint8_t INIT_SPLIT_FLAG[3][NUM_SPLIT_FLAG_CTX] =
-{
-    { 107,  139,  126, },
-    { 107,  139,  126, },
-    { 139,  141,  157, },
-};
-
-static const uint8_t INIT_SKIP_FLAG[3][NUM_SKIP_FLAG_CTX] =
-{
-    { 197,  185,  201, },
-    { 197,  185,  201, },
-    { CNU,  CNU,  CNU, },
-};
-
-static const uint8_t INIT_MERGE_FLAG_EXT[3][NUM_MERGE_FLAG_EXT_CTX] =
-{
-    { 154, },
-    { 110, },
-    { CNU, },
-};
-
-static const uint8_t INIT_MERGE_IDX_EXT[3][NUM_MERGE_IDX_EXT_CTX] =
-{
-    { 137, },
-    { 122, },
-    { CNU, },
-};
-
-static const uint8_t INIT_PART_SIZE[3][NUM_PART_SIZE_CTX] =
-{
-    { 154,  139,  154, 154 },
-    { 154,  139,  154, 154 },
-    { 184,  CNU,  CNU, CNU },
-};
-
-static const uint8_t INIT_PRED_MODE[3][NUM_PRED_MODE_CTX] =
-{
-    { 134, },
-    { 149, },
-    { CNU, },
-};
-
-static const uint8_t INIT_INTRA_PRED_MODE[3][NUM_ADI_CTX] =
-{
-    { 183, },
-    { 154, },
-    { 184, },
-};
-
-static const uint8_t INIT_CHROMA_PRED_MODE[3][NUM_CHROMA_PRED_CTX] =
-{
-    { 152,  139, },
-    { 152,  139, },
-    {  63,  139, },
-};
-
-static const uint8_t INIT_INTER_DIR[3][NUM_INTER_DIR_CTX] =
-{
-    {  95,   79,   63,   31,  31, },
-    {  95,   79,   63,   31,  31, },
-    { CNU,  CNU,  CNU,  CNU, CNU, },
-};
-
-static const uint8_t INIT_MVD[3][NUM_MV_RES_CTX] =
-{
-    { 169,  198, },
-    { 140,  198, },
-    { CNU,  CNU, },
-};
-
-static const uint8_t INIT_REF_PIC[3][NUM_REF_NO_CTX] =
-{
-    { 153,  153 },
-    { 153,  153 },
-    { CNU,  CNU },
-};
-
-static const uint8_t INIT_DQP[3][NUM_DELTA_QP_CTX] =
-{
-    { 154,  154,  154, },
-    { 154,  154,  154, },
-    { 154,  154,  154, },
-};
-
-static const uint8_t INIT_QT_CBF[3][NUM_QT_CBF_CTX] =
-{
-    { 153,  111,  149,   92,  167,  154,  154 },
-    { 153,  111,  149,  107,  167,  154,  154 },
-    { 111,  141,   94,  138,  182,  154,  154 },
-};
-
-static const uint8_t INIT_QT_ROOT_CBF[3][NUM_QT_ROOT_CBF_CTX] =
-{
-    {  79, },
-    {  79, },
-    { CNU, },
-};
-
-static const uint8_t INIT_LAST[3][NUM_CTX_LAST_FLAG_XY] =
-{
-    { 125,  110,  124,  110,   95,   94,  125,  111,  111,   79,  125,  126,  111,  111,   79,
-      108,  123,   93 },
-    { 125,  110,   94,  110,   95,   79,  125,  111,  110,   78,  110,  111,  111,   95,   94,
-      108,  123,  108 },
-    { 110,  110,  124,  125,  140,  153,  125,  127,  140,  109,  111,  143,  127,  111,   79,
-      108,  123,   63 },
-};
-
-static const uint8_t INIT_SIG_CG_FLAG[3][2 * NUM_SIG_CG_FLAG_CTX] =
-{
-    { 121,  140,
-      61,  154, },
-    { 121,  140,
-      61,  154, },
-    {  91,  171,
-       134,  141, },
-};
-
-static const uint8_t INIT_SIG_FLAG[3][NUM_SIG_FLAG_CTX] =
-{
-    { 170,  154,  139,  153,  139,  123,  123,   63,  124,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  138,  138,  122,  121,  122,  121,  167,  151,  183,  140,  151,  183,  140,  },
-    { 155,  154,  139,  153,  139,  123,  123,   63,  153,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  123,  123,  107,  121,  107,  121,  167,  151,  183,  140,  151,  183,  140,  },
-    { 111,  111,  125,  110,  110,   94,  124,  108,  124,  107,  125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  140,  139,  182,  182,  152,  136,  152,  136,  153,  136,  139,  111,  136,  139,  111,  },
-};
-
-static const uint8_t INIT_ONE_FLAG[3][NUM_ONE_FLAG_CTX] =
-{
-    { 154,  196,  167,  167,  154,  152,  167,  182,  182,  134,  149,  136,  153,  121,  136,  122,  169,  208,  166,  167,  154,  152,  167,  182, },
-    { 154,  196,  196,  167,  154,  152,  167,  182,  182,  134,  149,  136,  153,  121,  136,  137,  169,  194,  166,  167,  154,  167,  137,  182, },
-    { 140,   92,  137,  138,  140,  152,  138,  139,  153,   74,  149,   92,  139,  107,  122,  152,  140,  179,  166,  182,  140,  227,  122,  197, },
-};
-
-static const uint8_t INIT_ABS_FLAG[3][NUM_ABS_FLAG_CTX] =
-{
-    { 107,  167,   91,  107,  107,  167, },
-    { 107,  167,   91,  122,  107,  167, },
-    { 138,  153,  136,  167,  152,  152, },
-};
-
-static const uint8_t INIT_MVP_IDX[3][NUM_MVP_IDX_CTX] =
-{
-    { 168 },
-    { 168 },
-    { CNU },
-};
-
-static const uint8_t INIT_SAO_MERGE_FLAG[3][NUM_SAO_MERGE_FLAG_CTX] =
-{
-    { 153,  },
-    { 153,  },
-    { 153,  },
-};
-
-static const uint8_t INIT_SAO_TYPE_IDX[3][NUM_SAO_TYPE_IDX_CTX] =
-{
-    { 160, },
-    { 185, },
-    { 200, },
-};
-
-static const uint8_t INIT_TRANS_SUBDIV_FLAG[3][NUM_TRANS_SUBDIV_FLAG_CTX] =
-{
-    { 224,  167,  122, },
-    { 124,  138,   94, },
-    { 153,  138,  138, },
-};
 
-static const uint8_t INIT_TRANSFORMSKIP_FLAG[3][2 * NUM_TRANSFORMSKIP_FLAG_CTX] =
-{
-    { 139,  139 },
-    { 139,  139 },
-    { 139,  139 },
-};
 }
 
 #endif // ifndef X265_CONTEXTS_H

x265_1.9.tar.gz/source/common/cpu.cpp -> x265_2.0.tar.gz/source/common/cpu.cpp Changed

x265_1.9.tar.gz/source/common/cudata.cpp -> x265_2.0.tar.gz/source/common/cudata.cpp Changed

@@ -480,7 +480,7 @@
 }
 
 /* The reverse of copyToPic, called only by encodeResidue */
-void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp)
+void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp, bool copyQp)
 {
     m_encData       = ctu.m_encData;
     m_slice         = ctu.m_slice;
@@ -491,7 +491,8 @@
     m_numPartitions = cuGeom.numPartitions;
 
     /* copy out all prediction info for this part */
-    m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU);
+    if (copyQp) m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU);
+
     m_partCopy(m_log2CUSize,   ctu.m_log2CUSize + m_absIdxInCTU);
     m_partCopy(m_lumaIntraDir, ctu.m_lumaIntraDir + m_absIdxInCTU);
     m_partCopy(m_tqBypass,     ctu.m_tqBypass + m_absIdxInCTU);
@@ -526,7 +527,7 @@
 }
 
 /* Only called by encodeResidue, these fields can be modified during inter/intra coding */
-void CUData::updatePic(uint32_t depth) const
+void CUData::updatePic(uint32_t depth, int picCsp) const
 {
     CUData& ctu = *m_encData->getPicCTU(m_cuAddr);
 
@@ -540,7 +541,7 @@
     uint32_t tmpY2 = m_absIdxInCTU << (LOG2_UNIT_SIZE * 2);
     memcpy(ctu.m_trCoeff[0] + tmpY2, m_trCoeff[0], sizeof(coeff_t)* tmpY);
 
-    if (ctu.m_chromaFormat != X265_CSP_I400)
+    if (ctu.m_chromaFormat != X265_CSP_I400 && picCsp != X265_CSP_I400)
     {
         m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]);
         m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]);
@@ -2088,6 +2089,7 @@
                 cu->absPartIdx = g_depthScanIdx[yOffset][xOffset] * 4;
                 cu->numPartitions = (NUM_4x4_PARTITIONS >> ((g_maxLog2CUSize - cu->log2CUSize) * 2));
                 cu->depth = g_log2Size[maxCUSize] - log2CUSize;
+                cu->geomRecurId = cuIdx;
 
                 cu->flags = 0;
                 CU_SET_FLAG(cu->flags, CUGeom::PRESENT, presentFlag);

x265_1.9.tar.gz/source/common/cudata.h -> x265_2.0.tar.gz/source/common/cudata.h Changed

@@ -87,6 +87,7 @@
     uint32_t numPartitions; // Number of 4x4 blocks in the CU
     uint32_t flags;         // CU flags.
     uint32_t depth;         // depth of this CU relative from CTU
+    uint32_t geomRecurId;   // Unique geom id from 0 to MAX_GEOMS - 1 for every depth
 };
 
 struct MVField
@@ -222,8 +223,8 @@
     void     copyToPic(uint32_t depth) const;
 
     /* RD-0 methods called only from encodeResidue */
-    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp);
-    void     updatePic(uint32_t depth) const;
+    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp, bool copyQp = true);
+    void     updatePic(uint32_t depth, int picCsp) const;
 
     void     setPartSizeSubParts(PartSize size)    { m_partSet(m_partSize, (uint8_t)size); }
     void     setPredModeSubParts(PredMode mode)    { m_partSet(m_predMode, (uint8_t)mode); }
@@ -246,7 +247,7 @@
     void     setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx);
 
     uint8_t  getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
-    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { if (m_chromaFormat == X265_CSP_I400) return m_cbf[0][absPartIdx] || false; else { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx];} }
+    bool     getQtRootCbf(uint32_t absPartIdx) const                             { return (m_cbf[0][absPartIdx] || ((m_chromaFormat != X265_CSP_I400) && (m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]))); }
     int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
     uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)[2], uint8_t* candDir) const;
     void     clipMv(MV& outMV) const;

x265_1.9.tar.gz/source/common/deblock.cpp -> x265_2.0.tar.gz/source/common/deblock.cpp Changed

@@ -319,27 +319,6 @@
     }
 }
 
-/* Deblocking of one line/column for the chrominance component
- * \param src     pointer to picture data
- * \param offset  offset value for picture data
- * \param tc      tc value
- * \param maskP   indicator to disable filtering on partP
- * \param maskQ   indicator to disable filtering on partQ */
-static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
-    {
-        int16_t m4  = (int16_t)src[0];
-        int16_t m3  = (int16_t)src[-offset];
-        int16_t m5  = (int16_t)src[offset];
-        int16_t m2  = (int16_t)src[-offset * 2];
-
-        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
-        src[-offset] = x265_clip(m3 + (delta & maskP));
-        src[0] = x265_clip(m4 - (delta & maskQ));
-    }
-}
-
 void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[])
 {
     PicYuv* reconPic = cuQ->m_encData->m_reconPic;
@@ -517,7 +496,7 @@
             int32_t tc = s_tcTable[indexTC] << bitdepthShift;
             pixel* srcC = srcChroma[chromaIdx];
 
-            pelFilterChroma(srcC + unitOffset, srcStep, offset, tc, maskP, maskQ);
+            primitives.pelFilterChroma[dir](srcC + unitOffset, srcStep, offset, tc, maskP, maskQ);
         }
     }
 }

x265_1.9.tar.gz/source/common/frame.cpp -> x265_2.0.tar.gz/source/common/frame.cpp Changed

@@ -42,12 +42,14 @@
     m_prev = NULL;
     m_param = NULL;
     memset(&m_lowres, 0, sizeof(m_lowres));
+    m_rcData = NULL;
 }
 
 bool Frame::create(x265_param *param, float* quantOffsets)
 {
     m_fencPic = new PicYuv;
     m_param = param;
+    CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1);
 
     if (m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
         m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode))
@@ -64,14 +66,17 @@
         return true;
     }
     return false;
+fail:
+    return false;
 }
 
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
 {
     m_encData = new FrameData;
     m_reconPic = new PicYuv;
+    m_param = param;
     m_encData->m_reconPic = m_reconPic;
-    bool ok = m_encData->create(*param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
+    bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
     if (ok)
     {
         /* initialize right border of m_reconpicYuv as SAO may read beyond the
@@ -139,4 +144,5 @@
     }
 
     m_lowres.destroy();
+    X265_FREE(m_rcData);
 }

x265_1.9.tar.gz/source/common/frame.h -> x265_2.0.tar.gz/source/common/frame.h Changed

@@ -37,6 +37,27 @@
 
 #define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B)
 
+/* Ratecontrol statistics */
+struct RcStats
+{
+    double   qpaRc;
+    double   qpAq;
+    double   qRceq;
+    double   qpNoVbv;
+    double   newQScale;
+    double   iCuCount;
+    double   pCuCount;
+    double   skipCuCount;
+    double   qScale;
+    int      mvBits;
+    int      miscBits;
+    int      coeffBits;
+    int      poc;
+    int      encodeOrder;
+    int      sliceType;
+    int      keptAsRef;
+};
+
 class Frame
 {
 public:
@@ -49,6 +70,7 @@
     /* Data associated with x265_picture */
     PicYuv*                m_fencPic;
     int                    m_poc;
+    int                    m_encodeOrder;
     int64_t                m_pts;                // user provided presentation time stamp
     int64_t                m_reorderedPts;
     int64_t                m_dts;
@@ -71,6 +93,7 @@
     Frame*                 m_prev;
     x265_param*            m_param;              // Points to the latest param set for the frame.
     x265_analysis_data     m_analysisData;
+    RcStats*               m_rcData;
     Frame();
 
     bool create(x265_param *param, float* quantOffsets);

x265_1.9.tar.gz/source/common/framedata.cpp -> x265_2.0.tar.gz/source/common/framedata.cpp Changed

x265_1.9.tar.gz/source/common/framedata.h -> x265_2.0.tar.gz/source/common/framedata.h Changed

x265_1.9.tar.gz/source/common/ipfilter.cpp -> x265_2.0.tar.gz/source/common/ipfilter.cpp Changed

x265_1.9.tar.gz/source/common/loopfilter.cpp -> x265_2.0.tar.gz/source/common/loopfilter.cpp Changed

@@ -27,7 +27,6 @@
 #include "primitives.h"
 
 #define PIXEL_MIN 0
-#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
 
 namespace {
 
@@ -158,6 +157,27 @@
         src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
     }
 }
+
+/* Deblocking of one line/column for the chrominance component
+* \param src     pointer to picture data
+* \param offset  offset value for picture data
+* \param tc      tc value
+* \param maskP   indicator to disable filtering on partP
+* \param maskQ   indicator to disable filtering on partQ */
+static void pelFilterChroma_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
+{
+    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
+    {
+        int16_t m4 = (int16_t)src[0];
+        int16_t m3 = (int16_t)src[-offset];
+        int16_t m5 = (int16_t)src[offset];
+        int16_t m2 = (int16_t)src[-offset * 2];
+
+        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
+        src[-offset]  = x265_clip(m3 + (delta & maskP));
+        src[0]        = x265_clip(m4 - (delta & maskQ));
+    }
+}
 }
 
 namespace X265_NS {
@@ -176,5 +196,7 @@
     // C code is same for EDGE_VER and EDGE_HOR only asm code is different
     p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
     p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
+    p.pelFilterChroma[0]     = pelFilterChroma_c;
+    p.pelFilterChroma[1]     = pelFilterChroma_c;
 }
 }

x265_1.9.tar.gz/source/common/param.cpp -> x265_2.0.tar.gz/source/common/param.cpp Changed

@@ -121,9 +121,9 @@
     /* Source specifications */
     param->internalBitDepth = X265_DEPTH;
     param->internalCsp = X265_CSP_I420;
-
-    param->levelIdc = 0;
-    param->bHighTier = 0;
+    param->levelIdc = 0; //Auto-detect level
+    param->uhdBluray = 0;
+    param->bHighTier = 1; //Allow high tier by default
     param->interlaceMode = 0;
     param->bAnnexB = 1;
     param->bRepeatHeaders = 0;
@@ -164,6 +164,7 @@
     param->bEnableWeightedPred = 1;
     param->bEnableWeightedBiPred = 0;
     param->bEnableEarlySkip = 0;
+    param->bEnableRecursionSkip = 1;
     param->bEnableAMP = 0;
     param->bEnableRectInter = 0;
     param->rdLevel = 3;
@@ -193,6 +194,7 @@
     param->bLossless = 0;
     param->bCULossless = 0;
     param->bEnableTemporalSubLayers = 0;
+    param->bEnableRdRefine = 0;
 
     /* Rate control options */
     param->rc.vbvMaxBitrate = 0;
@@ -219,8 +221,9 @@
     param->rc.qblur = 0.5;
     param->rc.zoneCount = 0;
     param->rc.zones = NULL;
-    param->rc.bEnableSlowFirstPass = 0;
+    param->rc.bEnableSlowFirstPass = 1;
     param->rc.bStrictCbr = 0;
+    param->rc.bEnableGrain = 0;
 
     /* Video Usability Information (VUI) */
     param->vui.aspectRatioIdc = 0;
@@ -245,7 +248,7 @@
     param->maxCLL = 0;
     param->maxFALL = 0;
     param->minLuma = 0;
-    param->maxLuma = (1 << X265_DEPTH) - 1;
+    param->maxLuma = PIXEL_MAX;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -408,9 +411,9 @@
             param->maxNumMergeCand = 5;
             param->searchMethod = X265_STAR_SEARCH;
             param->bEnableTransformSkip = 1;
+            param->bEnableRecursionSkip = 0;
             param->maxNumReferences = 5;
             param->limitReferences = 0;
-            param->rc.bEnableSlowFirstPass = 1;
             param->bIntraInBFrames = 1;
             param->lookaheadSlices = 0; // disabled for best quality
             // TODO: optimized esa
@@ -453,16 +456,16 @@
         }
         else if (!strcmp(tune, "grain"))
         {
-            param->deblockingFilterBetaOffset = -2;
-            param->deblockingFilterTCOffset = -2;
-            param->bIntraInBFrames = 0;
-            param->rdoqLevel = 2;
-            param->psyRdoq = 10.0;
-            param->psyRd = 0.5;
             param->rc.ipFactor = 1.1;
-            param->rc.pbFactor = 1.1;
-            param->rc.aqStrength = 0.3;
-            param->rc.qCompress = 0.8;
+            param->rc.pbFactor = 1.0;
+            param->rc.cuTree = 0;
+            param->rc.aqMode = 0;
+            param->rc.qpStep = 1;
+            param->rc.bEnableGrain = 1;
+            param->bEnableRecursionSkip = 0;
+            param->psyRd = 4.0;
+            param->psyRdoq = 10.0;
+            param->bEnableSAO = 0;
         }
         else
             return -1;
@@ -616,6 +619,7 @@
     OPT("max-merge") p->maxNumMergeCand = (uint32_t)atoi(value);
     OPT("temporal-mvp") p->bEnableTemporalMvp = atobool(value);
     OPT("early-skip") p->bEnableEarlySkip = atobool(value);
+    OPT("rskip") p->bEnableRecursionSkip = atobool(value);
     OPT("rdpenalty") p->rdPenalty = atoi(value);
     OPT("tskip") p->bEnableTransformSkip = atobool(value);
     OPT("no-tskip-fast") p->bEnableTSkipFast = atobool(value);
@@ -702,6 +706,7 @@
         else
             p->psyRdoq = 0.0;
     }
+    OPT("rd-refine") p->bEnableRdRefine = atobool(value);
     OPT("signhide") p->bEnableSignHiding = atobool(value);
     OPT("b-intra") p->bIntraInBFrames = atobool(value);
     OPT("lft") p->bEnableLoopFilter = atobool(value); /* DEPRECATED */
@@ -757,6 +762,7 @@
         p->rc.qp = atoi(value);
         p->rc.rateControlMode = X265_RC_CQP;
     }
+    OPT("rc-grain") p->rc.bEnableGrain = atobool(value);
     OPT("zones")
     {
         p->rc.zoneCount = 1;
@@ -877,6 +883,7 @@
     OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
     OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
     OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
+    OPT("uhd-bd") p->uhdBluray = atobool(value);
     else
         return X265_PARAM_BAD_NAME;
 #undef OPT
@@ -1023,7 +1030,8 @@
 {
 #define CHECK(expr, msg) check_failed |= _confirm(param, expr, msg)
     int check_failed = 0; /* abort if there is a fatal configuration problem */
-
+    CHECK(param->uhdBluray == 1 && (X265_DEPTH != 10 || param->internalCsp != 1 || param->interlaceMode != 0),
+        "uhd-bd: bit depth, chroma subsample, source picture type must be 10, 4:2:0, progressive");
     CHECK(param->maxCUSize != 64 && param->maxCUSize != 32 && param->maxCUSize != 16,
           "max cu size must be 16, 32, or 64");
     if (check_failed == 1)
@@ -1096,7 +1104,7 @@
 
     CHECK(param->rc.rateControlMode > X265_RC_CRF || param->rc.rateControlMode < X265_RC_ABR,
           "Rate control mode is out of range");
-    CHECK(param->rdLevel < 0 || param->rdLevel > 6,
+    CHECK(param->rdLevel < 1 || param->rdLevel > 6,
           "RD Level is out of range");
     CHECK(param->rdoqLevel < 0 || param->rdoqLevel > 2,
         "RDOQ Level is out of range");
@@ -1194,12 +1202,12 @@
         CHECK(0 > param->noiseReductionIntra || param->noiseReductionIntra > 2000, "Valid noise reduction range 0 - 2000");
     if (param->noiseReductionInter)
         CHECK(0 > param->noiseReductionInter || param->noiseReductionInter > 2000, "Valid noise reduction range 0 - 2000");
-    CHECK(param->rc.rateControlMode == X265_RC_CRF && param->rc.bStatRead && param->rc.vbvMaxBitrate == 0,
-          "Constant rate-factor is incompatible with 2pass");
     CHECK(param->rc.rateControlMode == X265_RC_CQP && param->rc.bStatRead,
           "Constant QP is incompatible with 2pass");
     CHECK(param->rc.bStrictCbr && (param->rc.bitrate <= 0 || param->rc.vbvBufferSize <=0),
           "Strict-cbr cannot be applied without specifying target bitrate or vbv bufsize");
+    CHECK(param->analysisMode && (param->analysisMode < X265_ANALYSIS_OFF || param->analysisMode > X265_ANALYSIS_LOAD),
+        "Invalid analysis mode. Analysis mode 0: OFF 1: SAVE : 2 LOAD");
     return check_failed;
 }
 
@@ -1225,18 +1233,21 @@
     uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize];
     uint32_t minLog2CUSize = (uint32_t)g_log2Size[param->minCUSize];
 
-    if (ATOMIC_INC(&g_ctuSizeConfigured) > 1)
+    Lock gLock;
+    ScopedLock sLock(gLock);
+
+    if (++g_ctuSizeConfigured > 1)
     {
         if (g_maxCUSize != param->maxCUSize)
         {
-            x265_log(param, X265_LOG_ERROR, "maxCUSize must be the same for all encoders in a single process");
-            return -1;
+            x265_log(param, X265_LOG_WARNING, "maxCUSize must be the same for all encoders in a single process");
         }
         if (g_maxCUDepth != maxLog2CUSize - minLog2CUSize)
         {
-            x265_log(param, X265_LOG_ERROR, "maxCUDepth must be the same for all encoders in a single process");
-            return -1;
+            x265_log(param, X265_LOG_WARNING, "maxCUDepth must be the same for all encoders in a single process");
         }
+        param->maxCUSize = g_maxCUSize;
+        return x265_check_params(param); /* Check again, since param may have changed */
     }
     else
     {
@@ -1302,8 +1313,9 @@
     x265_log(param, X265_LOG_INFO, "Lookahead / bframes / badapt        : %d / %d / %d\n", param->lookaheadDepth, param->bframes, param->bFrameAdaptive);
     x265_log(param, X265_LOG_INFO, "b-pyramid / weightp / weightb       : %d / %d / %d\n",
              param->bBPyramid, param->bEnableWeightedPred, param->bEnableWeightedBiPred);
-    x265_log(param, X265_LOG_INFO, "References / ref-limit  cu / depth  : %d / %d / %d\n",
-             param->maxNumReferences, !!(param->limitReferences & X265_REF_LIMIT_CU), !!(param->limitReferences & X265_REF_LIMIT_DEPTH));
+    x265_log(param, X265_LOG_INFO, "References / ref-limit  cu / depth  : %d / %s / %s\n",
+             param->maxNumReferences, (param->limitReferences & X265_REF_LIMIT_CU) ? "on" : "off",
+             (param->limitReferences & X265_REF_LIMIT_DEPTH) ? "on" : "off");
 
     if (param->rc.aqMode)
         x265_log(param, X265_LOG_INFO, "AQ: mode / str / qg-size / cu-tree  : %d / %0.1f / %d / %d\n", param->rc.aqMode,
@@ -1336,7 +1348,9 @@
     TOOLVAL(param->psyRd, "psy-rd=%.2lf");
     TOOLVAL(param->rdoqLevel, "rdoq=%d");
     TOOLVAL(param->psyRdoq, "psy-rdoq=%.2lf");
+    TOOLOPT(param->bEnableRdRefine, "rd-refine");
     TOOLOPT(param->bEnableEarlySkip, "early-skip");
+    TOOLOPT(param->bEnableRecursionSkip, "rskip");
     TOOLVAL(param->noiseReductionIntra, "nr-intra=%d");
     TOOLVAL(param->noiseReductionInter, "nr-inter=%d");

x265_1.9.tar.gz/source/common/param.h -> x265_2.0.tar.gz/source/common/param.h Changed

x265_1.9.tar.gz/source/common/picyuv.cpp -> x265_2.0.tar.gz/source/common/picyuv.cpp Changed

@@ -46,6 +46,10 @@
 
     m_maxLumaLevel = 0;
     m_avgLumaLevel = 0;
+    m_stride = 0;
+    m_strideC = 0;
+    m_hChromaShift = 0;
+    m_vChromaShift = 0;
 }
 
 bool PicYuv::create(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
@@ -176,6 +180,7 @@
      * warnings from valgrind about using uninitialized pixels */
     padx++;
     pady++;
+    m_picCsp = pic.colorSpace;
 
     X265_CHECK(pic.bitDepth >= 8, "pic.bitDepth check failure");
 
@@ -190,7 +195,7 @@
 
             primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
 
-            if (pic.colorSpace != X265_CSP_I400)
+            if (param.internalCsp != X265_CSP_I400)
             {
                 pixel *uPixel = m_picOrg[1];
                 pixel *vPixel = m_picOrg[2];
@@ -216,7 +221,7 @@
                 yChar += pic.stride[0] / sizeof(*yChar);
             }
 
-            if (pic.colorSpace != X265_CSP_I400)
+            if (param.internalCsp != X265_CSP_I400)
             {
                 pixel *uPixel = m_picOrg[1];
                 pixel *vPixel = m_picOrg[2];
@@ -258,7 +263,7 @@
             primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
         }
 
-        if (pic.colorSpace != X265_CSP_I400)
+        if (param.internalCsp != X265_CSP_I400)
         {
             pixel *uPixel = m_picOrg[1];
             pixel *vPixel = m_picOrg[2];
@@ -279,12 +284,25 @@
         }
     }
 
-    /* extend the right edge if width was not multiple of the minimum CU size */
-    uint64_t sumLuma;
     pixel *Y = m_picOrg[0];
-    m_maxLumaLevel = primitives.planeClipAndMax(Y, m_stride, width, height, &sumLuma, (pixel)param.minLuma, (pixel)param.maxLuma);
-    m_avgLumaLevel = (double)(sumLuma) / (m_picHeight * m_picWidth);
+    pixel *U = m_picOrg[1];
+    pixel *V = m_picOrg[2];
 
+#if HIGH_BIT_DEPTH
+    bool calcHDRParams = !!param.minLuma || (param.maxLuma != PIXEL_MAX);
+    /* Apply min/max luma bounds for HDR pixel manipulations */
+    if (calcHDRParams)
+    {
+        X265_CHECK(pic.bitDepth == 10, "HDR stats can be applied/calculated only for 10bpp content");
+        uint64_t sumLuma;
+        m_maxLumaLevel = primitives.planeClipAndMax(Y, m_stride, width, height, &sumLuma, (pixel)param.minLuma, (pixel)param.maxLuma);
+        m_avgLumaLevel = (double) sumLuma / (m_picHeight * m_picWidth);
+    }
+#else
+    (void) param;
+#endif
+
+    /* extend the right edge if width was not multiple of the minimum CU size */
     for (int r = 0; r < height; r++)
     {
         for (int x = 0; x < padx; x++)
@@ -297,11 +315,8 @@
     for (int i = 1; i <= pady; i++)
         memcpy(Y + i * m_stride, Y, (width + padx) * sizeof(pixel));
 
-    if (pic.colorSpace != X265_CSP_I400)
+    if (param.internalCsp != X265_CSP_I400)
     {
-        pixel *U = m_picOrg[1];
-        pixel *V = m_picOrg[2];
-
         for (int r = 0; r < height >> m_vChromaShift; r++)
         {
             for (int x = 0; x < padx >> m_hChromaShift; x++)

x265_1.9.tar.gz/source/common/picyuv.h -> x265_2.0.tar.gz/source/common/picyuv.h Changed

x265_1.9.tar.gz/source/common/pixel.cpp -> x265_2.0.tar.gz/source/common/pixel.cpp Changed

@@ -607,7 +607,6 @@
  * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
  * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */
 
-#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
 #if HIGH_BIT_DEPTH
     X265_CHECK((X265_DEPTH == 10) || (X265_DEPTH == 12), "ssim invalid depth\n");
 #define type float
@@ -873,7 +872,25 @@
     }
 }
 
-static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
+/* Conversion between double and Q8.8 fixed point (big-endian) for storage */
+static void cuTreeFix8Pack(uint16_t *dst, double *src, int count)
+{
+    for (int i = 0; i < count; i++)
+        dst[i] = (uint16_t)(src[i] * 256.0);
+}
+
+static void cuTreeFix8Unpack(double *dst, uint16_t *src, int count)
+{
+    for (int i = 0; i < count; i++)
+    {
+        int16_t qpFix8 = src[i];
+        dst[i] = (double)(qpFix8) / 256.0;
+    }
+}
+
+#if HIGH_BIT_DEPTH
+static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, 
+                               const pixel minPix, const pixel maxPix)
 {
     pixel maxLumaLevel = 0;
     uint64_t sumLuma = 0;
@@ -882,21 +899,18 @@
     {
         for (int c = 0; c < width; c++)
         {
-            /* Clip luma of source picture to max and min values before extending edges of picYuv */
+            /* Clip luma of source picture to max and min*/
             src[c] = x265_clip3((pixel)minPix, (pixel)maxPix, src[c]);
-
-            /* Determine maximum and average luma level in a picture */
             maxLumaLevel = X265_MAX(src[c], maxLumaLevel);
             sumLuma += src[c];
         }
-
         src += stride;
     }
-
     *outsum = sumLuma;
     return maxLumaLevel;
 }
 
+#endif
 }  // end anonymous namespace
 
 namespace X265_NS {
@@ -1181,7 +1195,11 @@
     p.planecopy_cp = planecopy_cp_c;
     p.planecopy_sp = planecopy_sp_c;
     p.planecopy_sp_shl = planecopy_sp_shl_c;
+#if HIGH_BIT_DEPTH
     p.planeClipAndMax = planeClipAndMax_c;
+#endif
     p.propagateCost = estimateCUPropagateCost;
+    p.fix8Unpack = cuTreeFix8Unpack;
+    p.fix8Pack = cuTreeFix8Pack;
 }
 }

x265_1.9.tar.gz/source/common/predict.cpp -> x265_2.0.tar.gz/source/common/predict.cpp Changed

@@ -57,12 +57,10 @@
 
 Predict::Predict()
 {
-    m_immedVals = NULL;
 }
 
 Predict::~Predict()
 {
-    X265_FREE(m_immedVals);
     m_predShortYuv[0].destroy();
     m_predShortYuv[1].destroy();
 }
@@ -72,12 +70,8 @@
     m_csp = csp;
     m_hChromaShift = CHROMA_H_SHIFT(csp);
     m_vChromaShift = CHROMA_V_SHIFT(csp);
-    CHECKED_MALLOC(m_immedVals, int16_t, 64 * (64 + NTAPS_LUMA - 1));
 
     return m_predShortYuv[0].create(MAX_CU_SIZE, csp) && m_predShortYuv[1].create(MAX_CU_SIZE, csp);
-
-fail:
-    return false;
 }
 
 void Predict::motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma)
@@ -258,8 +252,8 @@
     int partEnum = partitionFromSizes(pu.width, pu.height);
     const pixel* src = refPic.getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + srcOffset;
 
-    int xFrac = mv.x & 0x3;
-    int yFrac = mv.y & 0x3;
+    int xFrac = mv.x & 3;
+    int yFrac = mv.y & 3;
 
     if (!(yFrac | xFrac))
         primitives.pu[partEnum].copy_pp(dst, dstStride, src, srcStride);
@@ -280,14 +274,14 @@
     intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
     const pixel* src = refPic.getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + srcOffset;
 
-    int xFrac = mv.x & 0x3;
-    int yFrac = mv.y & 0x3;
-
     int partEnum = partitionFromSizes(pu.width, pu.height);
 
     X265_CHECK((pu.width % 4) + (pu.height % 4) == 0, "width or height not divisible by 4\n");
     X265_CHECK(dstStride == MAX_CU_SIZE, "stride expected to be max cu size\n");
 
+    int xFrac = mv.x & 3;
+    int yFrac = mv.y & 3;
+
     if (!(yFrac | xFrac))
         primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
     else if (!yFrac)
@@ -296,11 +290,12 @@
         primitives.pu[partEnum].luma_vps(src, srcStride, dst, dstStride, yFrac);
     else
     {
-        int tmpStride = pu.width;
-        int filterSize = NTAPS_LUMA;
-        int halfFilterSize = (filterSize >> 1);
-        primitives.pu[partEnum].luma_hps(src, srcStride, m_immedVals, tmpStride, xFrac, 1);
-        primitives.pu[partEnum].luma_vss(m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
+        ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
+        int immedStride = pu.width;
+        int halfFilterSize = NTAPS_LUMA >> 1;
+
+        primitives.pu[partEnum].luma_hps(src, srcStride, immed, immedStride, xFrac, 1);
+        primitives.pu[partEnum].luma_vss(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, yFrac);
     }
 }
 
@@ -309,10 +304,10 @@
     intptr_t dstStride = dstYuv.m_csize;
     intptr_t refStride = refPic.m_strideC;
 
-    int shiftHor = (2 + m_hChromaShift);
-    int shiftVer = (2 + m_vChromaShift);
+    int mvx = mv.x << (1 - m_hChromaShift);
+    int mvy = mv.y << (1 - m_vChromaShift);
 
-    intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
+    intptr_t refOffset = (mvx >> 3) + (mvy >> 3) * refStride;
 
     const pixel* refCb = refPic.getCbAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
     const pixel* refCr = refPic.getCrAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
@@ -320,11 +315,11 @@
     pixel* dstCb = dstYuv.getCbAddr(pu.puAbsPartIdx);
     pixel* dstCr = dstYuv.getCrAddr(pu.puAbsPartIdx);
 
-    int xFrac = mv.x & ((1 << shiftHor) - 1);
-    int yFrac = mv.y & ((1 << shiftVer) - 1);
-
     int partEnum = partitionFromSizes(pu.width, pu.height);
-    
+
+    int xFrac = mvx & 7;
+    int yFrac = mvy & 7;
+
     if (!(yFrac | xFrac))
     {
         primitives.chroma[m_csp].pu[partEnum].copy_pp(dstCb, dstStride, refCb, refStride);
@@ -332,37 +327,36 @@
     }
     else if (!yFrac)
     {
-        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift));
-        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCb, refStride, dstCb, dstStride, xFrac);
+        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCr, refStride, dstCr, dstStride, xFrac);
     }
     else if (!xFrac)
     {
-        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
-        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCb, refStride, dstCb, dstStride, yFrac);
+        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCr, refStride, dstCr, dstStride, yFrac);
     }
     else
     {
-        int extStride = pu.width >> m_hChromaShift;
-        int filterSize = NTAPS_CHROMA;
-        int halfFilterSize = (filterSize >> 1);
-
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
-        primitives.chroma[m_csp].pu[partEnum].filter_vsp(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
-
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
-        primitives.chroma[m_csp].pu[partEnum].filter_vsp(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
+        ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_CHROMA - 1)]);
+        int immedStride = pu.width >> m_hChromaShift;
+        int halfFilterSize = NTAPS_CHROMA >> 1;
+
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, immed, immedStride, xFrac, 1);
+        primitives.chroma[m_csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dstCb, dstStride, yFrac);
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, immed, immedStride, xFrac, 1);
+        primitives.chroma[m_csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dstCr, dstStride, yFrac);
     }
 }
 
 void Predict::predInterChromaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
 {
-    intptr_t refStride = refPic.m_strideC;
     intptr_t dstStride = dstSYuv.m_csize;
+    intptr_t refStride = refPic.m_strideC;
 
-    int shiftHor = (2 + m_hChromaShift);
-    int shiftVer = (2 + m_vChromaShift);
+    int mvx = mv.x << (1 - m_hChromaShift);
+    int mvy = mv.y << (1 - m_vChromaShift);
 
-    intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
+    intptr_t refOffset = (mvx >> 3) + (mvy >> 3) * refStride;
 
     const pixel* refCb = refPic.getCbAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
     const pixel* refCr = refPic.getCrAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
@@ -370,15 +364,15 @@
     int16_t* dstCb = dstSYuv.getCbAddr(pu.puAbsPartIdx);
     int16_t* dstCr = dstSYuv.getCrAddr(pu.puAbsPartIdx);
 
-    int xFrac = mv.x & ((1 << shiftHor) - 1);
-    int yFrac = mv.y & ((1 << shiftVer) - 1);
-
     int partEnum = partitionFromSizes(pu.width, pu.height);
     
     uint32_t cxWidth  = pu.width >> m_hChromaShift;
 
     X265_CHECK(((cxWidth | (pu.height >> m_vChromaShift)) % 2) == 0, "chroma block size expected to be multiple of 2\n");
 
+    int xFrac = mvx & 7;
+    int yFrac = mvy & 7;
+
     if (!(yFrac | xFrac))
     {
         primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
@@ -386,23 +380,24 @@
     }
     else if (!yFrac)
     {
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift), 0);
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift), 0);
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, dstCb, dstStride, xFrac, 0);
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, dstCr, dstStride, xFrac, 0);
     }
     else if (!xFrac)
     {
-        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
-        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
+        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCb, refStride, dstCb, dstStride, yFrac);
+        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCr, refStride, dstCr, dstStride, yFrac);
     }
     else
     {
-        int extStride = cxWidth;
-        int filterSize = NTAPS_CHROMA;
-        int halfFilterSize = (filterSize >> 1);
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
-        primitives.chroma[m_csp].pu[partEnum].filter_vss(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);

x265_1.9.tar.gz/source/common/predict.h -> x265_2.0.tar.gz/source/common/predict.h Changed

x265_1.9.tar.gz/source/common/primitives.cpp -> x265_2.0.tar.gz/source/common/primitives.cpp Changed

x265_1.9.tar.gz/source/common/primitives.h -> x265_2.0.tar.gz/source/common/primitives.h Changed

@@ -189,6 +189,9 @@
 
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
 
+typedef void (*cutree_fix8_unpack)(double *dst, uint16_t *src, int count);
+typedef void (*cutree_fix8_pack)(uint16_t *dst, double *src, int count);
+
 typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
 typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
@@ -197,6 +200,7 @@
 typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
 
 typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
 
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
@@ -313,6 +317,8 @@
 
     downscale_t           frameInitLowres;
     cutree_propagate_cost propagateCost;
+    cutree_fix8_unpack    fix8Unpack;
+    cutree_fix8_pack      fix8Pack;
 
     extendCURowBorder_t   extendRowBorder;
     planecopy_cp_t        planecopy_cp;
@@ -332,6 +338,7 @@
     costC1C2Flag_t        costC1C2Flag;
 
     pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1
+    pelFilterChroma_t     pelFilterChroma[2];     // EDGE_VER = 0, EDGE_HOR = 1
 
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry

x265_1.9.tar.gz/source/common/quant.cpp -> x265_2.0.tar.gz/source/common/quant.cpp Changed

x265_1.9.tar.gz/source/common/quant.h -> x265_2.0.tar.gz/source/common/quant.h Changed

x265_1.9.tar.gz/source/common/scalinglist.cpp -> x265_2.0.tar.gz/source/common/scalinglist.cpp Changed

@@ -57,7 +57,11 @@
     },
     {
         "INTRA32X32_LUMA",
+        "",
+        "",
         "INTER32X32_LUMA",
+        "",
+        "",
     },
 };
 const char MatrixType_DC[4][12][22] =
@@ -76,7 +80,11 @@
     },
     {
         "INTRA32X32_LUMA_DC",
+        "",
+        "",
         "INTER32X32_LUMA_DC",
+        "",
+        "",
     },
 };
 
@@ -246,15 +254,15 @@
 
     char line[1024];
     int32_t *src = NULL;
+    fseek(fp, 0, 0);
 
     for (int sizeIdc = 0; sizeIdc < NUM_SIZES; sizeIdc++)
     {
         int size = X265_MIN(MAX_MATRIX_COEF_NUM, s_numCoefPerSize[sizeIdc]);
-        for (int listIdc = 0; listIdc < NUM_LISTS; listIdc++)
+        for (int listIdc = 0; listIdc < NUM_LISTS;  listIdc += (sizeIdc == 3) ? 3 : 1)
         {
             src = m_scalingListCoef[sizeIdc][listIdc];
 
-            fseek(fp, 0, 0);
             do
             {
                 char *ret = fgets(line, 1024, fp);
@@ -282,7 +290,6 @@
 
             if (sizeIdc > BLOCK_8x8)
             {
-                fseek(fp, 0, 0);
                 do
                 {
                     char *ret = fgets(line, 1024, fp);
@@ -310,7 +317,7 @@
     fclose(fp);
 
     m_bEnabled = true;
-    m_bDataPresent = !checkDefaultScalingList();
+    m_bDataPresent = true;
 
     return false;
 }

x265_1.9.tar.gz/source/common/shortyuv.cpp -> x265_2.0.tar.gz/source/common/shortyuv.cpp Changed

x265_1.9.tar.gz/source/common/shortyuv.h -> x265_2.0.tar.gz/source/common/shortyuv.h Changed

x265_1.9.tar.gz/source/common/threadpool.cpp -> x265_2.0.tar.gz/source/common/threadpool.cpp Changed

@@ -28,6 +28,10 @@
 
 #include <new>
 
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+#include <winnt.h>
+#endif
+
 #if X86_64
 
 #ifdef __GNUC__
@@ -64,6 +68,21 @@
 # define strcasecmp _stricmp
 #endif
 
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+const uint64_t m1 = 0x5555555555555555; //binary: 0101...
+const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
+const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary:  4 zeros,  4 ones ...
+const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of 0,1,2,3...
+
+static int popCount(uint64_t x)
+{
+    x -= (x >> 1) & m1;
+    x = (x & m2) + ((x >> 2) & m2);
+    x = (x + (x >> 4)) & m3;
+    return (x * h01) >> 56;
+}
+#endif
+
 namespace X265_NS {
 // x265 private namespace
 
@@ -238,7 +257,6 @@
     memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
 
     int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
-    int cpuCount = getCpuCount();
     bool bNumaSupport = false;
 
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
@@ -248,26 +266,54 @@
 #endif
 
 
-    for (int i = 0; i < cpuCount; i++)
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+    PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
+    for (int i = 0; i < numNumaNodes; i++)
     {
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-        UCHAR node;
-        if (GetNumaProcessorNode((UCHAR)i, &node))
-            cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++;
-        else
+        GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
+        cpusPerNode[i] = popCount(groupAffinityPointer->Mask);
+    }
+    delete groupAffinityPointer;
 #elif HAVE_LIBNUMA
-        if (bNumaSupport >= 0)
-            cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++;
-        else
-#endif
-            cpusPerNode[0]++;
+    if (bNumaSupport)
+    {
+        struct bitmask* bitMask = numa_allocate_cpumask();
+        for (int i = 0; i < numNumaNodes; i++)
+        {
+            int ret = numa_node_to_cpus(i, bitMask);
+            if (!ret)
+                cpusPerNode[i] = numa_bitmask_weight(bitMask);
+            else
+                x265_log(p, X265_LOG_ERROR, "Failed to genrate CPU mask\n");
+        }
+        numa_free_cpumask(bitMask);
     }
+#else // NUMA not supported
+    cpusPerNode[0] = getCpuCount();
+#endif
 
     if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG)
-        for (int i = 0; i < numNumaNodes; i++)
-            x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
-
-    /* limit threads based on param->numaPools */
+    for (int i = 0; i < numNumaNodes; i++)
+        x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
+    /* limit threads based on param->numaPools
+     * For windows because threads can't be allocated to live across sockets
+     * changing the default behavior to be per-socket pools -- FIXME */
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+     if (!p->numaPools)
+     {
+         char poolString[50] = "";
+         for (int i = 0; i < numNumaNodes; i++)
+         {
+             char nextCount[10] = "";
+             if (i)
+                 sprintf(nextCount, ",%d", cpusPerNode[i]);
+             else
+                   sprintf(nextCount, "%d", cpusPerNode[i]);
+             strcat(poolString, nextCount);
+         }
+         x265_param_parse(p, "pools", poolString);
+     }
+#endif
     if (p->numaPools && *p->numaPools)
     {
         const char *nodeStr = p->numaPools;
@@ -280,7 +326,7 @@
             }
             else if (*nodeStr == '-')
                 threadsPerPool[i] = 0;
-			else if (*nodeStr == '*' || !strcasecmp(nodeStr, "NULL"))
+            else if (*nodeStr == '*' || !strcasecmp(nodeStr, "NULL"))
             {
                 for (int j = i; j < numNumaNodes; j++)
                 {
@@ -297,8 +343,16 @@
             else
             {
                 int count = atoi(nodeStr);
-                threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
-                nodeMaskPerPool[i] = ((uint64_t)1 << i);
+                if (i > 0 || strchr(nodeStr, ','))   // it is comma -> old logic
+                {
+                    threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
+                    nodeMaskPerPool[i] = ((uint64_t)1 << i);
+                }
+                else                                 // new logic: exactly 'count' threads on all NUMAs
+                {
+                    threadsPerPool[numNumaNodes] = X265_MIN(count, numNumaNodes * MAX_POOL_THREADS);
+                    nodeMaskPerPool[numNumaNodes] = ((uint64_t)-1 >> (64 - numNumaNodes));
+                }
             }
 
             /* consume current node string, comma, and white-space */
@@ -389,16 +443,15 @@
     X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
 
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-    m_winCpuMask = 0x0;
-    GROUP_AFFINITY groupAffinity;
+    memset(&m_groupAffinity, 0, sizeof(GROUP_AFFINITY));
     for (int i = 0; i < getNumaNodeCount(); i++)
     {
         int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1;
         if (numaNode != -1)
-            if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
-                m_winCpuMask |= groupAffinity.Mask;
+        if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &m_groupAffinity))
+            break;
     }
-    m_numaMask = &m_winCpuMask;
+    m_numaMask = &m_groupAffinity.Mask;
 #elif HAVE_LIBNUMA
     if (numa_available() >= 0)
     {
@@ -480,11 +533,16 @@
     setThreadNodeAffinity(m_numaMask);
 }
 
-/* static */
 void ThreadPool::setThreadNodeAffinity(void *numaMask)
 {
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
-    if (SetThreadAffinityMask(GetCurrentThread(), *((DWORD_PTR*)numaMask)))
+    UNREFERENCED_PARAMETER(numaMask);
+    GROUP_AFFINITY groupAffinity;
+    memset(&groupAffinity, 0, sizeof(GROUP_AFFINITY));
+    groupAffinity.Group = m_groupAffinity.Group;
+    groupAffinity.Mask = m_groupAffinity.Mask;
+    const PGROUP_AFFINITY affinityPointer = &groupAffinity;
+    if (SetThreadGroupAffinity(GetCurrentThread(), affinityPointer, NULL))
         return;
     else
         x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity for NUMA node mask\n");
@@ -524,10 +582,25 @@
 /* static */
 int ThreadPool::getCpuCount()
 {
-#if _WIN32
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
+    enum { MAX_NODE_NUM = 127 };
+    int cpus = 0;
+    int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
+    GROUP_AFFINITY groupAffinity;
+    for (int i = 0; i < numNumaNodes; i++)
+    {
+        GetNumaNodeProcessorMaskEx((UCHAR)i, &groupAffinity);
+        cpus += popCount(groupAffinity.Mask);
+    }
+    return cpus;
+#elif _WIN32
     SYSTEM_INFO sysinfo;
     GetSystemInfo(&sysinfo);
     return sysinfo.dwNumberOfProcessors;
+#elif __unix__ && X265_ARCH_ARM

x265_1.9.tar.gz/source/common/threadpool.h -> x265_2.0.tar.gz/source/common/threadpool.h Changed

x265_1.9.tar.gz/source/common/x86/asm-primitives.cpp -> x265_2.0.tar.gz/source/common/x86/asm-primitives.cpp Changed

@@ -861,12 +861,12 @@
 template<int size>
 void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
 {
-    ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA)]);
-    const int filterSize = NTAPS_LUMA;
-    const int halfFilterSize = filterSize >> 1;
+    ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
+    const int halfFilterSize = NTAPS_LUMA >> 1;
+    const int immedStride = MAX_CU_SIZE;
 
-    primitives.pu[size].luma_hps(src, srcStride, immed, MAX_CU_SIZE, idxX, 1);
-    primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * MAX_CU_SIZE, MAX_CU_SIZE, dst, dstStride, idxY);
+    primitives.pu[size].luma_hps(src, srcStride, immed, immedStride, idxX, 1);
+    primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY);
 }
 
 #if HIGH_BIT_DEPTH
@@ -1098,9 +1098,16 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = PFX(filterPixelToShort_8x2_ssse3);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = PFX(filterPixelToShort_8x6_ssse3);
         p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
+        p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
+        p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
     }
     if (cpuMask & X265_CPU_SSE4)
     {
+        p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
+        p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
+        p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
+        p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
+
         p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
         p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
         p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
@@ -1166,6 +1173,12 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
         p.costCoeffRemain = PFX(costCoeffRemain_sse4);
+#if X86_64
+        p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
+        p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
+        p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
+        p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
+#endif
     }
     if (cpuMask & X265_CPU_AVX)
     {
@@ -2141,11 +2154,23 @@
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
+        p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
+        p.fix8Pack = PFX(cutree_fix8_pack_avx2);
+
+        /* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only 
+        p.planeClipAndMax = PFX(planeClipAndMax_avx2); */
 
         // TODO: depends on hps and vsp
         ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);                        // calling luma_hvpp for all sizes
         p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;             // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4] 
 
+#if X265_DEPTH == 10
+        p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2);
+        p.cu[LUMA_8x8].sa8d = PFX(pixel_sa8d_8x8_avx2);
+        p.cu[LUMA_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2);
+        p.cu[LUMA_32x32].sa8d = PFX(pixel_sa8d_32x32_avx2);
+#endif
+
         if (cpuMask & X265_CPU_BMI2)
         {
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
@@ -2434,6 +2459,8 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_ssse3);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_ssse3);
         p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
+        p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
+        p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
     }
     if (cpuMask & X265_CPU_SSE4)
     {
@@ -2529,8 +2556,10 @@
 #if X86_64
         p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
         p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
+        p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
+        p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
 
-        p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
+//        p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
         p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
         p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
         p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
@@ -2932,6 +2961,7 @@
         p.cu[BLOCK_8x8].intra_pred[14] = PFX(intra_pred_ang8_14_avx2);
         p.cu[BLOCK_8x8].intra_pred[15] = PFX(intra_pred_ang8_15_avx2);
         p.cu[BLOCK_8x8].intra_pred[16] = PFX(intra_pred_ang8_16_avx2);
+        p.cu[BLOCK_8x8].intra_pred[17] = PFX(intra_pred_ang8_17_avx2);
         p.cu[BLOCK_8x8].intra_pred[20] = PFX(intra_pred_ang8_20_avx2);
         p.cu[BLOCK_8x8].intra_pred[21] = PFX(intra_pred_ang8_21_avx2);
         p.cu[BLOCK_8x8].intra_pred[22] = PFX(intra_pred_ang8_22_avx2);
@@ -3651,7 +3681,6 @@
         p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
         p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx2);
         p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2);
-        p.planeClipAndMax = PFX(planeClipAndMax_avx2);
 
         p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx2);
         p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx2);
@@ -3663,6 +3692,8 @@
         p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx2);
         p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx2);
         p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx2);
+        p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
+        p.fix8Pack = PFX(cutree_fix8_pack_avx2);
 
     }
 #endif

x265_1.9.tar.gz/source/common/x86/blockcopy8.asm -> x265_2.0.tar.gz/source/common/x86/blockcopy8.asm Changed

x265_1.9.tar.gz/source/common/x86/const-a.asm -> x265_2.0.tar.gz/source/common/x86/const-a.asm Changed

@@ -40,12 +40,16 @@
 const pb_8,                 times 32 db 8
 const pb_15,                times 32 db 15
 const pb_16,                times 32 db 16
+const pb_31,                times 32 db 31
 const pb_32,                times 32 db 32
 const pb_64,                times 32 db 64
+const pb_124,               times 32 db 124
 const pb_128,               times 32 db 128
 const pb_a1,                times 16 db 0xa1
 
 const pb_01,                times  8 db   0,   1
+const pb_0123,              times  4 db   0,   1
+                            times  4 db   2,   3
 const hsub_mul,             times 16 db   1,  -1
 const pw_swap,              times  2 db   6,   7,   4,   5,   2,   3,   0,   1
 const pb_unpackbd1,         times  2 db   0,   0,   0,   0,   1,   1,   1,   1,   2,   2,   2,   2,   3,   3,   3,   3
@@ -64,6 +68,8 @@
                             times 12 db 0x00
 const pb_000000000000000F,           db 0xff
                             times 15 db 0x00
+const pb_shuf_off4,         times  2 db   0,   4,   1,   5,   2,   6,   3,   7
+const pw_shuf_off4,         times  1 db   0,   1,   8,   9,   2,   3,  10,  11,   4,   5,  12,  13,   6,   7,  14,  15
 
 ;; 16-bit constants
 
@@ -115,6 +121,8 @@
 const hmul_16p,             times 16 db   1
                             times  8 db   1,  -1
 const pw_exp2_0_15,                  dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768
+const pw_1_ffff,            times  4 dw 1
+                            times  4 dw 0xFFFF
 
 
 ;; 32-bit constants
@@ -146,10 +154,6 @@
 const pd_planar16_mul2,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
 const trans8_shuf,          times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
 
-const popcnt_table
-%assign x 0
-%rep 256
-; population count
-db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
-%assign x x+1
-%endrep
+;; 64-bit constants
+
+const pq_1,                 times 1 dq 1

x265_1.9.tar.gz/source/common/x86/intrapred8.asm -> x265_2.0.tar.gz/source/common/x86/intrapred8.asm Changed

@@ -355,55 +355,55 @@
                             times 8 db (32-22), 22
                             times 8 db (32-11), 11
 
-const ang16_shuf_mode9,    times 8 db 0, 1
-                           times 8 db 1, 2
+const ang16_shuf_mode9,     times 8 db 0, 1
+                            times 8 db 1, 2
 
-const angHor_tab_9,  db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
-                     db (32-18), 18, (32-20), 20, (32-22), 22, (32-24),  24, (32-26),  26, (32-28), 28, (32-30), 30, (32-32), 32
+const angHor_tab_9,         db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
+                            db (32-18), 18, (32-20), 20, (32-22), 22, (32-24),  24, (32-26),  26, (32-28), 28, (32-30), 30, (32-32), 32
 
-const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
-                     db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, (32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0
+const angHor_tab_11,        db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
+                            db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, (32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0
 
-const ang16_shuf_mode12,   db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
-                           db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2
+const ang16_shuf_mode12,    db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
+                            db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2
 
-const angHor_tab_12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
-                     db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26),  26, (32-21), 21, (32-16), 16
+const angHor_tab_12,        db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
+                            db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26),  26, (32-21), 21, (32-16), 16
 
-const ang16_shuf_mode13,   db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
-                           db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
-                           db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0
+const ang16_shuf_mode13,    db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
+                            db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
+                            db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0
 
-const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
-                     db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16
+const angHor_tab_13,        db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
+                            db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16
 
-const ang16_shuf_mode14,   db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
-                           db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
-                           db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0
+const ang16_shuf_mode14,    db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
+                            db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
+                            db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0
 
-const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
-                     db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
+const angHor_tab_14,        db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
+                            db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
 
-const ang16_shuf_mode15,   db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
-                           db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
-                           db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0
+const ang16_shuf_mode15,    db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
+                            db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
+                            db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0
 
-const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
-                     db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16
+const angHor_tab_15,        db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
+                            db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16
 
-const ang16_shuf_mode16,   db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
-                           db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
-                           db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0
+const ang16_shuf_mode16,    db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
+                            db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
+                            db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0
 
-const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
-                     db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16
+const angHor_tab_16,        db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
+                            db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16
 
-const ang16_shuf_mode17,   db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
-                           db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
-                           db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
+const ang16_shuf_mode17,    db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
+                            db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
+                            db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
 
-const angHor_tab_17, db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
-                     db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0
+const angHor_tab_17,        db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
+                            db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0
 
 ; Intrapred_angle32x32, modes 1 to 33 constants
 const ang32_shuf_mode9,         times 8 db 0, 1
@@ -467,6 +467,39 @@
                                 dd  0,  0,  2,  3,  0,  0,  7,  1
                                 dd  0,  0,  5,  6,  0,  0,  0,  0
 
+; Intrapred_angle8x8, modes 1 to 33 constants
+const ang8_shuf_mode3,          db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  4,  5,  5,  6,  6,  7,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  5,  6,  6,  7,  7,  8
+const ang8_shuf_mode4,          db  0,  1,  1,  2,  1,  2,  2,  3,  3,  4,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  2,  3,  3,  4,  4,  5,  4,  5,  5,  6,  6,  7
+const ang8_shuf_mode5,          db  0,  1,  1,  2,  1,  2,  2,  3,  2,  3,  3,  4,  3,  4,  4,  5,  1,  2,  2,  3,  2,  3,  3,  4,  3,  4,  4,  5,  4,  5,  5,  6
+const ang8_shuf_mode6,          db  0,  1,  0,  1,  1,  2,  1,  2,  2,  3,  2,  3,  2,  3,  3,  4,  1,  2,  1,  2,  2,  3,  2,  3,  3,  4,  3,  4,  3,  4,  4,  5
+const ang8_shuf_mode7,          db  0,  1,  0,  1,  0,  1,  1,  2,  1,  2,  1,  2,  1,  2,  2,  3,  1,  2,  1,  2,  1,  2,  2,  3,  2,  3,  2,  3,  2,  3,  3,  4
+const ang8_shuf_mode8,          db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  2,  3,  2,  3
+const ang8_shuf_mode9,          db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2
+const ang8_shuf_mode12,         db  7,  8,  7,  8,  7,  8,  7,  8,  7,  8,  7,  8,  6,  7,  6,  7,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  7,  8,  7,  8
+const ang8_shuf_mode13,         db  8,  9,  8,  9,  8,  9,  7,  8,  7,  8,  7,  8,  7,  8,  6,  7,  9, 10,  9, 10,  9, 10,  8,  9,  8,  9,  8,  9,  8,  9,  7,  8
+const ang8_shuf_mode14,         db  9, 10,  9, 10,  8,  9,  8,  9,  7,  8,  7,  8,  7,  8,  6,  7, 10, 11, 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  8,  9,  7,  8
+const ang8_shuf_mode15,         db 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  7,  8,  7,  8,  6,  7, 11, 12, 10, 11, 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  7,  8
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  4,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  4,  2,  0
+const ang8_shuf_mode16,         db 11, 12, 10, 11, 10, 11,  9, 10,  8,  9,  8,  9,  7,  8,  6,  7, 12, 13, 11, 12, 11, 12, 10, 11,  9, 10,  9, 10,  8,  9,  7,  8
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  5,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  5,  3,  2,  0
+const ang8_shuf_mode17,         db 12, 13, 11, 12, 10, 11,  9, 10,  8,  9,  8,  9,  7,  8,  6,  7, 13, 14, 12, 13, 11, 12, 10, 11,  9, 10,  9, 10,  8,  9,  7,  8
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  6,  5,  4,  2,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  6,  5,  4,  2,  1,  0
+
+const ang8_fact_mode3,          db (32-26), 26, (32-20), 20, (32-14), 14, (32- 8),  8, (32- 2),  2, (32-28), 28, (32-22), 22, (32-16), 16
+const ang8_fact_mode4,          db (32-21), 21, (32-10), 10, (32-31), 31, (32-20), 20, (32- 9),  9, (32-30), 30, (32-19), 19, (32- 8),  8
+const ang8_fact_mode5,          db (32-17), 17, (32- 2),  2, (32-19), 19, (32- 4),  4, (32-21), 21, (32- 6),  6, (32-23), 23, (32- 8),  8
+const ang8_fact_mode6,          db (32-13), 13, (32-26), 26, (32- 7),  7, (32-20), 20, (32- 1),  1, (32-14), 14, (32-27), 27, (32- 8),  8
+const ang8_fact_mode7,          db (32- 9),  9, (32-18), 18, (32-27), 27, (32- 4),  4, (32-13), 13, (32-22), 22, (32-31), 31, (32- 8),  8
+const ang8_fact_mode8,          db (32- 5),  5, (32-10), 10, (32-15), 15, (32-20), 20, (32-25), 25, (32-30), 30, (32- 3),  3, (32- 8),  8
+const ang8_fact_mode9,          db (32- 2),  2, (32- 4),  4, (32- 6),  6, (32- 8),  8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
+const ang8_fact_mode11,         db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
+const ang8_fact_mode12,         db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7),  7, (32- 2),  2, (32-29), 29, (32-24), 24
+const ang8_fact_mode13,         db (32-23), 23, (32-14), 14, (32- 5),  5, (32-28), 28, (32-19), 19, (32-10), 10, (32- 1),  1, (32-24), 24
+const ang8_fact_mode14,         db (32-19), 19, (32- 6),  6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32- 5),  5, (32-24), 24
+const ang8_fact_mode15,         db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9),  9, (32-24), 24
+const ang8_fact_mode16,         db (32-11), 11, (32-22), 22, (32- 1),  1, (32-12), 12, (32-23), 23, (32- 2),  2, (32-13), 13, (32-24), 24
+const ang8_fact_mode17,         db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
+
 const ang_table
 %assign x 0
 %rep 32
@@ -490,6 +523,7 @@
 
 SECTION .text
 cextern pb_1
+cextern pb_2
 cextern pw_2
 cextern pw_3
 cextern pw_4
@@ -18582,48 +18616,48 @@
 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
 ;-----------------------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal intra_pred_ang8_3, 3,4,5
-    mova              m3, [pw_1024]
+%macro ang8_store8x8 0                          ; store the 8x8 byte block held in m1/m4 as eight 8-byte rows at r0 with row pitch r1
+    lea               r3, [3 * r1]              ; r3 = 3 * pitch (clobbered)
+    vextracti128      xm2, m1, 1                ; xm2 = upper 128 bits of m1 (clobbered)
+    vextracti128      xm5, m4, 1                ; xm5 = upper 128 bits of m4 (clobbered)
+    movq              [r0], xm1                 ; row 0 = m1 lower lane, low qword
+    movq              [r0 + r1], xm2            ; row 1 = m1 upper lane, low qword
+    movhps            [r0 + 2 * r1], xm1        ; row 2 = m1 lower lane, high qword
+    movhps            [r0 + r3], xm2            ; row 3 = m1 upper lane, high qword
+    lea               r0, [r0 + 4 * r1]         ; r0 is left pointing at row 4 when the macro ends
+    movq              [r0], xm4                 ; rows 4-7 repeat the same pattern from m4
+    movq              [r0 + r1], xm5
+    movhps            [r0 + 2 * r1], xm4
+    movhps            [r0 + r3], xm5
+%endmacro
+
+cglobal intra_pred_ang8_3, 3,4,6
     vbroadcasti128    m0, [r2 + 17]
+    mova              m5, [ang8_shuf_mode3]
+    mova              m3, [pb_2]
 
-    pshufb            m1, m0, [c_ang8_src1_9_2_10]
-    pshufb            m2, m0, [c_ang8_src3_11_4_12]
-    pshufb            m4, m0, [c_ang8_src5_13_5_13]
-    pshufb            m0,     [c_ang8_src6_14_7_15]
+    pshufb            m1, m0, m5
+    paddb             m5, m3
+    pshufb            m2, m0, m5
+    paddb             m5, m3
+    pshufb            m4, m0, m5
+    paddb             m5, m3
+    pshufb            m0, m5
 
-    pmaddubsw         m1, [c_ang8_26_20]
+    vbroadcasti128    m5, [ang8_fact_mode3]
+    mova              m3, [pw_1024]
+    pmaddubsw         m1, m5
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    pmaddubsw         m2, [c_ang8_14_8]
     pmulhrsw          m2, m3
-    pmaddubsw         m4, [c_ang8_2_28]
     pmulhrsw          m4, m3
-    pmaddubsw         m0, [c_ang8_22_16]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2

x265_1.9.tar.gz/source/common/x86/ipfilter16.asm -> x265_2.0.tar.gz/source/common/x86/ipfilter16.asm Changed

@@ -116,6 +116,7 @@
                   dw  -1, 4, -11, 40,  40, -11, 4, -1
                   dw   0, 1, -5,  17,  58, -10, 4, -1
 
+ALIGN 32
 tab_LumaCoeffV:   times 4 dw 0, 0
                   times 4 dw 0, 64
                   times 4 dw 0, 0
@@ -161,9 +162,8 @@
 const interp8_hpp_shuf,     db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
                             db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
 
-const pb_shuf,  db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
-                db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
-
+const interp8_hpp_shuf_new, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+                            db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
 
 SECTION .text
 cextern pd_8
@@ -10407,7 +10407,7 @@
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10475,7 +10475,7 @@
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10536,16 +10536,16 @@
     add                 r3d, r3d
     mov                 r4d, r4m
     mov                 r5d, r5m
-    shl                 r4d, 4
+    shl                 r4d, 6
 %ifdef PIC
-    lea                 r6, [tab_LumaCoeff]
-    vpbroadcastq        m0, [r6 + r4]
-    vpbroadcastq        m1, [r6 + r4 + 8]
+    lea                 r6, [tab_LumaCoeffV]
+    movu                m0, [r6 + r4]
+    movu                m1, [r6 + r4 + mmsize]
 %else
-    vpbroadcastq        m0, [tab_LumaCoeff + r4]
-    vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
+    movu                m0, [tab_LumaCoeffV + r4]
+    movu                m1, [tab_LumaCoeffV + r4 + mmsize]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf_new]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10554,7 +10554,7 @@
     sub                 r0, 6
     test                r5d, r5d
     mov                 r4d, %2
-    jz                  .loop0
+    jz                 .loop0
     lea                 r6, [r1*3]
     sub                 r0, r6
     add                 r4d, 7
@@ -10563,64 +10563,64 @@
 %assign x 0
 %rep %1/16
     vbroadcasti128      m4, [r0 + x]
-    vbroadcasti128      m5, [r0 + 8 + x]
+    vbroadcasti128      m5, [r0 + 4 * SIZEOF_PIXEL + x]
     pshufb              m4, m3
-    pshufb              m7, m5, m3
+    pshufb              m5, m3
 
     pmaddwd             m4, m0
-    pmaddwd             m7, m1
+    pmaddwd             m7, m5, m1
     paddd               m4, m7
+    vextracti128        xm7, m4, 1
+    paddd               xm4, xm7
+    paddd               xm4, xm2
+    psrad               xm4, INTERP_SHIFT_PS
 
     vbroadcasti128      m6, [r0 + 16 + x]
-    pshufb              m5, m3
-    pshufb              m7, m6, m3
+    pshufb              m6, m3
 
     pmaddwd             m5, m0
-    pmaddwd             m7, m1
+    pmaddwd             m7, m6, m1
     paddd               m5, m7
-
-    phaddd              m4, m5
-    vpermq              m4, m4, q3120
-    paddd               m4, m2
-    vextracti128        xm5,m4, 1
-    psrad               xm4, INTERP_SHIFT_PS
+    vextracti128        xm7, m5, 1
+    paddd               xm5, xm7
+    paddd               xm5, xm2
     psrad               xm5, INTERP_SHIFT_PS
-    packssdw            xm4, xm5
 
+    packssdw            xm4, xm5
     movu                [r2 + x], xm4
 
     vbroadcasti128      m5, [r0 + 24 + x]
-    pshufb              m6, m3
-    pshufb              m7, m5, m3
+    pshufb              m5, m3
 
     pmaddwd             m6, m0
-    pmaddwd             m7, m1
+    pmaddwd             m7, m5, m1
     paddd               m6, m7
+    vextracti128        xm7, m6, 1
+    paddd               xm6, xm7
+    paddd               xm6, xm2
+    psrad               xm6, INTERP_SHIFT_PS
 
     vbroadcasti128      m7, [r0 + 32 + x]
-    pshufb              m5, m3
     pshufb              m7, m3
 
     pmaddwd             m5, m0
     pmaddwd             m7, m1
     paddd               m5, m7
-
-    phaddd              m6, m5
-    vpermq              m6, m6, q3120
-    paddd               m6, m2
-    vextracti128        xm5,m6, 1
-    psrad               xm6, INTERP_SHIFT_PS
+    vextracti128        xm7, m5, 1
+    paddd               xm5, xm7
+    paddd               xm5, xm2
     psrad               xm5, INTERP_SHIFT_PS
-    packssdw            xm6, xm5
 
+    packssdw            xm6, xm5
     movu                [r2 + 16 + x], xm6
-    %assign x x+32
-    %endrep
+
+%assign x x+32
+%endrep
 
     add                 r2, r3
     add                 r0, r1
     dec                 r4d
-    jnz                 .loop0
+    jnz                .loop0
     RET
 %endif
 %endmacro
@@ -10656,7 +10656,7 @@
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10749,7 +10749,7 @@
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10824,7 +10824,7 @@
 %else
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10883,7 +10883,7 @@
 %else
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10956,7 +10956,7 @@

x265_1.9.tar.gz/source/common/x86/loopfilter.asm -> x265_2.0.tar.gz/source/common/x86/loopfilter.asm Changed

@@ -29,9 +29,6 @@
 %include "x86util.asm"
 
 SECTION_RODATA 32
-pb_31:      times 32 db 31
-pb_124:     times 32 db 124
-pb_15:      times 32 db 15
 
 SECTION .text
 cextern pb_1
@@ -39,6 +36,10 @@
 cextern pb_3
 cextern pb_4
 cextern pb_01
+cextern pb_0123
+cextern pb_15
+cextern pb_31
+cextern pb_124
 cextern pb_128
 cextern pw_1
 cextern pw_n1
@@ -48,7 +49,9 @@
 cextern pb_movemask
 cextern pb_movemask_32
 cextern hmul_16p
-
+cextern pw_1_ffff
+cextern pb_shuf_off4
+cextern pw_shuf_off4
 
 ;============================================================================================================
 ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
@@ -154,7 +157,9 @@
     sub         r4d, 16
     jnz        .loopH
     RET
-%else ; HIGH_BIT_DEPTH
+
+%else ; HIGH_BIT_DEPTH == 1
+
 cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
 
     mov         r4d, r4m
@@ -240,7 +245,7 @@
     sub         r4d, 16
     jnz        .loopH
     RET
-%endif
+%endif ; HIGH_BIT_DEPTH == 0
 
 INIT_YMM avx2
 %if HIGH_BIT_DEPTH
@@ -2061,6 +2066,117 @@
 ; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 ;-----------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
+
+%if HIGH_BIT_DEPTH == 1
+INIT_XMM sse4
+cglobal saoCuStatsE0, 3,10,8, 0-32              ; 16-bit pixel variant; r0 = diff, r1 = rec, r2 = stride (per the prototype comment above); 32 bytes of stack scratch
+    mov         r3d, r3m                        ; r3d = endX
+    mov         r4d, r4m                        ; r4d = endY
+    mov         r9, r5mp                        ; r9 = stats, updated at the end
+
+    ; clear internal temporary buffer
+    pxor        m0, m0
+    mova        [rsp], m0                       ; scratch layout used below: 5 word counters at rsp + 0, 5 dword sums at rsp + 10
+    mova        [rsp + mmsize], m0
+    mova        m4, [pw_1]                      ; word 1s: turn a compare mask into +1
+    mova        m5, [pb_2]                      ; byte 2s: bias added to form edgeType
+    xor         r7d, r7d                        ; r7 = 0
+
+    ; correct stride for diff[] and rec
+    mov         r6d, r3d
+    and         r6d, ~15                        ; r6 = endX rounded down to a multiple of 16
+    sub         r2, r6                          ; rec step at .next = stride minus the elements already stepped over
+    lea         r8, [(r6 - 64) * 2]             ; 64 = MAX_CU_SIZE; r8 is subtracted from r0 at .next, i.e. diff += (64 - r6) int16
+
+    FIX_STRIDES r2                              ; presumably scales r2 from pixels to bytes for 16-bit pixels (macro defined outside this view) - confirm
+
+.loopH:
+    mov         r5d, r3d                        ; r5d = elements left in this row
+
+    ; calculate signLeft
+    mov         r7w, [r1]
+    sub         r7w, [r1 - SIZEOF_PIXEL]        ; flags from the unsigned compare of rec[0] with rec[-1]
+    seta        r7b                             ; 1 if rec[0] > rec[-1]
+    setb        r6b                             ; 1 if rec[0] < rec[-1]
+    sub         r7b, r6b
+    neg         r7b                             ; stored negated; the psubb in .loopL negates it back
+    pinsrb      m0, r7d, 15                     ; byte 15 of m0 is what palignr shifts in as the neighbour left of x = 0
+
+.loopL:
+
+    movu        m3, [r1]
+    movu        m2, [r1 + SIZEOF_PIXEL]
+    pcmpgtw     m6, m3, m2                      ; -1 where rec[x] > rec[x + 1] (signed word compare)
+    pcmpgtw     m2, m3                          ; -1 where rec[x + 1] > rec[x]
+    pand        m6, m4                          ; mask -> +1
+    por         m2, m6                          ; elements 0-7: +1 / 0 / -1
+
+    movu        m3, [r1 + mmsize]
+    movu        m6, [r1 + mmsize + SIZEOF_PIXEL]
+    pcmpgtw     m7, m3, m6
+    pcmpgtw     m6, m3
+    pand        m7, m4
+    por         m7, m6                          ; elements 8-15: +1 / 0 / -1
+
+    packsswb    m2, m7                          ; signRight
+
+    palignr     m3, m2, m0, 15                  ; signRight shifted up by one byte, byte 0 taken from byte 15 of m0
+
+    pxor        m6, m6
+    psubb       m6, m3                          ; signLeft
+
+    mova        m0, m2                          ; keep signRight so its byte 15 seeds the next 16 elements
+    paddb       m2, m6
+    paddb       m2, m5                          ; edgeType
+
+    ; stats[edgeType]
+%assign x 0
+%rep 16
+    pextrb      r7d, m2, x
+
+    movsx       r6d, word [r0 + x * 2]          ; diff[x], sign-extended
+    inc         word [rsp + r7 * 2]             ; tmp_count[edgeType]++
+    add         [rsp + 5 * 2 + r7 * 4], r6d     ; tmp_stats[edgeType] += (fenc[x] - rec[x])
+    dec         r5d
+    jz         .next                            ; row done; NOTE(review): if endX is a multiple of 16 this exits before the r0/r1 advance below, yet .next assumes (endX & ~15) elements were already stepped over - confirm callers never pass such an endX
+%assign x x+1
+%endrep
+
+    add         r0, 16*2
+    add         r1, 16 * SIZEOF_PIXEL
+    jmp        .loopL
+
+.next:
+    sub         r0, r8                          ; diff: move to the next row
+    add         r1, r2                          ; rec: move to the next row
+
+    dec         r4d
+    jnz        .loopH
+
+    ; sum to global buffer
+    mov         r0, r6mp                        ; r0 = count
+
+    ; s_eoTable = {1, 2, 0, 3, 4}
+    pmovzxwd    m0, [rsp + 0 * 2]               ; tmp_count[0..3] widened to dwords
+    pshufd      m0, m0, q3102                   ; reorder entries 0-3 to follow the s_eoTable mapping noted above
+    movu        m1, [r0]
+    paddd       m0, m1
+    movu        [r0], m0                        ; count[0..3] += reordered tmp_count
+    movzx       r5d, word [rsp + 4 * 2]
+    add         [r0 + 4 * 4], r5d               ; count[4] += tmp_count[4]
+
+    movu        m0, [rsp + 5 * 2 + 0 * 4]       ; tmp_stats[0..3]
+    pshufd      m0, m0, q3102                   ; same reordering as for the counters
+    movu        m1, [r9]
+    paddd       m0, m1
+    movu        [r9], m0                        ; stats[0..3] += reordered tmp_stats
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r9 + 4 * 4], r6d               ; stats[4] += tmp_stats[4]
+    RET
+%endif ; HIGH_BIT_DEPTH=1
+
+
+%if HIGH_BIT_DEPTH == 0
 INIT_XMM sse4
 cglobal saoCuStatsE0, 3,10,6, 0-32
     mov         r3d, r3m
@@ -2086,7 +2202,7 @@
 
     ; calculate signLeft
     mov         r7b, [r1]
-    sub         r7b, [r1 - 1]
+    sub         r7b, [r1 - SIZEOF_PIXEL]
     seta        r7b
     setb        r6b
     sub         r7b, r6b
@@ -2095,13 +2211,14 @@
 
 .loopL:
     movu        m3, [r1]
-    movu        m2, [r1 + 1]
+    movu        m2, [r1 + SIZEOF_PIXEL]
 
     pxor        m1, m3, m4
     pxor        m2, m4
     pcmpgtb     m3, m1, m2
     pcmpgtb     m2, m1
     pand        m3, [pb_1]
+
     por         m2, m3                          ; signRight
 
     palignr     m3, m2, m0, 15
@@ -2125,7 +2242,7 @@
 %endrep
 
     add         r0, 16*2
-    add         r1, 16

x265_1.9.tar.gz/source/common/x86/loopfilter.h -> x265_2.0.tar.gz/source/common/x86/loopfilter.h Changed

x265_1.9.tar.gz/source/common/x86/mc-a.asm -> x265_2.0.tar.gz/source/common/x86/mc-a.asm Changed

x265_1.9.tar.gz/source/common/x86/mc-a2.asm -> x265_2.0.tar.gz/source/common/x86/mc-a2.asm Changed

@@ -43,11 +43,11 @@
 deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
 %endif
-pw_1024: times 16 dw 1024
 
-pd_16: times 4 dd 16
-pd_0f: times 4 dd 0xffff
-pf_inv256: times 8 dd 0.00390625
+cutree_fix8_unpack_shuf: db -1,-1, 0, 1,-1,-1, 2, 3,-1,-1, 4, 5,-1,-1, 6, 7
+                         db -1,-1, 8, 9,-1,-1,10,11,-1,-1,12,13,-1,-1,14,15
+
+const pq_256,       times 4 dq 256.0
 const pd_inv256,    times 4 dq 0.00390625
 const pd_0_5,       times 4 dq 0.5
 
@@ -59,9 +59,11 @@
 cextern pw_32
 cextern pw_512
 cextern pw_00ff
+cextern pw_1024
 cextern pw_3fff
 cextern pw_pixel_max
 cextern pd_ffff
+cextern pd_16
 
 ;The hpel_filter routines use non-temporal writes for output.
 ;The following defines may be uncommented for testing.
@@ -1215,3 +1217,121 @@
 
 INIT_YMM avx2
 MBTREE_AVX
+
+
+%macro CUTREE_FIX8 0                            ; emits cutree_fix8_pack and cutree_fix8_unpack for the current INIT_* vector width (mmsize)
+;-----------------------------------------------------------------------------
+; void cutree_fix8_pack( uint16_t *dst, double *src, int count )  -- dst[i] = 16-bit truncation of src[i] * 256.0
+;-----------------------------------------------------------------------------
+cglobal cutree_fix8_pack, 3, 4, 5               ; r0 = dst, r1 = src, r2 = count
+    movapd       m2, [pq_256]                    ; 256.0 in every double lane
+    sub          r2d, mmsize / 2                 ; one vector iteration converts mmsize / 2 values
+    movsxdifnidn r2, r2d
+    lea          r1, [r1 + 8 * r2]               ; bias both pointers by (count - mmsize / 2) elements ...
+    lea          r0, [r0 + 2 * r2]
+    neg          r2                              ; ... and index them with a negative offset that counts up
+    jg .skip_loop                                ; count < mmsize / 2: no full vector iteration possible
+.loop:
+    mulpd        m0, m2, [r1 + 8 * r2]           ; src * 256.0
+    mulpd        m1, m2, [r1 + 8 * r2 + mmsize]
+    mulpd        m3, m2, [r1 + 8 * r2 + 2 * mmsize]
+    mulpd        m4, m2, [r1 + 8 * r2 + 3 * mmsize]
+    cvttpd2dq    xm0, m0                         ; doubles -> int32, truncating
+    cvttpd2dq    xm1, m1
+    cvttpd2dq    xm3, m3
+    cvttpd2dq    xm4, m4
+%if mmsize == 32
+    vinserti128  m0, m0, xm3, 1                  ; m0 = values 0-3 | 8-11
+    vinserti128  m1, m1, xm4, 1                  ; m1 = values 4-7 | 12-15
+    packssdw     m0, m1                          ; per-lane pack yields values 0-15 in order, with signed saturation to 16 bits
+%else
+    punpcklqdq   m0, m1                          ; values 0-3
+    punpcklqdq   m3, m4                          ; values 4-7
+    packssdw     m0, m3                          ; values 0-7, with signed saturation to 16 bits
+%endif
+    mova         [r0 + 2 * r2], m0               ; mova: presumably requires dst aligned to mmsize - confirm against callers
+    add          r2, mmsize / 2
+    jle .loop
+.skip_loop:
+    sub          r2, mmsize / 2                  ; r2 = -(number of leftover values)
+    jz .end
+    ; Do the remaining values in scalar in order to avoid overreading src.
+.scalar:
+    movq         xm0, [r1 + 8 * r2 + 4 * mmsize]  ; + 4 * mmsize undoes the (mmsize / 2) * 8 byte bias applied to r1
+    mulsd        xm0, xm2
+    cvttsd2si    r3d, xm0
+    mov          [r0 + 2 * r2 + mmsize], r3w     ; stores the low 16 bits (no saturation on this path); + mmsize undoes the bias on r0
+    inc          r2
+    jl .scalar
+.end:
+    RET
+
+;-----------------------------------------------------------------------------
+; void cutree_fix8_unpack( double *dst, uint16_t *src, int count )  -- dst[i] = sign-extended src[i] * (1 / 256)
+;-----------------------------------------------------------------------------
+cglobal cutree_fix8_unpack, 3, 4, 7             ; r0 = dst, r1 = src, r2 = count
+%if mmsize != 32
+    mova           m4, [cutree_fix8_unpack_shuf+16] ; second mask row: words 4-7 into the upper halves of four dwords
+%endif
+    movapd         m2, [pd_inv256]               ; 0.00390625 (1 / 256) in every double lane
+    mova           m3, [cutree_fix8_unpack_shuf] ; words 0-3 into the upper halves of four dwords (both mask rows when mmsize == 32)
+    sub            r2d, mmsize / 2               ; one vector iteration converts mmsize / 2 values
+    movsxdifnidn   r2, r2d
+    lea            r1, [r1 + 2 * r2]             ; same negative-index addressing scheme as cutree_fix8_pack
+    lea            r0, [r0 + 8 * r2]
+    neg            r2
+    jg .skip_loop                                ; count < mmsize / 2: scalar path only
+.loop:
+%if mmsize == 32
+    vbroadcasti128 m0, [r1 + 2 * r2]             ; words 0-7 copied to both lanes
+    vbroadcasti128 m1, [r1 + 2 * r2 + 16]        ; words 8-15 copied to both lanes
+    pshufb         m0, m3                        ; lower lane: words 0-3, upper lane: words 4-7
+    pshufb         m1, m3                        ; lower lane: words 8-11, upper lane: words 12-15
+%else
+    mova           m1, [r1 + 2 * r2]             ; mova: presumably requires src aligned to 16 bytes - confirm against callers
+    pshufb         m0, m1, m3                    ; words 0-3
+    pshufb         m1, m4                        ; words 4-7
+%endif
+    psrad          m0, 16 ; sign-extend
+    psrad          m1, 16
+    cvtdq2pd       m5, xm0                       ; first group of int32 -> double
+    cvtdq2pd       m6, xm1
+%if mmsize == 32
+    vpermq         m0, m0, q1032                 ; swap 128-bit halves so the upper four dwords come next
+    vpermq         m1, m1, q1032
+%else
+    psrldq         m0, 8                         ; shift the upper two dwords down
+    psrldq         m1, 8
+%endif
+    cvtdq2pd       m0, xm0
+    cvtdq2pd       m1, xm1
+    mulpd          m0, m2                        ; scale by 1 / 256
+    mulpd          m1, m2
+    mulpd          m5, m2
+    mulpd          m6, m2
+    movapd         [r0 + 8 * r2], m5             ; store order m5, m0, m6, m1 restores the source element order
+    movapd         [r0 + 8 * r2 + mmsize], m0
+    movapd         [r0 + 8 * r2 + mmsize * 2], m6
+    movapd         [r0 + 8 * r2 + mmsize * 3], m1
+    add            r2, mmsize / 2
+    jle .loop
+.skip_loop:
+    sub            r2, mmsize / 2                ; r2 = -(number of leftover values)
+    jz .end
+.scalar:
+    movzx          r3d, word [r1 + 2 * r2 + mmsize] ; + mmsize undoes the bias applied to r1
+    movsx          r3d, r3w                      ; treat the stored value as signed 16-bit
+    cvtsi2sd       xm0, r3d
+    mulsd          xm0, xm2
+    movsd          [r0 + 8 * r2 + 4 * mmsize], xm0 ; + 4 * mmsize undoes the bias applied to r0
+    inc            r2
+    jl .scalar
+.end:
+    RET
+%endmacro
+
+INIT_XMM ssse3
+CUTREE_FIX8
+
+INIT_YMM avx2
+CUTREE_FIX8

x265_1.9.tar.gz/source/common/x86/mc.h -> x265_2.0.tar.gz/source/common/x86/mc.h Changed

x265_1.9.tar.gz/source/common/x86/pixel-a.asm -> x265_2.0.tar.gz/source/common/x86/pixel-a.asm Changed

@@ -50,9 +50,6 @@
 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
 
-sw_f0:     dq 0xfff0, 0
-pd_f0:     times 4 dd 0xffff0000
-
 SECTION .text
 
 cextern pb_0
@@ -67,7 +64,6 @@
 cextern pw_pmpmpmpm
 cextern pw_pmmpzzzz
 cextern pd_1
-cextern popcnt_table
 cextern pd_2
 cextern hmul_16p
 cextern pb_movemask
@@ -13803,3 +13799,589 @@
     movzx           eax, al
     RET
 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
+
+
+%if HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
+%macro LOAD_DIFF_AVX2 4                         ; %1 = dest, %2 = temp (clobbered), %3 / %4 = memory operands; result %1 = [%3] - [%4] per 16-bit word
+    movu       %1, %3                            ; unaligned loads, so no alignment is required of either operand
+    movu       %2, %4
+    psubw      %1, %2
+%endmacro
+
+%macro LOAD_DIFF_8x4P_AVX2 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer (pointers default to r0 and r2)
+    LOAD_DIFF_AVX2 xm%1, xm%5, [%7],      [%8]       ; row 0
+    LOAD_DIFF_AVX2 xm%2, xm%6, [%7+r1],   [%8+r3]    ; row 1: r1 / r3 are the row pitches of the two sources
+    LOAD_DIFF_AVX2 xm%3, xm%5, [%7+2*r1], [%8+2*r3]  ; row 2
+    LOAD_DIFF_AVX2 xm%4, xm%6, [%7+r4],   [%8+r5]    ; row 3: r4 / r5 must hold the row-3 offsets (callers in this file set them to 3 * pitch)
+
+    ;lea %7, [%7+4*r1]  (disabled: the macro leaves both pointers unchanged)
+    ;lea %8, [%8+4*r3]
+%endmacro
+
+INIT_YMM avx2
+cglobal pixel_satd_8x8, 4,4,7                   ; r0 / r2 = the two source pointers, r1 / r3 = their pitches (as used below); result returned in eax
+
+    FIX_STRIDES r1, r3                           ; presumably converts both pitches to bytes for 16-bit pixels (macro defined outside this view) - confirm
+    pxor    xm6, xm6                             ; m6 starts at zero; it is the register summed after SATD_8x4_SSE
+
+    ; load_diff 0 & 4
+    movu    xm0, [r0]
+    movu    xm1, [r2]
+    vinserti128 m0, m0, [r0 + r1 * 4], 1         ; lower lane = row k, upper lane = row k + 4
+    vinserti128 m1, m1, [r2 + r3 * 4], 1
+    psubw   m0, m1
+    add     r0, r1
+    add     r2, r3
+
+    ; load_diff 1 & 5
+    movu    xm1, [r0]
+    movu    xm2, [r2]
+    vinserti128 m1, m1, [r0 + r1 * 4], 1
+    vinserti128 m2, m2, [r2 + r3 * 4], 1
+    psubw   m1, m2
+    add     r0, r1
+    add     r2, r3
+
+    ; load_diff 2 & 6
+    movu    xm2, [r0]
+    movu    xm3, [r2]
+    vinserti128 m2, m2, [r0 + r1 * 4], 1
+    vinserti128 m3, m3, [r2 + r3 * 4], 1
+    psubw   m2, m3
+    add     r0, r1
+    add     r2, r3
+
+    ; load_diff 3 & 7
+    movu    xm3, [r0]
+    movu    xm4, [r2]
+    vinserti128 m3, m3, [r0 + r1 * 4], 1
+    vinserti128 m4, m4, [r2 + r3 * 4], 1
+    psubw   m3, m4
+
+    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6   ; macro defined outside this view; presumably transforms m0-m3 (m4, m5 as temps) and accumulates into m6 - confirm
+
+    vextracti128 xm0, m6, 1
+    paddw xm6, xm0                               ; add the upper-lane (rows 4-7) sums to the lower-lane (rows 0-3) sums
+    HADDUW xm6, xm0                              ; horizontal add of the word sums (macro defined outside this view)
+    movd   eax, xm6
+    RET
+
+INIT_XMM avx2
+cglobal pixel_sa8d_8x8_internal                 ; helper entered via call; expects r0 / r2 = sources, r1 / r3 = pitches, r4 / r5 = row-3 offsets; leaves word sums in m0
+    lea  r6, [r0+4*r1]                           ; r6 / r7 = row 4 of each source
+    lea  r7, [r2+4*r3]
+    LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2   ; rows 0-3 differences -> m0, m1, m2, m8 (m5, m6 as temps)
+    LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7  ; rows 4-7 differences -> m4, m5, m3, m9 (m11, m6 as temps)
+
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax   ; macro defined outside this view; the lines below are the disabled step-by-step form
+    ;HADAMARD2_2D 0, 1, 2, 8, 6, wd
+    ;HADAMARD2_2D 4, 5, 3, 9, 6, wd
+    ;HADAMARD2_2D 0, 2, 1, 8, 6, dq
+    ;HADAMARD2_2D 4, 3, 5, 9, 6, dq
+    ;HADAMARD2_2D 0, 4, 2, 3, 6, qdq, amax
+    ;HADAMARD2_2D 1, 5, 8, 9, 6, qdq, amax
+
+    paddw m0, m1                                 ; m0 = m0 + m1 + m2 + m8 (word-wise)
+    paddw m0, m2
+    paddw m0, m8
+    SAVE_MM_PERMUTATION
+    ret                                          ; plain return to the calling cglobal function
+
+
+INIT_XMM avx2
+cglobal pixel_sa8d_8x8, 4,8,12                  ; r0 / r2 = sources, r1 / r3 = pitches; returns (sum + 1) >> 1 in eax
+    FIX_STRIDES r1, r3                           ; presumably converts both pitches to bytes for 16-bit pixels (macro defined outside this view) - confirm
+    lea  r4, [3*r1]                              ; r4 / r5 = 3 * pitch, consumed by the internal helper
+    lea  r5, [3*r3]
+    call pixel_sa8d_8x8_internal                 ; leaves word sums in m0
+    HADDUW m0, m1                                ; horizontal add of the word sums (macro defined outside this view)
+    movd eax, m0
+    add eax, 1                                   ; round to nearest ...
+    shr eax, 1                                   ; ... while halving the sum
+    RET
+
+
+INIT_YMM avx2
+cglobal pixel_sa8d_16x16, 4,8,12
+    FIX_STRIDES r1, r3
+    lea  r4, [3*r1]
+    lea  r5, [3*r3]
+    lea  r6, [r0+4*r1]
+    lea  r7, [r2+4*r3]
+    vbroadcasti128 m7, [pw_1]
+
+    ; Top 16x8
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+    movu m0, [r0]                                   ; 10 bits
+    movu m5, [r2]
+    psubw m0, m5                                    ; 11 bits
+    movu m1, [r0 + r1]
+    movu m6, [r2 + r3]
+    psubw m1, m6
+    movu m2, [r0 + r1 * 2]
+    movu m5, [r2 + r3 * 2]
+    psubw m2, m5
+    movu m8, [r0 + r4]
+    movu m6, [r2 + r5]
+    psubw m8, m6
+
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+    movu m4, [r6]
+    movu m11, [r7]
+    psubw m4, m11
+    movu m5, [r6 + r1]
+    movu m6, [r7 + r3]
+    psubw m5, m6
+    movu m3, [r6 + r1 * 2]
+    movu m11, [r7 + r3 * 2]
+    psubw m3, m11
+    movu m9, [r6 + r4]
+    movu m6, [r7 + r5]
+    psubw m9, m6
+
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax    ; 16 bits
+    pmaddwd m0, m7
+    pmaddwd m1, m7
+    pmaddwd m2, m7
+    pmaddwd m8, m7
+    paddd m0, m1
+    paddd m2, m8
+    paddd m10, m0, m2
+
+    lea  r0, [r0+8*r1]
+    lea  r2, [r2+8*r3]
+    lea  r6, [r6+8*r1]
+    lea  r7, [r7+8*r3]
+
+    ; Bottom 16x8
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+    movu m0, [r0]
+    movu m5, [r2]
+    psubw m0, m5
+    movu m1, [r0 + r1]
+    movu m6, [r2 + r3]
+    psubw m1, m6
+    movu m2, [r0 + r1 * 2]
+    movu m5, [r2 + r3 * 2]
+    psubw m2, m5
+    movu m8, [r0 + r4]
+    movu m6, [r2 + r5]
+    psubw m8, m6
+
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+    movu m4, [r6]
+    movu m11, [r7]
+    psubw m4, m11
+    movu m5, [r6 + r1]
+    movu m6, [r7 + r3]
+    psubw m5, m6
+    movu m3, [r6 + r1 * 2]
+    movu m11, [r7 + r3 * 2]

x265_1.9.tar.gz/source/common/yuv.cpp -> x265_2.0.tar.gz/source/common/yuv.cpp Changed

@@ -163,14 +163,19 @@
     }
 }
 
-void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL)
+void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL, int picCsp) // luma always goes through add_ps(srcYuv0, srcYuv1); picCsp selects how the chroma planes are filled
 {
     primitives.cu[log2SizeL - 2].add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
-    if (m_csp != X265_CSP_I400)
+    if (m_csp != X265_CSP_I400 && picCsp != X265_CSP_I400) // chroma add_ps only when neither this buffer nor picCsp is I400
     {
         primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
         primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
     }
+    if (picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400) // picCsp is I400 but this buffer has chroma planes: copy srcYuv0 chroma through and ignore srcYuv1
+    {
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv0.m_csize); // NOTE(review): indexed by m_part here but by log2SizeL - 2 in the add_ps calls above - confirm both select the same block size
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv0.m_csize);
+    }
 }
 
 void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)

x265_1.9.tar.gz/source/common/yuv.h -> x265_2.0.tar.gz/source/common/yuv.h Changed

x265_1.9.tar.gz/source/compat/msvc/stdint.h -> x265_2.0.tar.gz/source/compat/msvc/stdint.h Changed

x265_1.9.tar.gz/source/encoder/analysis.cpp -> x265_2.0.tar.gz/source/encoder/analysis.cpp Changed

@@ -74,14 +74,18 @@
 {
     m_reuseInterDataCTU = NULL;
     m_reuseRef = NULL;
-    m_reuseBestMergeCand = NULL;
-    m_reuseMv = NULL;
+    m_bHD = false;
 }
 bool Analysis::create(ThreadLocalData *tld)
 {
     m_tld = tld;
     m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
-    m_bChromaSa8d = m_param->rdLevel >= 3;
+
+    int costArrSize = 1;
+    uint32_t maxDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
+    for (uint32_t i = 1; i <= maxDQPDepth; i++)
+        costArrSize += (1 << (i * 2));
+    cacheCost = X265_MALLOC(uint64_t, costArrSize);
 
     int csp = m_param->internalCsp;
     uint32_t cuSize = g_maxCUSize;
@@ -102,6 +106,8 @@
             md.pred[j].fencYuv = &md.fencYuv;
         }
     }
+    if (m_param->sourceHeight >= 1080)
+        m_bHD = true;
 
     return ok;
 }
@@ -119,12 +125,14 @@
             m_modeDepth[i].pred[j].reconYuv.destroy();
         }
     }
+    X265_FREE(cacheCost);
 }
 
 Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
 {
     m_slice = ctu.m_slice;
     m_frame = &frame;
+    m_bChromaSa8d = m_param->rdLevel >= 3;
 
 #if _DEBUG || CHECKED_BUILD
     invalidateContexts(0);
@@ -142,8 +150,13 @@
         int numPredDir = m_slice->isInterP() ? 1 : 2;
         m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
         m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
-        m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
-        m_reuseMv = &m_reuseInterDataCTU->mv[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
+        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
+        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
+        m_reusePartSize = &m_reuseInterDataCTU->partSize[ctu.m_cuAddr * ctu.m_numPartitions];
+        m_reuseMergeFlag = &m_reuseInterDataCTU->mergeFlag[ctu.m_cuAddr * ctu.m_numPartitions];
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE)
+            for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * numPredDir; i++)
+                m_reuseRef[i] = -1;
     }
     ProfileCUScope(ctu, totalCTUTime, totalCTUs);
 
@@ -158,14 +171,6 @@
             memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
         }
         compressIntraCU(ctu, cuGeom, qp);
-        if (m_param->analysisMode == X265_ANALYSIS_SAVE && intraDataCTU)
-        {
-            CUData* bestCU = &m_modeDepth[0].bestMode->cu;
-            memcpy(&intraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
-            memcpy(&intraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
-            memcpy(&intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
-            memcpy(&intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
-        }
     }
     else
     {
@@ -189,18 +194,12 @@
         else if (m_param->rdLevel <= 4)
             compressInterCU_rd0_4(ctu, cuGeom, qp);
         else
-        {
-            uint32_t zOrder = 0;
-            compressInterCU_rd5_6(ctu, cuGeom, zOrder, qp);
-            if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
-            {
-                CUData* bestCU = &m_modeDepth[0].bestMode->cu;
-                memcpy(&m_reuseInterDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
-                memcpy(&m_reuseInterDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_predMode, sizeof(uint8_t) * numPartition);
-            }
-        }
+            compressInterCU_rd5_6(ctu, cuGeom, qp);
     }
 
+    if (m_param->bEnableRdRefine)
+        qprdRefine(ctu, cuGeom, qp, qp);
+
     return *m_modeDepth[0].bestMode;
 }
 
@@ -229,6 +228,61 @@
     }
 }
 
+void Analysis::qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
+{
+    uint32_t depth = cuGeom.depth;
+    ModeDepth& md = m_modeDepth[depth];
+    md.bestMode = NULL;
+
+    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
+
+    int bestCUQP = qp;
+    int lambdaQP = lqp;
+
+    bool doQPRefine = (bDecidedDepth && depth <= m_slice->m_pps->maxCuDQPDepth) || (!bDecidedDepth && depth == m_slice->m_pps->maxCuDQPDepth);
+
+    if (doQPRefine)
+    {
+        uint64_t bestCUCost, origCUCost, cuCost, cuPrevCost;
+
+        int cuIdx = (cuGeom.childOffset - 1) / 3;
+        bestCUCost = origCUCost = cacheCost[cuIdx];
+
+        for (int dir = 2; dir >= -2; dir -= 4)
+        {
+            int threshold = 1;
+            int failure = 0;
+            cuPrevCost = origCUCost;
+
+            int modCUQP = qp + dir;
+            while (modCUQP >= QP_MIN && modCUQP <= QP_MAX_SPEC)
+            {
+                recodeCU(parentCTU, cuGeom, modCUQP, qp);
+                cuCost = md.bestMode->rdCost;
+
+                COPY2_IF_LT(bestCUCost, cuCost, bestCUQP, modCUQP);
+                if (cuCost < cuPrevCost)
+                    failure = 0;
+                else
+                    failure++;
+
+                if (failure > threshold)
+                    break;
+
+                cuPrevCost = cuCost;
+                modCUQP += dir;
+            }
+        }
+        lambdaQP = bestCUQP;
+    }
+
+    recodeCU(parentCTU, cuGeom, bestCUQP, lambdaQP);
+
+    /* Copy best data to encData CTU and recon */
+    md.bestMode->cu.copyToPic(depth);
+    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
+}
+
 void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
 {
     uint32_t depth = cuGeom.depth;
@@ -334,6 +388,12 @@
         checkBestMode(*splitPred, depth);
     }
 
+    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
+    {
+        int cuIdx = (cuGeom.childOffset - 1) / 3;
+        cacheCost[cuIdx] = md.bestMode->rdCost;
+    }
+
     /* Copy best data to encData CTU and recon */
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT])
@@ -377,6 +437,7 @@
         slave.m_slice = m_slice;
         slave.m_frame = m_frame;
         slave.m_param = m_param;
+        slave.m_bChromaSa8d = m_param->rdLevel >= 3;
         slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
         slave.invalidateContexts(0);
         slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
@@ -555,7 +616,7 @@
         if (m_param->rdLevel <= 4)
             checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
         else
-            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false);
+            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
     }
 
     bool bNoSplit = false;
@@ -827,8 +888,11 @@
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
     uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
-    bool earlyskip = false;
+    bool skipModes = false; /* Skip any remaining mode analyses at current depth */
+    bool skipRecursion = false; /* Skip recursion */
     bool splitIntra = true;

x265_1.9.tar.gz/source/encoder/analysis.h -> x265_2.0.tar.gz/source/encoder/analysis.h Changed

@@ -108,6 +108,7 @@
     ModeDepth m_modeDepth[NUM_CU_DEPTH];
     bool      m_bTryLossless;
     bool      m_bChromaSa8d;
+    bool      m_bHD;
 
     Analysis();
 
@@ -117,12 +118,19 @@
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
 
 protected:
-    /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
+    /* Analysis data for save/load mode, writes/reads data based on absPartIdx */
     analysis_inter_data* m_reuseInterDataCTU;
-    MV*                  m_reuseMv;
     int32_t*             m_reuseRef;
-    uint32_t*            m_reuseBestMergeCand;
+    uint8_t*             m_reuseDepth;
+    uint8_t*             m_reuseModes;
+    uint8_t*             m_reusePartSize;
+    uint8_t*             m_reuseMergeFlag;
+
     uint32_t m_splitRefIdx[4];
+    uint64_t* cacheCost;
+
+    /* refine RD based on QP for rd-levels 5 and 6 */
+    void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp);
 
     /* full analysis for an I-slice CU */
     void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
@@ -130,11 +138,13 @@
     /* full analysis for a P or B slice CU */
     uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
     SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
-    SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
+    SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
+
+    void recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t origqp = -1);
 
     /* measure merge and skip */
     void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
-    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isShareMergeCand);
+    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom);
 
     /* measure inter options */
     void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
@@ -151,6 +161,7 @@
     /* work-avoidance heuristics for RD levels < 5 */
     uint32_t topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom);
     bool recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode);
+    bool complexityCheckCU(const Mode& bestMode);
 
     /* generate residual and recon pixels for an entire CTU recursively (RD0) */
     void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);

x265_1.9.tar.gz/source/encoder/api.cpp -> x265_2.0.tar.gz/source/encoder/api.cpp Changed

x265_1.9.tar.gz/source/encoder/dpb.cpp -> x265_2.0.tar.gz/source/encoder/dpb.cpp Changed

x265_1.9.tar.gz/source/encoder/dpb.h -> x265_2.0.tar.gz/source/encoder/dpb.h Changed

x265_1.9.tar.gz/source/encoder/encoder.cpp -> x265_2.0.tar.gz/source/encoder/encoder.cpp Changed

@@ -55,7 +55,7 @@
 Encoder::Encoder()
 {
     m_aborted = false;
-    m_reconfigured = false;
+    m_reconfigure = false;
     m_encodedFrameNum = 0;
     m_pocLast = -1;
     m_curEncoder = 0;
@@ -361,7 +361,10 @@
     }
 
     if (m_threadPool)
-        m_threadPool->stopWorkers();
+    {
+        for (int i = 0; i < m_numPools; i++)
+            m_threadPool[i].stopWorkers();
+    }
 }
 
 void Encoder::destroy()
@@ -508,12 +511,6 @@
 
     if (pic_in)
     {
-        if (pic_in->colorSpace != m_param->internalCsp)
-        {
-            x265_log(m_param, X265_LOG_ERROR, "Unsupported chroma subsampling (%d) on input\n",
-                     pic_in->colorSpace);
-            return -1;
-        }
         if (pic_in->bitDepth < 8 || pic_in->bitDepth > 16)
         {
             x265_log(m_param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n",
@@ -525,7 +522,7 @@
         if (m_dpb->m_freeList.empty())
         {
             inFrame = new Frame;
-            x265_param* p = m_reconfigured? m_latestParam : m_param;
+            x265_param* p = m_reconfigure ? m_latestParam : m_param;
             if (inFrame->create(p, pic_in->quantOffsets))
             {
                 /* the first PicYuv created is asked to generate the CU and block unit offset
@@ -535,7 +532,7 @@
                 {
                     inFrame->m_fencPic->m_cuOffsetY = m_sps.cuOffsetY;
                     inFrame->m_fencPic->m_buOffsetY = m_sps.buOffsetY;
-                    if (pic_in->colorSpace != X265_CSP_I400)
+                    if (m_param->internalCsp != X265_CSP_I400)
                     {
                         inFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
                         inFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
@@ -555,7 +552,7 @@
                     {
                         m_sps.cuOffsetY = inFrame->m_fencPic->m_cuOffsetY;
                         m_sps.buOffsetY = inFrame->m_fencPic->m_buOffsetY;
-                        if (pic_in->colorSpace != X265_CSP_I400)
+                        if (m_param->internalCsp != X265_CSP_I400)
                         {
                             m_sps.cuOffsetC = inFrame->m_fencPic->m_cuOffsetC;
                             m_sps.cuOffsetY = inFrame->m_fencPic->m_cuOffsetY;
@@ -591,7 +588,7 @@
         inFrame->m_userData  = pic_in->userData;
         inFrame->m_pts       = pic_in->pts;
         inFrame->m_forceqp   = pic_in->forceqp;
-        inFrame->m_param     = m_reconfigured ? m_latestParam : m_param;
+        inFrame->m_param     = m_reconfigure ? m_latestParam : m_param;
         
         if (pic_in->quantOffsets != NULL)
         {
@@ -719,7 +716,7 @@
                     pic_out->analysisData.numPartitions = outFrame->m_analysisData.numPartitions;
                     pic_out->analysisData.interData = outFrame->m_analysisData.interData;
                     pic_out->analysisData.intraData = outFrame->m_analysisData.intraData;
-                    writeAnalysisFile(&pic_out->analysisData);
+                    writeAnalysisFile(&pic_out->analysisData, *outFrame->m_encData);
                     freeAnalysis(&pic_out->analysisData);
                 }
             }
@@ -780,6 +777,27 @@
                 if (m_rateControl->writeRateControlFrameStats(outFrame, &curEncoder->m_rce))
                     m_aborted = true;
 
+            if (pic_out && m_param->rc.bStatWrite)
+            {
+                /* m_rcData is allocated for every frame */
+                pic_out->rcData = outFrame->m_rcData;
+                outFrame->m_rcData->qpaRc = outFrame->m_encData->m_avgQpRc;
+                outFrame->m_rcData->qRceq = curEncoder->m_rce.qRceq;
+                outFrame->m_rcData->qpNoVbv = curEncoder->m_rce.qpNoVbv;
+                outFrame->m_rcData->coeffBits = outFrame->m_encData->m_frameStats.coeffBits;
+                outFrame->m_rcData->miscBits = outFrame->m_encData->m_frameStats.miscBits;
+                outFrame->m_rcData->mvBits = outFrame->m_encData->m_frameStats.mvBits;
+                outFrame->m_rcData->qScale = outFrame->m_rcData->newQScale = x265_qp2qScale(outFrame->m_encData->m_avgQpRc);
+                outFrame->m_rcData->poc = curEncoder->m_rce.poc;
+                outFrame->m_rcData->encodeOrder = curEncoder->m_rce.encodeOrder;
+                outFrame->m_rcData->sliceType = curEncoder->m_rce.sliceType;
+                outFrame->m_rcData->keptAsRef = curEncoder->m_rce.sliceType == B_SLICE && !IS_REFERENCED(outFrame) ? 0 : 1;
+                outFrame->m_rcData->qpAq = outFrame->m_encData->m_avgQpAq;
+                outFrame->m_rcData->iCuCount = outFrame->m_encData->m_frameStats.percent8x8Intra * m_rateControl->m_ncu;
+                outFrame->m_rcData->pCuCount = outFrame->m_encData->m_frameStats.percent8x8Inter * m_rateControl->m_ncu;
+                outFrame->m_rcData->skipCuCount = outFrame->m_encData->m_frameStats.percent8x8Skip  * m_rateControl->m_ncu;
+            }
+
             /* Allow this frame to be recycled if no frame encoders are using it for reference */
             if (!pic_out)
             {
@@ -800,16 +818,32 @@
             frameEnc = m_lookahead->getDecidedPicture();
         if (frameEnc && !pass)
         {
+            if (curEncoder->m_reconfigure)
+            {
+                /* One round robin cycle of FE reconfigure is complete */
+                /* Safe to copy m_latestParam to Encoder::m_param, encoder reconfigure complete */
+                for (int frameEncId = 0; frameEncId < m_param->frameNumThreads; frameEncId++)
+                    m_frameEncoder[frameEncId]->m_reconfigure = false;
+                memcpy (m_param, m_latestParam, sizeof(x265_param));
+                m_reconfigure = false;
+            }
+
+            /* Initiate reconfigure for this FE if necessary */
+            curEncoder->m_param = m_reconfigure ? m_latestParam : m_param;
+            curEncoder->m_reconfigure = m_reconfigure;
+
             /* give this frame a FrameData instance before encoding */
             if (m_dpb->m_frameDataFreeList)
             {
                 frameEnc->m_encData = m_dpb->m_frameDataFreeList;
                 m_dpb->m_frameDataFreeList = m_dpb->m_frameDataFreeList->m_freeListNext;
                 frameEnc->reinit(m_sps);
+                frameEnc->m_param = m_reconfigure ? m_latestParam : m_param;
+                frameEnc->m_encData->m_param = m_reconfigure ? m_latestParam : m_param;
             }
             else
             {
-                frameEnc->allocEncodeData(m_param, m_sps);
+                frameEnc->allocEncodeData(m_reconfigure ? m_latestParam : m_param, m_sps);
                 Slice* slice = frameEnc->m_encData->m_slice;
                 slice->m_sps = &m_sps;
                 slice->m_pps = &m_pps;
@@ -817,7 +851,7 @@
                 slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_4x4_PARTITIONS);
             }
 
-            curEncoder->m_rce.encodeOrder = m_encodedFrameNum++;
+            curEncoder->m_rce.encodeOrder = frameEnc->m_encodeOrder = m_encodedFrameNum++;
             if (m_bframeDelay)
             {
                 int64_t *prevReorderedPts = m_prevReorderedPts;
@@ -867,28 +901,23 @@
 int Encoder::reconfigureParam(x265_param* encParam, x265_param* param)
 {
     encParam->maxNumReferences = param->maxNumReferences; // never uses more refs than specified in stream headers
-    encParam->bEnableLoopFilter = param->bEnableLoopFilter;
-    encParam->deblockingFilterTCOffset = param->deblockingFilterTCOffset;
-    encParam->deblockingFilterBetaOffset = param->deblockingFilterBetaOffset;
     encParam->bEnableFastIntra = param->bEnableFastIntra;
     encParam->bEnableEarlySkip = param->bEnableEarlySkip;
-    encParam->bEnableTemporalMvp = param->bEnableTemporalMvp;
-    /* Scratch buffer prevents me_range from being increased for esa/tesa
-    if (param->searchMethod < X265_FULL_SEARCH || param->searchMethod < encParam->searchRange)
-        encParam->searchRange = param->searchRange; */
-    encParam->noiseReductionInter = param->noiseReductionInter;
-    encParam->noiseReductionIntra = param->noiseReductionIntra;
+    encParam->bEnableRecursionSkip = param->bEnableRecursionSkip;
+    encParam->searchMethod = param->searchMethod;
+    /* Scratch buffer prevents me_range from being increased for esa/tesa */
+    if (param->searchRange < encParam->searchRange)
+        encParam->searchRange = param->searchRange;
     /* We can't switch out of subme=0 during encoding. */
     if (encParam->subpelRefine)
         encParam->subpelRefine = param->subpelRefine;
     encParam->rdoqLevel = param->rdoqLevel;
     encParam->rdLevel = param->rdLevel;
-    encParam->bEnableTSkipFast = param->bEnableTSkipFast;
-    encParam->psyRd = param->psyRd;
-    encParam->psyRdoq = param->psyRdoq;
-    encParam->bEnableSignHiding = param->bEnableSignHiding;
-    encParam->bEnableFastIntra = param->bEnableFastIntra;
-    encParam->maxTUSize = param->maxTUSize;
+    encParam->bEnableRectInter = param->bEnableRectInter;
+    encParam->maxNumMergeCand = param->maxNumMergeCand;
+    encParam->bIntraInBFrames = param->bIntraInBFrames;
+    /* To add: Loop Filter/deblocking controls, transform skip, signhide require PPS to be resent */
+    /* To add: SAO, temporal MVP, AMP, TU depths require SPS to be resent, at every CVS boundary */
     return x265_check_params(encParam);
 }
 
@@ -1214,12 +1243,6 @@
 
         stats->maxCLL         = m_analyzeAll.m_maxCLL;
         stats->maxFALL        = (uint16_t)(m_analyzeAll.m_maxFALL / m_analyzeAll.m_numPics);
-
-        if (m_emitCLLSEI)
-        {
-            m_param->maxCLL = stats->maxCLL;
-            m_param->maxFALL = stats->maxFALL;
-        }
     }

x265_1.9.tar.gz/source/encoder/encoder.h -> x265_2.0.tar.gz/source/encoder/encoder.h Changed

@@ -74,6 +74,7 @@
 class Lookahead;
 class RateControl;
 class ThreadPool;
+class FrameData;
 
 class Encoder : public x265_encoder
 {
@@ -110,7 +111,7 @@
     Frame*             m_exportedPic;
     FILE*              m_analysisFile;
     x265_param*        m_param;
-    x265_param*        m_latestParam;
+    x265_param*        m_latestParam;     // Holds latest param during a reconfigure
     RateControl*       m_rateControl;
     Lookahead*         m_lookahead;
 
@@ -129,7 +130,7 @@
     bool               m_emitCLLSEI;
     bool               m_bZeroLatency;     // x265_encoder_encode() returns NALs for the input picture, zero lag
     bool               m_aborted;          // fatal error detected
-    bool               m_reconfigured;      // reconfigure of encoder detected
+    bool               m_reconfigure;      // Encoder reconfigure in progress
 
     /* Begin intra refresh when one not in progress or else begin one as soon as the current 
      * one is done. Requires bIntraRefresh to be set.*/
@@ -152,6 +153,8 @@
 
     void printSummary();
 
+    void printReconfigureParams();
+
     char* statsString(EncStats&, char*);
 
     void configure(x265_param *param);
@@ -164,7 +167,7 @@
 
     void readAnalysisFile(x265_analysis_data* analysis, int poc);
 
-    void writeAnalysisFile(x265_analysis_data* pic);
+    void writeAnalysisFile(x265_analysis_data* pic, FrameData &curEncData);
 
     void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);

x265_1.9.tar.gz/source/encoder/entropy.cpp -> x265_2.0.tar.gz/source/encoder/entropy.cpp Changed

@@ -38,6 +38,189 @@
 
 namespace X265_NS {
 
+// initial probability for cu_transquant_bypass flag
+static const uint8_t INIT_CU_TRANSQUANT_BYPASS_FLAG[3][NUM_TQUANT_BYPASS_FLAG_CTX] =
+{
+    { 154 },
+    { 154 },
+    { 154 },
+};
+
+// initial probability for split flag
+static const uint8_t INIT_SPLIT_FLAG[3][NUM_SPLIT_FLAG_CTX] =
+{
+    { 107,  139,  126, },
+    { 107,  139,  126, },
+    { 139,  141,  157, },
+};
+
+static const uint8_t INIT_SKIP_FLAG[3][NUM_SKIP_FLAG_CTX] =
+{
+    { 197,  185,  201, },
+    { 197,  185,  201, },
+    { CNU,  CNU,  CNU, },
+};
+
+static const uint8_t INIT_MERGE_FLAG_EXT[3][NUM_MERGE_FLAG_EXT_CTX] =
+{
+    { 154, },
+    { 110, },
+    { CNU, },
+};
+
+static const uint8_t INIT_MERGE_IDX_EXT[3][NUM_MERGE_IDX_EXT_CTX] =
+{
+    { 137, },
+    { 122, },
+    { CNU, },
+};
+
+static const uint8_t INIT_PART_SIZE[3][NUM_PART_SIZE_CTX] =
+{
+    { 154,  139,  154, 154 },
+    { 154,  139,  154, 154 },
+    { 184,  CNU,  CNU, CNU },
+};
+
+static const uint8_t INIT_PRED_MODE[3][NUM_PRED_MODE_CTX] =
+{
+    { 134, },
+    { 149, },
+    { CNU, },
+};
+
+static const uint8_t INIT_INTRA_PRED_MODE[3][NUM_ADI_CTX] =
+{
+    { 183, },
+    { 154, },
+    { 184, },
+};
+
+static const uint8_t INIT_CHROMA_PRED_MODE[3][NUM_CHROMA_PRED_CTX] =
+{
+    { 152,  139, },
+    { 152,  139, },
+    {  63,  139, },
+};
+
+static const uint8_t INIT_INTER_DIR[3][NUM_INTER_DIR_CTX] =
+{
+    {  95,   79,   63,   31,  31, },
+    {  95,   79,   63,   31,  31, },
+    { CNU,  CNU,  CNU,  CNU, CNU, },
+};
+
+static const uint8_t INIT_MVD[3][NUM_MV_RES_CTX] =
+{
+    { 169,  198, },
+    { 140,  198, },
+    { CNU,  CNU, },
+};
+
+static const uint8_t INIT_REF_PIC[3][NUM_REF_NO_CTX] =
+{
+    { 153,  153 },
+    { 153,  153 },
+    { CNU,  CNU },
+};
+
+static const uint8_t INIT_DQP[3][NUM_DELTA_QP_CTX] =
+{
+    { 154,  154,  154, },
+    { 154,  154,  154, },
+    { 154,  154,  154, },
+};
+
+static const uint8_t INIT_QT_CBF[3][NUM_QT_CBF_CTX] =
+{
+    { 153,  111,  149,   92,  167,  154,  154 },
+    { 153,  111,  149,  107,  167,  154,  154 },
+    { 111,  141,   94,  138,  182,  154,  154 },
+};
+
+static const uint8_t INIT_QT_ROOT_CBF[3][NUM_QT_ROOT_CBF_CTX] =
+{
+    {  79, },
+    {  79, },
+    { CNU, },
+};
+
+static const uint8_t INIT_LAST[3][NUM_CTX_LAST_FLAG_XY] =
+{
+    { 125,  110,  124,  110,   95,   94,  125,  111,  111,   79,  125,  126,  111,  111,   79,
+      108,  123,   93 },
+    { 125,  110,   94,  110,   95,   79,  125,  111,  110,   78,  110,  111,  111,   95,   94,
+      108,  123,  108 },
+    { 110,  110,  124,  125,  140,  153,  125,  127,  140,  109,  111,  143,  127,  111,   79,
+      108,  123,   63 },
+};
+
+static const uint8_t INIT_SIG_CG_FLAG[3][2 * NUM_SIG_CG_FLAG_CTX] =
+{
+    { 121,  140,
+      61,  154, },
+    { 121,  140,
+      61,  154, },
+    {  91,  171,
+       134,  141, },
+};
+
+static const uint8_t INIT_SIG_FLAG[3][NUM_SIG_FLAG_CTX] =
+{
+    { 170,  154,  139,  153,  139,  123,  123,   63,  124,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  138,  138,  122,  121,  122,  121,  167,  151,  183,  140,  151,  183,  140,  },
+    { 155,  154,  139,  153,  139,  123,  123,   63,  153,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  123,  123,  107,  121,  107,  121,  167,  151,  183,  140,  151,  183,  140,  },
+    { 111,  111,  125,  110,  110,   94,  124,  108,  124,  107,  125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  140,  139,  182,  182,  152,  136,  152,  136,  153,  136,  139,  111,  136,  139,  111,  },
+};
+
+static const uint8_t INIT_ONE_FLAG[3][NUM_ONE_FLAG_CTX] =
+{
+    { 154,  196,  167,  167,  154,  152,  167,  182,  182,  134,  149,  136,  153,  121,  136,  122,  169,  208,  166,  167,  154,  152,  167,  182, },
+    { 154,  196,  196,  167,  154,  152,  167,  182,  182,  134,  149,  136,  153,  121,  136,  137,  169,  194,  166,  167,  154,  167,  137,  182, },
+    { 140,   92,  137,  138,  140,  152,  138,  139,  153,   74,  149,   92,  139,  107,  122,  152,  140,  179,  166,  182,  140,  227,  122,  197, },
+};
+
+static const uint8_t INIT_ABS_FLAG[3][NUM_ABS_FLAG_CTX] =
+{
+    { 107,  167,   91,  107,  107,  167, },
+    { 107,  167,   91,  122,  107,  167, },
+    { 138,  153,  136,  167,  152,  152, },
+};
+
+static const uint8_t INIT_MVP_IDX[3][NUM_MVP_IDX_CTX] =
+{
+    { 168 },
+    { 168 },
+    { CNU },
+};
+
+static const uint8_t INIT_SAO_MERGE_FLAG[3][NUM_SAO_MERGE_FLAG_CTX] =
+{
+    { 153,  },
+    { 153,  },
+    { 153,  },
+};
+
+static const uint8_t INIT_SAO_TYPE_IDX[3][NUM_SAO_TYPE_IDX_CTX] =
+{
+    { 160, },
+    { 185, },
+    { 200, },
+};
+
+static const uint8_t INIT_TRANS_SUBDIV_FLAG[3][NUM_TRANS_SUBDIV_FLAG_CTX] =
+{
+    { 224,  167,  122, },
+    { 124,  138,   94, },
+    { 153,  138,  138, },
+};
+
+static const uint8_t INIT_TRANSFORMSKIP_FLAG[3][2 * NUM_TRANSFORMSKIP_FLAG_CTX] =
+{
+    { 139,  139 },
+    { 139,  139 },
+    { 139,  139 },
+};
+
 Entropy::Entropy()
 {
     markValid();
@@ -306,7 +489,7 @@
 {
     for (int sizeId = 0; sizeId < ScalingList::NUM_SIZES; sizeId++)
     {
-        for (int listId = 0; listId < ScalingList::NUM_LISTS; listId++)
+        for (int listId = 0; listId < ScalingList::NUM_LISTS; listId += (sizeId == 3) ? 3 : 1)
         {
             int predList = scalingList.checkPredMode(sizeId, listId);
             WRITE_FLAG(predList < 0, "scaling_list_pred_mode_flag");
@@ -334,12 +517,7 @@

x265_1.9.tar.gz/source/encoder/entropy.h -> x265_2.0.tar.gz/source/encoder/entropy.h Changed

@@ -162,13 +162,13 @@
 
     void codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
     void codePredInfo(const CUData& cu, uint32_t absPartIdx);
-    inline void codeQtCbfLuma(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth) { codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth); }
 
     void codeQtCbfChroma(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t tuDepth, bool lowestLevel);
     void codeCoeff(const CUData& cu, uint32_t absPartIdx, bool& bCodeDQP, const uint32_t depthRange[2]);
     void codeCoeffNxN(const CUData& cu, const coeff_t* coef, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype);
 
     inline void codeSaoMerge(uint32_t code)                          { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); }
+    inline void codeSaoType(uint32_t code)                           { encodeBin(code, m_contextState[OFF_SAO_TYPE_IDX_CTX]); }
     inline void codeMVPIdx(uint32_t symbol)                          { encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]); }
     inline void codeMergeFlag(const CUData& cu, uint32_t absPartIdx) { encodeBin(cu.m_mergeFlag[absPartIdx], m_contextState[OFF_MERGE_FLAG_EXT_CTX]); }
     inline void codeSkipFlag(const CUData& cu, uint32_t absPartIdx)  { encodeBin(cu.isSkipped(absPartIdx), m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]); }
@@ -182,6 +182,8 @@
     inline void codeTransformSkipFlags(uint32_t transformSkip, TextType ttype) { encodeBin(transformSkip, m_contextState[OFF_TRANSFORMSKIP_FLAG_CTX + (ttype ? NUM_TRANSFORMSKIP_FLAG_CTX : 0)]); }
     void codeDeltaQP(const CUData& cu, uint32_t absPartIdx);
     void codeSaoOffset(const SaoCtuParam& ctuParam, int plane);
+    void codeSaoOffsetEO(int *offset, int typeIdx, int plane);
+    void codeSaoOffsetBO(int *offset, int bandPos, int plane);
 
     /* RDO functions */
     void estBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const;

x265_1.9.tar.gz/source/encoder/frameencoder.cpp -> x265_2.0.tar.gz/source/encoder/frameencoder.cpp Changed

@@ -41,6 +41,7 @@
 FrameEncoder::FrameEncoder()
 {
     m_prevOutputTime = x265_mdate();
+    m_reconfigure = false;
     m_isFrameEncoder = true;
     m_threadActive = true;
     m_slicetypeWaitTime = 0;
@@ -104,6 +105,7 @@
     m_param = top->m_param;
     m_numRows = numRows;
     m_numCols = numCols;
+    m_reconfigure = false;
     m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked)
                         || (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ?
                         2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
@@ -213,7 +215,6 @@
 {
     m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime;
     m_frame = curFrame;
-    m_param = curFrame->m_param;
     m_sliceType = curFrame->m_lowres.sliceType;
     curFrame->m_encData->m_frameEncoderID = m_jpId;
     curFrame->m_encData->m_jobProvider = this;
@@ -333,18 +334,40 @@
     // Weighted Prediction parameters estimation.
     bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred;
     bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred;
+
+    WeightParam* reuseWP = NULL;
+    if (m_param->analysisMode && (bUseWeightP || bUseWeightB))
+        reuseWP = ((analysis_inter_data*)m_frame->m_analysisData.interData)->wt;
+
     if (bUseWeightP || bUseWeightB)
     {
 #if DETAILED_CU_STATS
         m_cuStats.countWeightAnalyze++;
         ScopedElapsedTime time(m_cuStats.weightAnalyzeTime);
 #endif
-        WeightAnalysis wa(*this);
-        if (m_pool && wa.tryBondPeers(*this, 1))
-            /* use an idle worker for weight analysis */
-            wa.waitForExit();
+        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
+        {
+            for (int list = 0; list < slice->isInterB() + 1; list++) 
+            {
+                for (int plane = 0; plane < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
+                {
+                    for (int ref = 1; ref < slice->m_numRefIdx[list]; ref++)
+                        SET_WEIGHT(slice->m_weightPredTable[list][ref][plane], false, 1 << reuseWP->log2WeightDenom, reuseWP->log2WeightDenom, 0);
+                    slice->m_weightPredTable[list][0][plane] = *(reuseWP++);
+                }
+            }
+        }
         else
-            weightAnalyse(*slice, *m_frame, *m_param);
+        {
+            WeightAnalysis wa(*this);
+            if (m_pool && wa.tryBondPeers(*this, 1))
+                /* use an idle worker for weight analysis */
+                wa.waitForExit();
+            else
+                weightAnalyse(*slice, *m_frame, *m_param);
+
+        }
+
     }
     else
         slice->disableWeights();
@@ -361,6 +384,12 @@
             slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
             m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
         }
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && (bUseWeightP || bUseWeightB))
+        {
+            for (int i = 0; i < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
+                *(reuseWP++) = slice->m_weightPredTable[l][0][i];
+        }
+
     }
 
     int numTLD;
@@ -371,6 +400,7 @@
 
     /* Get the QP for this frame from rate control. This call may block until
      * frames ahead of it in encode order have called rateControlEnd() */
+    m_rce.encodeOrder = m_frame->m_encodeOrder;
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
     m_rce.newQp = qp;
 
@@ -409,7 +439,7 @@
 
     m_initSliceContext.resetEntropy(*slice);
 
-    m_frameFilter.start(m_frame, m_initSliceContext, qp);
+    m_frameFilter.start(m_frame, m_initSliceContext);
 
     /* ensure all rows are blocked prior to initializing row CTU counters */
     WaveFront::clearEnabledRowMask();
@@ -969,44 +999,48 @@
         /* Deblock with idle threading */
         if (m_param->bEnableLoopFilter | m_param->bEnableSAO)
         {
-            // TODO: Multiple Threading
-            // Delay ONE row to avoid Intra Prediction Conflict
-            if (m_pool && (row >= 1))
+            // NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO
+            if (!bIsVbv)
             {
-                // Waitting last threading finish
-                m_frameFilter.m_parallelFilter[row - 1].waitForExit();
+                // TODO: Multiple Threading
+                // Delay ONE row to avoid Intra Prediction Conflict
+                if (m_pool && (row >= 1))
+                {
+                    // Waitting last threading finish
+                    m_frameFilter.m_parallelFilter[row - 1].waitForExit();
 
-                // Processing new group
-                int allowCol = col;
+                    // Processing new group
+                    int allowCol = col;
 
-                // avoid race condition on last column
-                if (row >= 2)
-                {
-                    allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
-                                                              : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
+                    // avoid race condition on last column
+                    if (row >= 2)
+                    {
+                        allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
+                                                                  : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
+                    }
+                    m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
+                    m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1);
                 }
-                m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
-                m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1);
-            }
 
-            // Last Row may start early
-            if (m_pool && (row == m_numRows - 1))
-            {
-                // Waiting for the last thread to finish
-                m_frameFilter.m_parallelFilter[row].waitForExit();
+                // Last Row may start early
+                if (m_pool && (row == m_numRows - 1))
+                {
+                    // Waiting for the last thread to finish
+                    m_frameFilter.m_parallelFilter[row].waitForExit();
 
-                // Deblocking last row
-                int allowCol = col;
+                    // Deblocking last row
+                    int allowCol = col;
 
-                // avoid race condition on last column
-                if (row >= 2)
-                {
-                    allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
-                                                              : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
+                    // avoid race condition on last column
+                    if (row >= 2)
+                    {
+                        allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
+                                                                  : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
+                    }
+                    m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
+                    m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1);
                 }
-                m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
-                m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1);
-            }
+            } // end of !bIsVbv
         }
         // Both Loopfilter and SAO Disabled
         else
@@ -1179,7 +1213,9 @@
     uint32_t rowCount = 0;
     if (m_param->rc.rateControlMode == X265_RC_ABR || bIsVbv)
     {
-        if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
+        if (!m_rce.encodeOrder)
+            rowCount = m_numRows - 1;
+        else if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
             rowCount = X265_MIN((m_numRows + 1) / 2, m_numRows - 1);
         else
             rowCount = X265_MIN(m_refLagRows, m_numRows - 1);

x265_1.9.tar.gz/source/encoder/frameencoder.h -> x265_2.0.tar.gz/source/encoder/frameencoder.h Changed

x265_1.9.tar.gz/source/encoder/framefilter.cpp -> x265_2.0.tar.gz/source/encoder/framefilter.cpp Changed

@@ -54,7 +54,7 @@
 
 void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols)
 {
-    m_param = top->m_param;
+    m_param = frame->m_param;
     m_frameEncoder = frame;
     m_numRows = numRows;
     m_numCols = numCols;
@@ -103,7 +103,7 @@
 
 }
 
-void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
+void FrameFilter::start(Frame *frame, Entropy& initState)
 {
     m_frame = frame;
 
@@ -113,7 +113,7 @@
         for(int row = 0; row < m_numRows; row++)
         {
             if (m_param->bEnableSAO)
-                m_parallelFilter[row].m_sao.startSlice(frame, initState, qp);
+                m_parallelFilter[row].m_sao.startSlice(frame, initState);
 
             m_parallelFilter[row].m_lastCol.set(0);
             m_parallelFilter[row].m_allowedCol.set(0);
@@ -198,14 +198,14 @@
     }
 }
 
-void FrameFilter::ParallelFilter::processSaoUnitCu(SAOParam *saoParam, int col)
+void FrameFilter::ParallelFilter::processSaoCTU(SAOParam *saoParam, int col)
 {
     // TODO: apply SAO on CU and copy back soon, is it necessary?
     if (saoParam->bSaoFlag[0])
-        m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row, col);
+        m_sao.generateLumaOffsets(saoParam->ctuParam[0], m_row, col);
 
     if (saoParam->bSaoFlag[1])
-        m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row, col);
+        m_sao.generateChromaOffsets(saoParam->ctuParam, m_row, col);
 
     if (m_encData->m_slice->m_pps->bTransquantBypassEnabled)
     {
@@ -320,11 +320,14 @@
     const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap;
     PicYuv* reconPic = m_encData->m_reconPic;
     const int colStart = m_lastCol.get();
-    // TODO: Waiting previous row finish or simple clip on it?
-    const int colEnd = m_allowedCol.get();
     const int numCols = m_frameFilter->m_numCols;
+    // TODO: Waiting previous row finish or simple clip on it?
+    int colEnd = m_allowedCol.get();
 
     // Avoid threading conflict
+    if (m_prevRow && colEnd > m_prevRow->m_lastDeblocked.get())
+        colEnd = m_prevRow->m_lastDeblocked.get();
+
     if (colStart >= colEnd)
         return;
 
@@ -368,7 +371,7 @@
                 if (m_row >= 1 && col >= 3)
                 {
                     // Must delay 1 row to avoid thread data race conflict
-                    m_prevRow->processSaoUnitCu(saoParam, col - 3);
+                    m_prevRow->processSaoCTU(saoParam, col - 3);
                     m_prevRow->processPostCu(col - 3);
                 }
             }
@@ -409,19 +412,19 @@
             // Process Previous Rows SAO CU
             if (m_row >= 1 && numCols >= 3)
             {
-                m_prevRow->processSaoUnitCu(saoParam, numCols - 3);
+                m_prevRow->processSaoCTU(saoParam, numCols - 3);
                 m_prevRow->processPostCu(numCols - 3);
             }
 
             if (m_row >= 1 && numCols >= 2)
             {
-                m_prevRow->processSaoUnitCu(saoParam, numCols - 2);
+                m_prevRow->processSaoCTU(saoParam, numCols - 2);
                 m_prevRow->processPostCu(numCols - 2);
             }
 
             if (m_row >= 1 && numCols >= 1)
             {
-                m_prevRow->processSaoUnitCu(saoParam, numCols - 1);
+                m_prevRow->processSaoCTU(saoParam, numCols - 1);
                 m_prevRow->processPostCu(numCols - 1);
             }
 
@@ -475,7 +478,7 @@
                 for(int col = 0; col < m_numCols; col++)
                 {
                     // NOTE: must use processSaoUnitCu(), it include TQBypass logic
-                    m_parallelFilter[row].processSaoUnitCu(saoParam, col);
+                    m_parallelFilter[row].processSaoCTU(saoParam, col);
                 }
             }
 
@@ -550,10 +553,10 @@
         pixel *fenc = m_frame->m_fencPic->m_picOrg[0];
         intptr_t stride1 = reconPic->m_stride;
         intptr_t stride2 = m_frame->m_fencPic->m_stride;
-        uint32_t bEnd = ((row + 1) == (this->m_numRows - 1));
+        uint32_t bEnd = ((row) == (this->m_numRows - 1));
         uint32_t bStart = (row == 0);
         uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
-        uint32_t maxPixY = (row + 1) * g_maxCUSize - 4 * !bEnd;
+        uint32_t maxPixY = X265_MIN((row + 1) * g_maxCUSize - 4 * !bEnd, (uint32_t)m_param->sourceHeight);
         uint32_t ssim_cnt;
         x265_emms();
 
@@ -723,7 +726,7 @@
         {
             std::swap(sum0, sum1);
             for (uint32_t x = 0; x < width; x += 2)
-                primitives.ssim_4x4x2_core(&pix1[(4 * x + (z * stride1))], stride1, &pix2[(4 * x + (z * stride2))], stride2, &sum0[x]);
+                primitives.ssim_4x4x2_core(&pix1[4 * (x + (z * stride1))], stride1, &pix2[4 * (x + (z * stride2))], stride2, &sum0[x]);
         }
 
         for (uint32_t x = 0; x < width - 1; x += 4)

x265_1.9.tar.gz/source/encoder/framefilter.h -> x265_2.0.tar.gz/source/encoder/framefilter.h Changed

x265_1.9.tar.gz/source/encoder/level.cpp -> x265_2.0.tar.gz/source/encoder/level.cpp Changed

@@ -131,6 +131,14 @@
         vps.ptl.levelIdc = Level::LEVEL8_5;
         vps.ptl.tierFlag = Level::MAIN;
     }
+    else if (param.uhdBluray)
+    {
+        i = 8;
+        vps.ptl.levelIdc = levels[i].levelEnum;
+        vps.ptl.tierFlag = Level::HIGH;
+        vps.ptl.minCrForLevel = levels[i].minCompressionRatio;
+        vps.ptl.maxLumaSrForLevel = levels[i].maxLumaSamplesPerSecond;
+    }
     else for (i = 0; i < NumLevels; i++)
     {
         if (lumaSamples > levels[i].maxLumaSamples)
@@ -145,8 +153,10 @@
             continue;
         else if (param.sourceHeight > sqrt(levels[i].maxLumaSamples * 8.0f))
             continue;
-
+        else if (param.levelIdc && param.levelIdc != levels[i].levelIdc)
+            continue;
         uint32_t maxDpbSize = MaxDpbPicBuf;
+
         if (lumaSamples <= (levels[i].maxLumaSamples >> 2))
             maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16);
         else if (lumaSamples <= (levels[i].maxLumaSamples >> 1))
@@ -188,7 +198,7 @@
             CHECK_RANGE((uint32_t)param.rc.vbvBufferSize, levels[i].maxCpbSizeMain, levels[i].maxCpbSizeHigh))
         {
             /* The bitrate or buffer size are out of range for Main tier, but in
-             * range for High tier. If the user requested High tier then give
+             * range for High tier. If the user allowed High tier then give
              * them High tier at this level.  Otherwise allow the loop to
              * progress to the Main tier of the next level */
             if (param.bHighTier)
@@ -279,7 +289,7 @@
 bool enforceLevel(x265_param& param, VPS& vps)
 {
     vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes;
-    vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + vps.numReorderPics);
+    vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + 1);
 
     /* no level specified by user, just auto-detect from the configuration */
     if (param.levelIdc <= 0)
@@ -290,17 +300,14 @@
         level++;
     if (levels[level].levelIdc != param.levelIdc)
     {
-        x265_log(&param, X265_LOG_WARNING, "specified level %d does not exist\n", param.levelIdc);
+        x265_log(&param, X265_LOG_ERROR, "specified level %d does not exist\n", param.levelIdc);
         return false;
     }
 
     LevelSpec& l = levels[level];
-    bool highTier = !!param.bHighTier;
-    if (highTier && l.maxBitrateHigh == MAX_UINT)
-    {
-        highTier = false;
-        x265_log(&param, X265_LOG_WARNING, "Level %s has no High tier, using Main tier\n", l.name);
-    }
+
+    //highTier is allowed for this level and has not been explicitly disabled. This does not mean it is the final chosen tier
+    bool allowHighTier = l.maxBitrateHigh < MAX_UINT && param.bHighTier;
 
     uint32_t lumaSamples = param.sourceWidth * param.sourceHeight;
     uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom));
@@ -313,47 +320,51 @@
         ok = false;
     if (!ok)
     {
-        x265_log(&param, X265_LOG_WARNING, "picture dimensions are out of range for specified level\n");
+        x265_log(&param, X265_LOG_ERROR, "picture dimensions are out of range for specified level\n");
         return false;
     }
     else if (samplesPerSec > l.maxLumaSamplesPerSecond)
     {
-        x265_log(&param, X265_LOG_WARNING, "frame rate is out of range for specified level\n");
+        x265_log(&param, X265_LOG_ERROR, "frame rate is out of range for specified level\n");
         return false;
     }
 
-    if ((uint32_t)param.rc.vbvMaxBitrate > (highTier ? l.maxBitrateHigh : l.maxBitrateMain))
+    /* Adjustments of Bitrate, VBV buffer size, refs will be triggered only if specified params do not fit 
+     * within the max limits of that level (high tier if allowed, main otherwise)
+     */
+
+    if ((uint32_t)param.rc.vbvMaxBitrate > (allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain))
     {
-        param.rc.vbvMaxBitrate = highTier ? l.maxBitrateHigh : l.maxBitrateMain;
-        x265_log(&param, X265_LOG_INFO, "lowering VBV max bitrate to %dKbps\n", param.rc.vbvMaxBitrate);
+        param.rc.vbvMaxBitrate = allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain;
+        x265_log(&param, X265_LOG_WARNING, "lowering VBV max bitrate to %dKbps\n", param.rc.vbvMaxBitrate);
     }
-    if ((uint32_t)param.rc.vbvBufferSize > (highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain))
+    if ((uint32_t)param.rc.vbvBufferSize > (allowHighTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain))
     {
-        param.rc.vbvBufferSize = highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
-        x265_log(&param, X265_LOG_INFO, "lowering VBV buffer size to %dKb\n", param.rc.vbvBufferSize);
+        param.rc.vbvBufferSize = allowHighTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
+        x265_log(&param, X265_LOG_WARNING, "lowering VBV buffer size to %dKb\n", param.rc.vbvBufferSize);
     }
 
     switch (param.rc.rateControlMode)
     {
     case X265_RC_ABR:
-        if ((uint32_t)param.rc.bitrate > (highTier ? l.maxBitrateHigh : l.maxBitrateMain))
+        if ((uint32_t)param.rc.bitrate > (allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain))
         {
-            param.rc.bitrate = l.maxBitrateHigh;
-            x265_log(&param, X265_LOG_INFO, "lowering target bitrate to High tier limit of %dKbps\n", param.rc.bitrate);
+            param.rc.bitrate =  allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain;
+            x265_log(&param, X265_LOG_WARNING, "lowering target bitrate to High tier limit of %dKbps\n", param.rc.bitrate);
         }
         break;
 
     case X265_RC_CQP:
-        x265_log(&param, X265_LOG_WARNING, "Constant QP is inconsistent with specifying a decoder level, no bitrate guarantee is possible.\n");
+        x265_log(&param, X265_LOG_ERROR, "Constant QP is inconsistent with specifying a decoder level, no bitrate guarantee is possible.\n");
         return false;
 
     case X265_RC_CRF:
         if (!param.rc.vbvBufferSize || !param.rc.vbvMaxBitrate)
         {
             if (!param.rc.vbvMaxBitrate)
-                param.rc.vbvMaxBitrate = highTier ? l.maxBitrateHigh : l.maxBitrateMain;
+                param.rc.vbvMaxBitrate = allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain;
             if (!param.rc.vbvBufferSize)
-                param.rc.vbvBufferSize = highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
+                param.rc.vbvBufferSize = allowHighTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
             x265_log(&param, X265_LOG_WARNING, "Specifying a decoder level with constant rate factor rate-control requires\n");
             x265_log(&param, X265_LOG_WARNING, "enabling VBV with vbv-bufsize=%dkb vbv-maxrate=%dkbps. VBV outputs are non-deterministic!\n",
                      param.rc.vbvBufferSize, param.rc.vbvMaxBitrate);
@@ -368,27 +379,30 @@
     /* The value of sps_max_dec_pic_buffering_minus1[ HighestTid ] + 1 shall be less than or equal to MaxDpbSize */
     const uint32_t MaxDpbPicBuf = 6;
     uint32_t maxDpbSize = MaxDpbPicBuf;
-    if (lumaSamples <= (l.maxLumaSamples >> 2))
-        maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16);
-    else if (lumaSamples <= (l.maxLumaSamples >> 1))
-        maxDpbSize = X265_MIN(2 * MaxDpbPicBuf, 16);
-    else if (lumaSamples <= ((3 * l.maxLumaSamples) >> 2))
-        maxDpbSize = X265_MIN((4 * MaxDpbPicBuf) / 3, 16);
+    if (!param.uhdBluray) /* Do not change MaxDpbPicBuf for UHD-Bluray */
+    {
+        if (lumaSamples <= (l.maxLumaSamples >> 2))
+            maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16);
+        else if (lumaSamples <= (l.maxLumaSamples >> 1))
+            maxDpbSize = X265_MIN(2 * MaxDpbPicBuf, 16);
+        else if (lumaSamples <= ((3 * l.maxLumaSamples) >> 2))
+            maxDpbSize = X265_MIN((4 * MaxDpbPicBuf) / 3, 16);
+    }
 
     int savedRefCount = param.maxNumReferences;
     while (vps.maxDecPicBuffering > maxDpbSize && param.maxNumReferences > 1)
     {
         param.maxNumReferences--;
-        vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + vps.numReorderPics);
+        vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + 1);
     }
     if (param.maxNumReferences != savedRefCount)
-        x265_log(&param, X265_LOG_INFO, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
+        x265_log(&param, X265_LOG_WARNING, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
 
     /* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */
     if (param.levelIdc >= 50 && param.maxCUSize < 32)
     {
         param.maxCUSize = 32;
-        x265_log(&param, X265_LOG_INFO, "Levels 5.0 and above require a maximum CTU size of at least 32, using --ctu 32\n");
+        x265_log(&param, X265_LOG_WARNING, "Levels 5.0 and above require a maximum CTU size of at least 32, using --ctu 32\n");
     }
 
     /* The value of NumPocTotalCurr shall be less than or equal to 8 */
@@ -396,7 +410,7 @@
     if (numPocTotalCurr > 8)
     {
         param.maxNumReferences = 8 - !!param.bframes;
-        x265_log(&param, X265_LOG_INFO, "Lowering max references to %d to meet numPocTotalCurr requirement\n", param.maxNumReferences);
+        x265_log(&param, X265_LOG_WARNING, "Lowering max references to %d to meet numPocTotalCurr requirement\n", param.maxNumReferences);
     }
 
     return true;

x265_1.9.tar.gz/source/encoder/motion.cpp -> x265_2.0.tar.gz/source/encoder/motion.cpp Changed

@@ -111,10 +111,8 @@
     chromaSatd = NULL;
 }
 
-void MotionEstimate::init(int method, int refine, int csp)
+void MotionEstimate::init(int csp)
 {
-    searchMethod = method;
-    subpelRefine = refine;
     fencPUYuv.create(FENC_STRIDE, csp);
 }
 
@@ -162,7 +160,7 @@
 }
 
 /* Called by lookahead, luma only, no use of PicYuv */
-void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight)
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
 {
     partEnum = partitionFromSizes(pwidth, pheight);
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
@@ -175,13 +173,17 @@
     blockOffset = offset;
     absPartIdx = ctuAddr = -1;
 
+    /* Search params */
+    searchMethod = method;
+    subpelRefine = refine;
+
     /* copy PU block into cache */
     primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
     X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
 }
 
 /* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
-void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight)
+void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
 {
     partEnum = partitionFromSizes(pwidth, pheight);
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
@@ -192,9 +194,13 @@
 
     chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
 
+    /* Set search characteristics */
+    searchMethod = method;
+    subpelRefine = refine;
+
     /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
      * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
-    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400);
+    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400 && bChroma);
     X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
 
     ctuAddr = _ctuAddr;
@@ -1174,15 +1180,17 @@
 int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
 {
     intptr_t refStride = ref->lumaStride;
-    pixel *fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
+    const pixel* fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
     int xFrac = qmv.x & 0x3;
     int yFrac = qmv.y & 0x3;
     int cost;
-    intptr_t lclStride = fencPUYuv.m_size;
-    X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
+    const intptr_t fencStride = FENC_STRIDE;
+    X265_CHECK(fencPUYuv.m_size == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
 
+    ALIGN_VAR_32(pixel, subpelbuf[MAX_CU_SIZE * MAX_CU_SIZE]);
+    
     if (!(yFrac | xFrac))
-        cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride);
+        cost = cmp(fencPUYuv.m_buf[0], fencStride, fref, refStride);
     else
     {
         /* we are taking a short-cut here if the reference is weighted. To be
@@ -1190,15 +1198,13 @@
          * the final 16bit values prior to rounding and down shifting. Instead we
          * are simply interpolating the weighted full-pel pixels. Not 100%
          * accurate but good enough for fast qpel ME */
-        ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
         if (!yFrac)
-            primitives.pu[partEnum].luma_hpp(fref, refStride, subpelbuf, lclStride, xFrac);
+            primitives.pu[partEnum].luma_hpp(fref, refStride, subpelbuf, blockwidth, xFrac);
         else if (!xFrac)
-            primitives.pu[partEnum].luma_vpp(fref, refStride, subpelbuf, lclStride, yFrac);
+            primitives.pu[partEnum].luma_vpp(fref, refStride, subpelbuf, blockwidth, yFrac);
         else
-            primitives.pu[partEnum].luma_hvpp(fref, refStride, subpelbuf, lclStride, xFrac, yFrac);
-
-        cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride);
+            primitives.pu[partEnum].luma_hvpp(fref, refStride, subpelbuf, blockwidth, xFrac, yFrac);
+        cost = cmp(fencPUYuv.m_buf[0], fencStride, subpelbuf, blockwidth);
     }
 
     if (bChromaSATD)
@@ -1206,12 +1212,12 @@
         int csp    = fencPUYuv.m_csp;
         int hshift = fencPUYuv.m_hChromaShift;
         int vshift = fencPUYuv.m_vChromaShift;
-        int shiftHor = (2 + hshift);
-        int shiftVer = (2 + vshift);
-        lclStride = fencPUYuv.m_csize;
+        int mvx = qmv.x << (1 - hshift);
+        int mvy = qmv.y << (1 - vshift);
+        intptr_t fencStrideC = fencPUYuv.m_csize;
 
         intptr_t refStrideC = ref->reconPic->m_strideC;
-        intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC;
+        intptr_t refOffset = (mvx >> 3) + (mvy >> 3) * refStrideC;
 
         const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
         const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
@@ -1219,48 +1225,46 @@
         X265_CHECK((hshift == 0) || (hshift == 1), "hshift must be 0 or 1\n");
         X265_CHECK((vshift == 0) || (vshift == 1), "vshift must be 0 or 1\n");
 
-        xFrac = qmv.x & (hshift ? 7 : 3);
-        yFrac = qmv.y & (vshift ? 7 : 3);
+        xFrac = mvx & 7;
+        yFrac = mvy & 7;
 
         if (!(yFrac | xFrac))
         {
-            cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC);
-            cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC);
+            cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, refCb, refStrideC);
+            cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, refCr, refStrideC);
         }
         else
         {
-            ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
+            int blockwidthC = blockwidth >> hshift;
+
             if (!yFrac)
             {
-                primitives.chroma[csp].pu[partEnum].filter_hpp(refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
-                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+                primitives.chroma[csp].pu[partEnum].filter_hpp(refCb, refStrideC, subpelbuf, blockwidthC, xFrac);
+                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);
 
-                primitives.chroma[csp].pu[partEnum].filter_hpp(refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
-                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+                primitives.chroma[csp].pu[partEnum].filter_hpp(refCr, refStrideC, subpelbuf, blockwidthC, xFrac);
+                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
             }
             else if (!xFrac)
             {
-                primitives.chroma[csp].pu[partEnum].filter_vpp(refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
-                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+                primitives.chroma[csp].pu[partEnum].filter_vpp(refCb, refStrideC, subpelbuf, blockwidthC, yFrac);
+                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);
 
-                primitives.chroma[csp].pu[partEnum].filter_vpp(refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
-                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+                primitives.chroma[csp].pu[partEnum].filter_vpp(refCr, refStrideC, subpelbuf, blockwidthC, yFrac);
+                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
             }
             else
             {
-                ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]);
-
-                int extStride = blockwidth >> hshift;
-                int filterSize = NTAPS_CHROMA;
-                int halfFilterSize = (filterSize >> 1);
+                ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
+                const int halfFilterSize = (NTAPS_CHROMA >> 1);
 
-                primitives.chroma[csp].pu[partEnum].filter_hps(refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
-                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
-                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+                primitives.chroma[csp].pu[partEnum].filter_hps(refCb, refStrideC, immed, blockwidthC, xFrac, 1);
+                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, yFrac);
+                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);
 
-                primitives.chroma[csp].pu[partEnum].filter_hps(refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
-                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
-                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+                primitives.chroma[csp].pu[partEnum].filter_hps(refCr, refStrideC, immed, blockwidthC, xFrac, 1);
+                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, yFrac);
+                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
             }
         }
     }

x265_1.9.tar.gz/source/encoder/motion.h -> x265_2.0.tar.gz/source/encoder/motion.h Changed

x265_1.9.tar.gz/source/encoder/ratecontrol.cpp -> x265_2.0.tar.gz/source/encoder/ratecontrol.cpp Changed

@@ -53,7 +53,7 @@
 {\
     bErr = 0;\
     p = strstr(opts, opt "=");\
-    char* q = strstr(opts, "no-"opt);\
+    char* q = strstr(opts, "no-" opt);\
     if (p && sscanf(p, opt "=%d" , &i) && param_val != i)\
         bErr = 1;\
     else if (!param_val && !q && !p)\
@@ -91,24 +91,6 @@
     return z + lut[x];
 }
 
-inline void reduceFraction(int* n, int* d)
-{
-    int a = *n;
-    int b = *d;
-    int c;
-    if (!a || !b)
-        return;
-    c = a % b;
-    while (c)
-    {
-        a = b;
-        b = c;
-        c = a % b;
-    }
-    *n /= b;
-    *d /= b;
-}
-
 inline char *strcatFilename(const char *input, const char *suffix)
 {
     char *output = X265_MALLOC(char, strlen(input) + strlen(suffix) + 1);
@@ -190,6 +172,8 @@
     m_numEntries = 0;
     m_isSceneTransition = false;
     m_lastPredictorReset = 0;
+    m_avgPFrameQp = 0;
+    m_isFirstMiniGop = false;
     if (m_param->rc.rateControlMode == X265_RC_CRF)
     {
         m_param->rc.qp = (int)m_param->rc.rfConstant;
@@ -212,7 +196,7 @@
             m_rateFactorMaxDecrement = m_param->rc.rfConstant - m_param->rc.rfConstantMin;
     }
     m_isAbr = m_param->rc.rateControlMode != X265_RC_CQP && !m_param->rc.bStatRead;
-    m_2pass = (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0) && m_param->rc.bStatRead;
+    m_2pass = m_param->rc.rateControlMode != X265_RC_CQP && m_param->rc.bStatRead;
     m_bitrate = m_param->rc.bitrate * 1000;
     m_frameDuration = (double)m_param->fpsDenom / m_param->fpsNum;
     m_qp = m_param->rc.qp;
@@ -225,8 +209,10 @@
     m_statFileOut = NULL;
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
     m_rce2Pass = NULL;
+    m_encOrder = NULL;
     m_lastBsliceSatdCost = 0;
     m_movingAvgSum = 0.0;
+    m_isNextGop = false;
 
     // vbv initialization
     m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
@@ -288,9 +274,13 @@
     m_ipOffset = 6.0 * X265_LOG2(m_param->rc.ipFactor);
     m_pbOffset = 6.0 * X265_LOG2(m_param->rc.pbFactor);
 
+    for (int i = 0; i < QP_MAX_MAX; i++)
+        m_qpToEncodedBits[i] = 0;
+
     /* Adjust the first frame in order to stabilize the quality level compared to the rest */
 #define ABR_INIT_QP_MIN (24)
-#define ABR_INIT_QP_MAX (40)
+#define ABR_INIT_QP_MAX (37)
+#define ABR_INIT_QP_GRAIN_MAX (33)
 #define ABR_SCENECUT_INIT_QP_MIN (12)
 #define CRF_INIT_QP (int)m_param->rc.rfConstant
     for (int i = 0; i < 3; i++)
@@ -361,6 +351,7 @@
         m_amortizeFraction = 0.85;
         m_amortizeFrames = m_param->totalFrames / 2;
     }
+
     for (int i = 0; i < s_slidingWindowFrames; i++)
     {
         m_satdCostWindow[i] = 0;
@@ -370,15 +361,22 @@
     m_isPatternPresent = false;
     m_numBframesInPattern = 0;
 
-    /* 720p videos seem to be a good cutoff for cplxrSum */
-    double tuneCplxFactor = (m_param->rc.cuTree && m_ncu > 3600) ? 2.5 : 1;
+    m_isGrainEnabled = false;
+    if(m_param->rc.bEnableGrain) // tune for grainy content OR equal p-b frame sizes
+    m_isGrainEnabled = true;
+    for (int i = 0; i < 3; i++)
+    m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
+    m_avgPFrameQp = 0 ;
 
+    /* 720p videos seem to be a good cutoff for cplxrSum */
+    double tuneCplxFactor = (m_ncu > 3600 && m_param->rc.cuTree) ? 2.5 : m_isGrainEnabled ? 1.9 : 1;
     /* estimated ratio that produces a reasonable QP for the first I-frame */
     m_cplxrSum = .01 * pow(7.0e5, m_qCompress) * pow(m_ncu, 0.5) * tuneCplxFactor;
     m_wantedBitsWindow = m_bitrate * m_frameDuration;
     m_accumPNorm = .01;
     m_accumPQp = (m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN) * m_accumPNorm;
 
+
     /* Frame Predictors used in vbv */
     initFramePredictors();
     if (!m_statFileOut && (m_param->rc.bStatWrite || m_param->rc.bStatRead))
@@ -401,11 +399,11 @@
                 char *tmpFile = strcatFilename(fileName, ".cutree");
                 if (!tmpFile)
                     return false;
-                m_cutreeStatFileIn = fopen(tmpFile, "rb");
+                m_cutreeStatFileIn = x265_fopen(tmpFile, "rb");
                 X265_FREE(tmpFile);
                 if (!m_cutreeStatFileIn)
                 {
-                    x265_log(m_param, X265_LOG_ERROR, "can't open stats file %s\n", tmpFile);
+                    x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.cutree\n", fileName);
                     return false;
                 }
             }
@@ -417,7 +415,7 @@
                 return false;
             }
             {
-                int i, j;
+                int i, j, m;
                 uint32_t k , l;
                 bool bErr = false;
                 char *opts = statsBuf;
@@ -439,6 +437,11 @@
                     x265_log(m_param, X265_LOG_ERROR, "fps specified in stats file not valid\n");
                     return false;
                 }
+                if (((p = strstr(opts, " vbv-maxrate=")) == 0 || sscanf(p, " vbv-maxrate=%d", &m) != 1) && m_param->rc.rateControlMode == X265_RC_CRF)
+                {
+                    x265_log(m_param, X265_LOG_ERROR, "Constant rate-factor is incompatible with 2pass without vbv-maxrate in the previous pass\n");
+                    return false;
+                }
                 if (k != m_param->fpsNum || l != m_param->fpsDenom)
                 {
                     x265_log(m_param, X265_LOG_ERROR, "fps mismatch with 1st pass (%u/%u vs %u/%u)\n",
@@ -564,8 +567,10 @@
                 p = next;
             }
             X265_FREE(statsBuf);
-            if (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0)
+            if (m_param->rc.rateControlMode != X265_RC_CQP)
             {
+                m_start = 0;
+                m_isQpModified = true;
                 if (!initPass2())
                     return false;
             } /* else we're using constant quant, so no need to run the bitrate allocation */
@@ -579,11 +584,11 @@
             statFileTmpname = strcatFilename(fileName, ".temp");
             if (!statFileTmpname)
                 return false;
-            m_statFileOut = fopen(statFileTmpname, "wb");
+            m_statFileOut = x265_fopen(statFileTmpname, "wb");
             X265_FREE(statFileTmpname);
             if (!m_statFileOut)
             {
-                x265_log(m_param, X265_LOG_ERROR, "can't open stats file %s\n", statFileTmpname);
+                x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.temp\n", fileName);
                 return false;
             }
             p = x265_param2string(m_param);
@@ -595,11 +600,11 @@
                 statFileTmpname = strcatFilename(fileName, ".cutree.temp");
                 if (!statFileTmpname)
                     return false;
-                m_cutreeStatFileOut = fopen(statFileTmpname, "wb");
+                m_cutreeStatFileOut = x265_fopen(statFileTmpname, "wb");
                 X265_FREE(statFileTmpname);
                 if (!m_cutreeStatFileOut)
                 {
-                    x265_log(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s\n", statFileTmpname);
+                    x265_log_file(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s.cutree.temp\n", fileName);
                     return false;
                 }
             }
@@ -647,7 +652,7 @@
 
     #undef MAX_DURATION
 }
-bool RateControl::analyseABR2Pass(int startIndex, int endIndex, uint64_t allAvailableBits)
+bool RateControl::analyseABR2Pass(uint64_t allAvailableBits)
 {
     double rateFactor, stepMult;
     double qBlur = m_param->rc.qblur;
@@ -657,21 +662,21 @@
     double *qScale, *blurredQscale;
     double baseCplx = m_ncu * (m_param->bframes ? 120 : 80);
     double clippedDuration = CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION;
-    int framesCount = endIndex - startIndex + 1;

x265_1.9.tar.gz/source/encoder/ratecontrol.h -> x265_2.0.tar.gz/source/encoder/ratecontrol.h Changed

@@ -107,6 +107,7 @@
     int      miscBits;
     int      coeffBits;
     bool     keptAsRef;
+    bool     scenecut;
 
     SEIPictureTiming *picTimingSEI;
     HRDTiming        *hrdTiming;
@@ -126,8 +127,9 @@
     bool   m_isVbv;
     bool   m_isCbr;
     bool   m_singleFrameVbv;
-
+    bool   m_isGrainEnabled;
     bool   m_isAbrReset;
+    bool   m_isNextGop;
     int    m_lastAbrResetPoc;
 
     double m_rateTolerance;
@@ -141,7 +143,8 @@
     double m_vbvMaxRate;       /* in kbps */
     double m_rateFactorMaxIncrement; /* Don't allow RF above (CRF + this value). */
     double m_rateFactorMaxDecrement; /* don't allow RF below (this value). */
-
+    double m_avgPFrameQp;
+    bool   m_isFirstMiniGop;
     Predictor m_pred[4];       /* Slice predictors to preidct bits for each Slice type - I,P,Bref and B */
     int64_t m_leadingNoBSatd;
     int     m_predType;       /* Type of slice predictors to be used - depends on the slice type */
@@ -178,7 +181,7 @@
     bool    m_isPatternPresent;
     bool    m_isSceneTransition;
     int     m_lastPredictorReset;
-
+    double  m_qpToEncodedBits[QP_MAX_MAX + 1];
     /* a common variable on which rateControlStart, rateControlEnd and rateControUpdateStats waits to
      * sync the calls to these functions. For example
      * -F2:
@@ -202,7 +205,11 @@
 
     /* 2 pass */
     bool    m_2pass;
+    bool    m_isGopReEncoded;
+    bool    m_isQpModified;
     int     m_numEntries;
+    int     m_start;
+    int     m_reencode;
     FILE*   m_statFileOut;
     FILE*   m_cutreeStatFileOut;
     FILE*   m_cutreeStatFileIn;
@@ -235,6 +242,8 @@
     bool cuTreeReadFor2Pass(Frame* curFrame);
     void hrdFullness(SEIBufferingPeriod* sei);
     int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
+    bool   initPass2();
+
 protected:
 
     static const int   s_slidingWindowFrames;
@@ -261,14 +270,14 @@
     double predictSize(Predictor *p, double q, double var);
     void   checkAndResetABR(RateControlEntry* rce, bool isFrameDone);
     double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits);
-    bool   initPass2();
-    bool   analyseABR2Pass(int startPoc, int endPoc, uint64_t allAvailableBits);
+    bool   analyseABR2Pass(uint64_t allAvailableBits);
     void   initFramePredictors();
     double getDiffLimitedQScale(RateControlEntry *rce, double q);
     double countExpectedBits(int startPos, int framesCount);
     bool   vbv2Pass(uint64_t allAvailableBits, int frameCount, int startPos);
     bool   findUnderflow(double *fills, int *t0, int *t1, int over, int framesCount);
     bool   fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax);
+    double tuneQScaleForGrain(double rcOverflow);
 };
 }
 #endif // ifndef X265_RATECONTROL_H

x265_1.9.tar.gz/source/encoder/reference.cpp -> x265_2.0.tar.gz/source/encoder/reference.cpp Changed

x265_1.9.tar.gz/source/encoder/sao.cpp -> x265_2.0.tar.gz/source/encoder/sao.cpp Changed

@@ -53,7 +53,7 @@
     return r;
 }
 
-inline int64_t estSaoDist(int32_t count, int offset, int32_t offsetOrg)
+inline int64_t estSaoDist(int32_t count, int32_t offset, int32_t offsetOrg)
 {
     return (count * offset - offsetOrg * 2) * offset;
 }
@@ -76,8 +76,6 @@
     m_countPreDblk = NULL;
     m_offsetOrgPreDblk = NULL;
     m_refDepth = 0;
-    m_lumaLambda = 0;
-    m_chromaLambda = 0;
     m_param = NULL;
     m_clipTable = NULL;
     m_clipTableBase = NULL;
@@ -120,8 +118,11 @@
 
     if (initCommon)
     {
-        CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
-        CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
+        if (m_param->bSaoNonDeblocked)
+        {
+            CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
+            CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
+        }
         CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);
 
         m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 0] = 0;
@@ -137,17 +138,16 @@
         m_clipTable = &(m_clipTableBase[rangeExt]);
 
         // Share with fast clip lookup table
-        if (initCommon)
-        {
-            for (int i = 0; i < rangeExt; i++)
-                m_clipTableBase[i] = 0;
 
-            for (int i = 0; i < maxY; i++)
-                m_clipTable[i] = (pixel)i;
+        for (int i = 0; i < rangeExt; i++)
+            m_clipTableBase[i] = 0;
+
+        for (int i = 0; i < maxY; i++)
+            m_clipTable[i] = (pixel)i;
+
+        for (int i = maxY; i < maxY + rangeExt; i++)
+            m_clipTable[i] = maxY;
 
-            for (int i = maxY; i < maxY + rangeExt; i++)
-                m_clipTable[i] = maxY;
-        }
     }
     else
     {
@@ -204,8 +204,11 @@
 
     if (destoryCommon)
     {
-        X265_FREE_ZERO(m_countPreDblk);
-        X265_FREE_ZERO(m_offsetOrgPreDblk);
+        if (m_param->bSaoNonDeblocked)
+        {
+            X265_FREE_ZERO(m_countPreDblk);
+            X265_FREE_ZERO(m_offsetOrgPreDblk);
+        }
         X265_FREE_ZERO(m_depthSaoRate);
         X265_FREE_ZERO(m_clipTableBase);
     }
@@ -221,17 +224,10 @@
         saoParam->ctuParam[i] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
 }
 
-void SAO::startSlice(Frame* frame, Entropy& initState, int qp)
+void SAO::startSlice(Frame* frame, Entropy& initState)
 {
-    Slice* slice = frame->m_encData->m_slice;
-    int qpCb = qp;
-    if (m_param->internalCsp == X265_CSP_I420)
-        qpCb = x265_clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice->m_pps->chromaQpOffset[0]]);
-    else
-        qpCb = X265_MIN(qp + slice->m_pps->chromaQpOffset[0], QP_MAX_SPEC);
-    m_lumaLambda = x265_lambda2_tab[qp];
-    m_chromaLambda = x265_lambda2_tab[qpCb]; // Use Cb QP for SAO chroma
     m_frame = frame;
+    Slice* slice = m_frame->m_encData->m_slice;
 
     switch (slice->m_sliceType)
     {
@@ -259,7 +255,7 @@
     }
 
     saoParam->bSaoFlag[0] = true;
-    saoParam->bSaoFlag[1] = m_param->internalCsp != X265_CSP_I400;
+    saoParam->bSaoFlag[1] = m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
 
     m_numNoSao[0] = 0; // Luma
     m_numNoSao[1] = 0; // Chroma
@@ -275,9 +271,8 @@
 }
 
 // CTU-based SAO process without slice granularity
-void SAO::processSaoCu(int addr, int typeIdx, int plane)
+void SAO::applyPixelOffsets(int addr, int typeIdx, int plane)
 {
-    int x, y;
     PicYuv* reconPic = m_frame->m_reconPic;
     pixel* rec = reconPic->getPlaneAddr(plane, addr);
     intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
@@ -302,20 +297,13 @@
     ctuWidth  = rpelx - lpelx;
     ctuHeight = bpely - tpely;
 
-    int startX;
-    int startY;
-    int endX;
-    int endY;
-    pixel* tmpL;
-    pixel* tmpU;
-
     int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft1[2];
     int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
 
     memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */
 
-    tmpL = m_tmpL1[plane];
-    tmpU = &(m_tmpU[plane][lpelx]);
+    pixel* tmpL = m_tmpL1[plane];
+    pixel* tmpU = &(m_tmpU[plane][lpelx]);
 
     int8_t* offsetEo = m_offsetEo[plane];
 
@@ -324,14 +312,14 @@
     case SAO_EO_0: // dir: -
     {
         pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl = 0;
-        startX = !lpelx;
-        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
+        int startX = !lpelx;
+        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
         if (ctuWidth & 15)
         {
-            for (y = 0; y < ctuHeight; y++)
+            for (int y = 0; y < ctuHeight; y++, rec += stride)
             {
                 int signLeft = signOf(rec[startX] - tmpL[y]);
-                for (x = startX; x < endX; x++)
+                for (int x = startX; x < endX; x++)
                 {
                     int signRight = signOf(rec[x] - rec[x + 1]);
                     int edgeType = signRight + signLeft + 2;
@@ -339,13 +327,11 @@
 
                     rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
                 }
-
-                rec += stride;
             }
         }
         else
         {
-            for (y = 0; y < ctuHeight; y += 2)
+            for (int y = 0; y < ctuHeight; y += 2, rec += 2 * stride)
             {
                 signLeft1[0] = signOf(rec[startX] - tmpL[y]);
                 signLeft1[1] = signOf(rec[stride + startX] - tmpL[y + 1]);
@@ -375,27 +361,25 @@
                     rec[ctuWidth - 1] = lastPxl;
                     rec[stride + ctuWidth - 1] = row1LastPxl;
                 }
-
-                rec += 2 * stride;
             }
         }
         break;
     }
     case SAO_EO_1: // dir: |
     {
-        startY = !tpely;
-        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
+        int startY = !tpely;
+        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
         if (!tpely)
             rec += stride;
 
         if (ctuWidth & 15)
         {
-            for (x = 0; x < ctuWidth; x++)
+            for (int x = 0; x < ctuWidth; x++)
                 upBuff1[x] = signOf(rec[x] - tmpU[x]);
 
-            for (y = startY; y < endY; y++)
+            for (int y = startY; y < endY; y++, rec += stride)
             {
-                for (x = 0; x < ctuWidth; x++)
+                for (int x = 0; x < ctuWidth; x++)
                 {

x265_1.9.tar.gz/source/encoder/sao.h -> x265_2.0.tar.gz/source/encoder/sao.h Changed

@@ -33,13 +33,6 @@
 namespace X265_NS {
 // private namespace
 
-enum SAOTypeLen
-{
-    SAO_EO_LEN = 4,
-    SAO_BO_LEN = 4,
-    SAO_NUM_BO_CLASSES = 32
-};
-
 enum SAOType
 {
     SAO_EO_0 = 0,
@@ -56,12 +49,11 @@
 
     enum { SAO_MAX_DEPTH = 4 };
     enum { SAO_BO_BITS  = 5 };
-    enum { MAX_NUM_SAO_CLASS = 33 };
+    enum { MAX_NUM_SAO_CLASS = 32 };
     enum { SAO_BIT_INC = 0 }; /* in HM12.0, it wrote as X265_MAX(X265_DEPTH - 10, 0) */
     enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) };
     enum { NUM_EDGETYPE = 5 };
     enum { NUM_PLANE = 3 };
-    enum { NUM_MERGE_MODE = 3 };
     enum { SAO_DEPTHRATE_SIZE = 4 };
 
     static const uint32_t s_eoTable[NUM_EDGETYPE];
@@ -81,7 +73,7 @@
     PerPlane*   m_offsetOrgPreDblk;
 
     double*     m_depthSaoRate;
-    int8_t      m_offsetBo[NUM_PLANE][SAO_NUM_BO_CLASSES];
+    int8_t      m_offsetBo[NUM_PLANE][MAX_NUM_SAO_CLASS];
     int8_t      m_offsetEo[NUM_PLANE][NUM_EDGETYPE];
 
     int         m_chromaFormat;
@@ -114,10 +106,6 @@
     int         m_refDepth;
     int         m_numNoSao[2];
 
-    double      m_lumaLambda;
-    double      m_chromaLambda;
-    /* TODO: No doubles for distortion */
-
     SAO();
 
     bool create(x265_param* param, int initCommon);
@@ -126,31 +114,27 @@
 
     void allocSaoParam(SAOParam* saoParam) const;
 
-    void startSlice(Frame* pic, Entropy& initState, int qp);
+    void startSlice(Frame* pic, Entropy& initState);
     void resetStats();
-    void resetSaoUnit(SaoCtuParam* saoUnit);
 
     // CTU-based SAO process without slice granularity
-    void processSaoCu(int addr, int typeIdx, int plane);
+    void applyPixelOffsets(int addr, int typeIdx, int plane);
     void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
-    void processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX);
-    void processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX);
+    void generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX);
+    void generateChromaOffsets(SaoCtuParam* ctuParam[3], int idxY, int idxX);
 
-    void copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc);
-
-    void calcSaoStatsCu(int addr, int plane);
+    void calcSaoStatsCTU(int addr, int plane);
     void calcSaoStatsCu_BeforeDblk(Frame* pic, int idxX, int idxY);
 
-    void saoComponentParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[2], double* mergeDist);
-    void sao2ChromaParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[][2], double* mergeDist);
-
-    inline int estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg,
-                             int32_t* currentDistortionTableBo, double* currentRdCostTableBo);
-    inline int64_t estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo);
+    void saoLumaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost);
+    void saoChromaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost);
 
+    void estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses);
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
-    void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
     void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
+    int64_t calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda);
+
+    void saoStatsInitialOffset(int planes);
 
     friend class FrameFilter;
 };

x265_1.9.tar.gz/source/encoder/search.cpp -> x265_2.0.tar.gz/source/encoder/search.cpp Changed

@@ -73,14 +73,13 @@
 {
     uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
     m_param = &param;
-    m_bEnableRDOQ = !!param.rdoqLevel;
     m_bFrameParallel = param.frameNumThreads > 1;
     m_numLayers = g_log2Size[param.maxCUSize] - 2;
 
     m_rdCost.setPsyRdScale(param.psyRd);
-    m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
+    m_me.init(param.internalCsp);
 
-    bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder);
+    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
     if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
         ok &= m_quant.allocNoiseReduction(param);
 
@@ -223,9 +222,10 @@
 
     if (!(log2TrSize - m_hChromaShift < 2))
     {
-        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+        uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
+        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
             m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
-        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
             m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
     }
 
@@ -296,6 +296,7 @@
     uint32_t sizeIdx    = log2TrSize - 2;
     bool mightNotSplit  = log2TrSize <= depthRange[1];
     bool mightSplit     = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
+    bool bEnableRDOQ  = !!m_param->rdoqLevel;
 
     /* If maximum RD penalty, force spits at TU size 32x32 if SPS allows TUs of 16x16 */
     if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
@@ -336,7 +337,7 @@
         coeff_t* coeffY       = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
 
         // store original entropy coding status
-        if (m_bEnableRDOQ)
+        if (bEnableRDOQ)
             m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
 
         primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
@@ -434,8 +435,7 @@
 
             cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
         }
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-            cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
+        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
 
         if (mightNotSplit && log2TrSize != depthRange[0])
         {
@@ -487,6 +487,7 @@
     uint32_t fullDepth = cuGeom.depth + tuDepth;
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
     uint32_t tuSize = 1 << log2TrSize;
+    bool bEnableRDOQ = !!m_param->rdoqLevel;
 
     X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
 
@@ -525,7 +526,7 @@
     // store original entropy coding status
     m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
 
-    if (m_bEnableRDOQ)
+    if (bEnableRDOQ)
         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
 
     int checkTransformSkip = 1;
@@ -717,8 +718,7 @@
             residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
             cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
         }
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-            cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
+        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
     }
 }
 
@@ -782,6 +782,7 @@
 {
     CUData& cu = mode.cu;
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
+    bool bEnableRDOQ = !!m_param->rdoqLevel;
 
     if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
@@ -793,11 +794,9 @@
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-        {
-            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
-            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
-        }
+        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
+        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
+
         return;
     }
 
@@ -812,7 +811,7 @@
         tuDepthC--;
     }
 
-    if (m_bEnableRDOQ)
+    if (bEnableRDOQ)
         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
 
     bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
@@ -1091,11 +1090,8 @@
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-        {
-            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
-            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
-        }
+        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
+        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
 
         return;
     }
@@ -1629,8 +1625,7 @@
         for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
             combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
 
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-            cu.m_cbf[0][offs] |= combCbfY;
+        cu.m_cbf[0][0] |= combCbfY;
     }
 
     // TODO: remove this
@@ -1732,6 +1727,12 @@
         else
             cu.getAllowedChromaDir(absPartIdxC, modeList);
 
+        if (m_frame->m_fencPic->m_picCsp  == X265_CSP_I400 && m_csp != X265_CSP_I400)
+        {
+            for (uint32_t l = 1; l < NUM_CHROMA_MODE; l++)
+                modeList[l] = modeList[0];
+            maxMode = 1;
+        }
         // check chroma modes
         for (uint32_t mode = minMode; mode < maxMode; mode++)
         {
@@ -1816,11 +1817,8 @@
             combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
         }
 
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-        {
-            cu.m_cbf[1][offs] |= combCbfU;
-            cu.m_cbf[2][offs] |= combCbfV;
-        }
+        cu.m_cbf[1][0] |= combCbfU;
+        cu.m_cbf[2][0] |= combCbfV;
     }
 
     /* TODO: remove this */
@@ -1974,7 +1972,8 @@
         slave.m_frame = m_frame;
         slave.m_param = m_param;
         slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);
-        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height);
+        bool bChroma = slave.m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
+        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height, m_param->searchMethod, m_param->subpelRefine, bChroma);
     }
 
     /* Perform ME, repeat until no more work is available */
@@ -2015,9 +2014,12 @@
     int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
     MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
 
-    MV lmv = getLowresMV(interMode.cu, pu, list, ref);
-    if (lmv.notZero())
-        mvc[numMvc++] = lmv;
+    if (!m_param->analysisMode) /* Prevents load/save outputs from diverging if lowresMV is not available */
+    {
+        MV lmv = getLowresMV(interMode.cu, pu, list, ref);
+        if (lmv.notZero())
+            mvc[numMvc++] = lmv;
+    }
 
     setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
 
@@ -2074,7 +2076,7 @@
         MotionData* bestME = interMode.bestME[puIdx];
         PredictionUnit pu(cu, cuGeom, puIdx);
 
-        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height);
+        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);

x265_1.9.tar.gz/source/encoder/search.h -> x265_2.0.tar.gz/source/encoder/search.h Changed

x265_1.9.tar.gz/source/encoder/slicetype.cpp -> x265_2.0.tar.gz/source/encoder/slicetype.cpp Changed

@@ -83,7 +83,7 @@
     uint32_t var;
 
     var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
-    if (csp != X265_CSP_I400)
+    if (csp != X265_CSP_I400 && curFrame->m_fencPic->m_picCsp != X265_CSP_I400)
     {
         var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
         var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
@@ -456,10 +456,13 @@
     COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1);
 
     /* Use a smaller denominator if possible */
-    while (mindenom > 0 && !(minscale & 1))
+    if (mindenom > 0 && !(minscale & 1))
     {
-        mindenom--;
-        minscale >>= 1;
+        unsigned long idx;
+        CTZ(idx, minscale);
+        int shift = X265_MIN((int)idx, mindenom);
+        mindenom -= shift;
+        minscale >>= shift;
     }
 
     if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f)
@@ -2081,7 +2084,7 @@
     const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc->lumaStride;
 
     if (bBidir || bDoSearch[0] || bDoSearch[1])
-        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
+        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, 1);
 
     /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
     int lowresPenalty = 4;

x265_1.9.tar.gz/source/encoder/slicetype.h -> x265_2.0.tar.gz/source/encoder/slicetype.h Changed

x265_1.9.tar.gz/source/encoder/weightPrediction.cpp -> x265_2.0.tar.gz/source/encoder/weightPrediction.cpp Changed

@@ -31,6 +31,7 @@
 #include "slice.h"
 #include "mv.h"
 #include "bitstream.h"
+#include "threading.h"
 
 using namespace X265_NS;
 namespace {
@@ -132,25 +133,25 @@
                 intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
                 pixel *temp = src + pixoff + fpeloffset;
 
-                int xFrac = mv.x & 0x7;
-                int yFrac = mv.y & 0x7;
-                if ((yFrac | xFrac) == 0)
+                int xFrac = mv.x & 7;
+                int yFrac = mv.y & 7;
+                if (!(yFrac | xFrac))
                 {
                     primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, temp, stride);
                 }
-                else if (yFrac == 0)
+                else if (!yFrac)
                 {
                     primitives.chroma[csp].pu[LUMA_16x16].filter_hpp(temp, stride, mcout + pixoff, stride, xFrac);
                 }
-                else if (xFrac == 0)
+                else if (!xFrac)
                 {
                     primitives.chroma[csp].pu[LUMA_16x16].filter_vpp(temp, stride, mcout + pixoff, stride, yFrac);
                 }
                 else
                 {
-                    ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
-                    primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, imm, bw, xFrac, 1);
-                    primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
+                    ALIGN_VAR_16(int16_t, immed[16 * (16 + NTAPS_CHROMA - 1)]);
+                    primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, immed, bw, xFrac, 1);
+                    primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(immed + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
                 }
             }
             else
@@ -232,7 +233,7 @@
     cache.numPredDir = slice.isInterP() ? 1 : 2;
     cache.lowresWidthInCU = fenc.width >> 3;
     cache.lowresHeightInCU = fenc.lines >> 3;
-    cache.csp = fencPic->m_picCsp;
+    cache.csp = param.internalCsp;
     cache.hshift = CHROMA_H_SHIFT(cache.csp);
     cache.vshift = CHROMA_V_SHIFT(cache.csp);
 
@@ -329,7 +330,7 @@
                 {
                     /* reference chroma planes must be extended prior to being
                      * used as motion compensation sources */
-                    if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400)
+                    if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400 && frame.m_fencPic->m_picCsp != X265_CSP_I400)
                     {
                         refFrame->m_bChromaExtended = true;
                         PicYuv *refPic = refFrame->m_fencPic;
@@ -456,10 +457,13 @@
             /* Use a smaller luma denominator if possible */
             if (!(plane || list))
             {
-                while (mindenom > 0 && !(minscale & 1))
+                if (mindenom > 0 && !(minscale & 1))
                 {
-                    mindenom--;
-                    minscale >>= 1;
+                    unsigned long idx;
+                    CTZ(idx, minscale);
+                    int shift = X265_MIN((int)idx, mindenom);
+                    mindenom -= shift;
+                    minscale >>= shift;
                 }
             }

x265_1.9.tar.gz/source/input/y4m.cpp -> x265_2.0.tar.gz/source/input/y4m.cpp Changed

x265_1.9.tar.gz/source/input/yuv.cpp -> x265_2.0.tar.gz/source/input/yuv.cpp Changed

x265_1.9.tar.gz/source/output/raw.cpp -> x265_2.0.tar.gz/source/output/raw.cpp Changed

@@ -32,11 +32,11 @@
     b_fail = false;
     if (!strcmp(fname, "-"))
     {
-        ofs = &cout;
+        ofs = stdout;
         return;
     }
-    ofs = new ofstream(fname, ios::binary | ios::out);
-    if (ofs->fail())
+    ofs = x265_fopen(fname, "wb");
+    if (!ofs || ferror(ofs))
         b_fail = true;
 }
 
@@ -51,7 +51,7 @@
 
     for (uint32_t i = 0; i < nalcount; i++)
     {
-        ofs->write((const char*)nal->payload, nal->sizeBytes);
+        fwrite((const void*)nal->payload, 1, nal->sizeBytes, ofs);
         bytes += nal->sizeBytes;
         nal++;
     }
@@ -65,7 +65,7 @@
 
     for (uint32_t i = 0; i < nalcount; i++)
     {
-        ofs->write((const char*)nal->payload, nal->sizeBytes);
+        fwrite((const void*)nal->payload, 1, nal->sizeBytes, ofs);
         bytes += nal->sizeBytes;
         nal++;
     }
@@ -75,6 +75,6 @@
 
 void RAWOutput::closeFile(int64_t, int64_t)
 {
-    if (ofs != &cout)
-        delete ofs;
+    if (ofs != stdout)
+        fclose(ofs);
 }

x265_1.9.tar.gz/source/output/raw.h -> x265_2.0.tar.gz/source/output/raw.h Changed

x265_1.9.tar.gz/source/test/CMakeLists.txt -> x265_2.0.tar.gz/source/test/CMakeLists.txt Changed

@@ -1,4 +1,12 @@
 # vim: syntax=cmake
+
+check_symbol_exists(__rdtsc "intrin.h" HAVE_RDTSC)
+if(HAVE_RDTSC)
+    add_definitions(-DHAVE_RDTSC=1)
+endif()
+
+# add X86 assembly files
+if(X86)
 enable_language(ASM_YASM)
 
 if(MSVC_IDE)
@@ -11,11 +19,23 @@
 else()
     set(YASM_SRC checkasm-a.asm)
 endif()
+endif(X86)
 
-check_symbol_exists(__rdtsc "intrin.h" HAVE_RDTSC)
-if(HAVE_RDTSC)
-    add_definitions(-DHAVE_RDTSC=1)
-endif()
+# add ARM assembly files
+if(ARM OR CROSS_COMPILE_ARM)
+    enable_language(ASM)
+    set(YASM_SRC checkasm-arm.S)
+    add_custom_command(
+        OUTPUT checkasm-arm.obj
+        COMMAND ${CMAKE_CXX_COMPILER}
+        ARGS ${YASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
+        DEPENDS checkasm-arm.S)
+endif(ARM OR CROSS_COMPILE_ARM)
+
+# add PowerPC assembly files
+if(POWER)
+    set(YASM_SRC)
+endif(POWER)
 
 add_executable(TestBench ${YASM_SRC}
     testbench.cpp testharness.h
@@ -23,6 +43,7 @@
     mbdstharness.cpp mbdstharness.h
     ipfilterharness.cpp ipfilterharness.h
     intrapredharness.cpp intrapredharness.h)
+
 target_link_libraries(TestBench x265-static ${PLATFORM_LIBS})
 if(LINKER_OPTIONS)
     if(EXTRA_LIB)

x265_2.0.tar.gz/source/test/checkasm-arm.S Added

@@ -0,0 +1,133 @@
+/****************************************************************************
+ * checkasm-arm.S: assembly check tool
+ *****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Martin Storsjo <martin@martin.st>
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "../common/arm/asm.S"
+
+.section .rodata
+.align 4
+register_init:
+.quad 0x21f86d66c8ca00ce
+.quad 0x75b6ba21077c48ad
+.quad 0xed56bb2dcb3c7736
+.quad 0x8bda43d3fd1a7e06
+.quad 0xb64a9c9e5d318408
+.quad 0xdf9a54b303f1d3a3
+.quad 0x4a75479abd64e097
+.quad 0x249214109d5d1c88
+
+error_message:
+.asciz "failed to preserve register"
+
+.text
+
+@ max number of args used by any x265 asm function.
+#define MAX_ARGS 15
+
+#define ARG_STACK 4*(MAX_ARGS - 2)
+
+@ clobbercheck variant: emits the function x265_checkasm_call_<variant>.
+@ Register use as visible in the body below:
+@   r0        address of the function under test (called through r12)
+@   r1        pointer that receives 0 when a callee-saved register changed
+@   r2, r3    forwarded to the callee as its r0, r1
+@   stack     first two caller stack words become the callee r2, r3; the
+@             following MAX_ARGS-2 words are copied to the callee stack
+@ The callee-saved registers (r4-r11, plus q4-q7 for the neon variant) are
+@ filled from register_init before the call and compared against the same
+@ table afterwards. The callee r0/r1 result is returned unchanged.
+@ NOTE(review): function, endfunc and movrel are not defined in this file;
+@ presumably they come from ../common/arm/asm.S - confirm.
+.macro clobbercheck variant
+@ pushed = bytes placed on the stack before the argument area is reserved:
+@ r4-r11 and lr (9 words) plus the saved r1 (1 word).
+.equ pushed, 4*10
+function x265_checkasm_call_\variant
+    push        {r4-r11, lr}
+.ifc \variant, neon
+    vpush       {q4-q7}
+@ q4-q7 add another 64 bytes to the saved area.
+.equ pushed, pushed + 16*4
+.endif
+
+    @ Load the known bit patterns into every register the callee must preserve.
+    movrel      r12, register_init
+.ifc \variant, neon
+    vldm        r12, {q4-q7}
+.endif
+    ldm         r12, {r4-r11}
+
+    @ Keep the result pointer (incoming r1) across the call.
+    push        {r1}
+
+    @ Reserve the outgoing argument area and copy the caller stack words into
+    @ it, skipping the first 8 bytes (those two words are loaded into r2/r3).
+    sub         sp,  sp,  #ARG_STACK
+.equ pos, 0
+.rept MAX_ARGS-2
+    ldr         r12, [sp, #ARG_STACK + pushed + 8 + pos]
+    str         r12, [sp, #pos]
+.equ pos, pos + 4
+.endr
+
+    @ Shift the arguments down by two slots and call the function under test.
+    mov         r12, r0
+    mov         r0,  r2
+    mov         r1,  r3
+    ldrd        r2,  r3,  [sp, #ARG_STACK + pushed]
+    blx         r12
+    add         sp,  sp,  #ARG_STACK
+    @ r2 now holds the result pointer saved before the call.
+    pop         {r2}
+
+    @ Preserve the callee return value while r0/r1 are used as scratch.
+    push        {r0, r1}
+    movrel      r12, register_init
+.ifc \variant, neon
+    @ XOR q4-q7 with the expected patterns and OR everything together; the
+    @ folded 32-bit value in r3 is non-zero if any bit differs.
+    vldm        r12, {q0-q3}
+    veor        q0,  q0,  q4
+    veor        q1,  q1,  q5
+    veor        q2,  q2,  q6
+    veor        q3,  q3,  q7
+    vorr        q0,  q0,  q1
+    vorr        q0,  q0,  q2
+    vorr        q0,  q0,  q3
+    vorr        d0,  d0,  d1
+    vrev64.32   d1,  d0
+    vorr        d0,  d0,  d1
+    vmov.32     r3,  d0[0]
+.else
+    mov         r3,  #0
+.endif
+
+@ check_reg: read the next two expected words (r12 is post-incremented by 8),
+@ XOR them with the given register pair and accumulate any difference in r3.
+.macro check_reg reg1, reg2
+    ldrd        r0,  r1,  [r12], #8
+    eor         r0,  r0, \reg1
+    eor         r1,  r1, \reg2
+    orr         r3,  r3, r0
+    orr         r3,  r3, r1
+.endm
+    check_reg   r4,  r5
+    check_reg   r6,  r7
+    check_reg   r8,  r9
+    check_reg   r10, r11
+.purgem check_reg
+
+    @ r3 is zero only if every checked register still holds its pattern.
+    cmp         r3,  #0
+    beq         0f
+
+    @ Mismatch: store 0 through the result pointer and print the message.
+    mov         r12, #0
+    str         r12, [r2]
+    movrel      r0, error_message
+    bl          puts
+0:
+    @ Restore the callee return value and the registers saved on entry.
+    pop         {r0, r1}
+.ifc \variant, neon
+    vpop        {q4-q7}
+.endif
+    pop         {r4-r11, pc}
+endfunc
+.endm
+
+clobbercheck neon
+clobbercheck noneon

x265_1.9.tar.gz/source/test/pixelharness.cpp -> x265_2.0.tar.gz/source/test/pixelharness.cpp Changed

@@ -43,6 +43,7 @@
         ushort_test_buff[0][i]  = rand() % ((1 << 16) - 1);
         uchar_test_buff[0][i]   = rand() % ((1 << 8) - 1);
         residual_test_buff[0][i] = (rand() % (2 * RMAX + 1)) - RMAX - 1;// For sse_ss only
+        double_test_buff[0][i]  = (double)(short_test_buff[0][i]) / 256.0;
 
         pixel_test_buff[1][i]   = PIXEL_MIN;
         short_test_buff[1][i]   = SMIN;
@@ -52,6 +53,7 @@
         ushort_test_buff[1][i]  = PIXEL_MIN;
         uchar_test_buff[1][i]   = PIXEL_MIN;
         residual_test_buff[1][i] = RMIN;
+        double_test_buff[1][i]  = (double)(short_test_buff[1][i]) / 256.0;
 
         pixel_test_buff[2][i]   = PIXEL_MAX;
         short_test_buff[2][i]   = SMAX;
@@ -61,6 +63,7 @@
         ushort_test_buff[2][i]  = ((1 << 16) - 1);
         uchar_test_buff[2][i]   = 255;
         residual_test_buff[2][i] = RMAX;
+        double_test_buff[2][i] = (double)(short_test_buff[2][i]) / 256.0;
 
         pbuf1[i] = rand() & PIXEL_MAX;
         pbuf2[i] = rand() & PIXEL_MAX;
@@ -858,9 +861,8 @@
         int width = (rand() % 4) + 1; // range[1-4]
         float cres = ref(sum0, sum1, width);
         float vres = checked_float(opt, sum0, sum1, width);
-        if (fabs(vres - cres) > 0.0001)
+        if (fabs(vres - cres) > 0.001)
             return false;
-
         reportfail();
     }
 
@@ -1398,6 +1400,60 @@
     return true;
 }
 
+/* Compare an optimized cutree_fix8_pack primitive against the reference.
+ * Both are called with the same slice of double_test_buff and the same count,
+ * each writing to its own 64x64 uint16_t buffer; the two buffers are then
+ * compared in full. Returns false on the first mismatch, true if all ITERS
+ * iterations agree. */
+bool PixelHarness::check_cutree_fix8_pack(cutree_fix8_pack ref, cutree_fix8_pack opt)
+{
+    ALIGN_VAR_32(uint16_t, ref_dest[64 * 64]);
+    ALIGN_VAR_32(uint16_t, opt_dest[64 * 64]);
+
+    // Give both destinations the same 0xCD byte fill so that bytes neither
+    // implementation writes still compare equal. The fill happens once, so
+    // results from earlier iterations remain in the buffers.
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    // Source offset, advanced by INCR after every iteration.
+    int j = 0;
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        // NOTE(review): count is presumably the number of elements to convert;
+        // it grows by one each iteration - confirm against the primitive.
+        int count = 256 + i;
+        // Cycle through the prepared test buffers.
+        int index = i % TEST_CASES;
+        // NOTE(review): checked() is defined outside this view; presumably it
+        // routes the call through the register-clobber check - confirm.
+        checked(opt, opt_dest, double_test_buff[index] + j, count);
+        ref(ref_dest, double_test_buff[index] + j, count);
+
+        // Byte-wise comparison of the complete destination buffers.
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(uint16_t)))
+            return false;
+
+        // NOTE(review): reportfail() is a macro defined outside this view.
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
+/* Compare an optimized cutree_fix8_unpack primitive against the reference.
+ * Both are called with the same slice of ushort_test_buff and the same count,
+ * each writing to its own 64x64 double buffer; the two buffers are then
+ * compared in full. Returns false on the first mismatch, true if all ITERS
+ * iterations agree. */
+bool PixelHarness::check_cutree_fix8_unpack(cutree_fix8_unpack ref, cutree_fix8_unpack opt)
+{
+    ALIGN_VAR_32(double, ref_dest[64 * 64]);
+    ALIGN_VAR_32(double, opt_dest[64 * 64]);
+
+    // Give both destinations the same 0xCD byte fill so that bytes neither
+    // implementation writes still compare equal. The fill happens once, so
+    // results from earlier iterations remain in the buffers.
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    // Source offset, advanced by INCR after every iteration.
+    int j = 0;
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        // NOTE(review): count is presumably the number of elements to convert;
+        // it grows by one each iteration - confirm against the primitive.
+        int count = 256 + i;
+        // Cycle through the prepared test buffers.
+        int index = i % TEST_CASES;
+        // NOTE(review): checked() is defined outside this view; presumably it
+        // routes the call through the register-clobber check - confirm.
+        checked(opt, opt_dest, ushort_test_buff[index] + j, count);
+        ref(ref_dest, ushort_test_buff[index] + j, count);
+
+        // The doubles are compared by their bytes, so the optimized output
+        // must match the reference exactly rather than within a tolerance.
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(double)))
+            return false;
+
+        // NOTE(review): reportfail() is a macro defined outside this view.
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt)
 {
     int j = 0, index1, index2, optres, refres;
@@ -1819,34 +1875,6 @@
     return true;
 }
 
-bool PixelHarness::check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt)
-{
-    for (int i = 0; i < ITERS; i++)
-    {
-        intptr_t rand_stride = rand() % STRIDE;
-        int rand_width = (rand() % (STRIDE * 2)) + 1;
-        const int rand_height = (rand() % MAX_HEIGHT) + 1;
-        const pixel rand_min = rand() % 32;
-        const pixel rand_max = PIXEL_MAX - (rand() % 32);
-        uint64_t ref_sum, opt_sum;
-
-        // video width must be more than or equal to 32
-        if (rand_width < 32)
-            rand_width = 32;
-
-        // stride must be more than or equal to width
-        if (rand_stride < rand_width)
-            rand_stride = rand_width;
-
-        pixel ref_max = ref(pbuf1, rand_stride, rand_width, rand_height, &ref_sum, rand_min, rand_max);
-        pixel opt_max = (pixel)checked(opt, pbuf1, rand_stride, rand_width, rand_height, &opt_sum, rand_min, rand_max);
-
-        if (ref_max != opt_max)
-            return false;
-    }
-    return true;
-}
-
 bool PixelHarness::check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt)
 {
     intptr_t srcStep = 1, offset = 64;
@@ -1913,6 +1941,68 @@
     return true;
 }
 
+/* Compare an optimized pelFilterChroma primitive against the reference using
+ * srcStep = 1 and offset = 64. The reference runs on pixel_test_buff and the
+ * optimized version on a private copy taken at entry; the complete buffers are
+ * compared after every call. Returns false on the first mismatch, true if all
+ * ITERS iterations agree. */
+bool PixelHarness::check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t opt)
+{
+    intptr_t srcStep = 1, offset = 64;
+    int32_t maskP, maskQ, tc;
+    // Start offset into the buffers, advanced by INCR after every iteration.
+    int j = 0;
+
+    // Private copy for the optimized primitive so both sides start from the
+    // same pixels.
+    pixel pixel_test_buff1[TEST_CASES][BUFFSIZE];
+    for (int i = 0; i < TEST_CASES; i++)
+        memcpy(pixel_test_buff1[i], pixel_test_buff[i], sizeof(pixel)* BUFFSIZE);
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        // tc lies in [0, PIXEL_MAX - 1]; the masks lie in [-1, PIXEL_MAX - 2].
+        tc = rand() % PIXEL_MAX;
+        maskP = (rand() % PIXEL_MAX) - 1;
+        maskQ = (rand() % PIXEL_MAX) - 1;
+
+        // Pick one of the first three test buffers at random.
+        int index = rand() % 3;
+
+        // The pointer is placed 4 * offset elements into the buffer.
+        // NOTE(review): ref is handed the shared pixel_test_buff directly, so
+        // any pixels it modifies stay modified after this check - confirm
+        // that this is intended.
+        ref(pixel_test_buff[index] + 4 * offset + j, srcStep, offset, tc, maskP, maskQ);
+        checked(opt, pixel_test_buff1[index] + 4 * offset + j, srcStep, offset, tc, maskP, maskQ);
+
+        // Compare the whole buffer, not only the filtered region.
+        if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel)* BUFFSIZE))
+            return false;
+
+        // NOTE(review): reportfail() is a macro defined outside this view; it
+        // is invoked here without a trailing semicolon.
+        reportfail()
+        j += INCR;
+    }
+
+    return true;
+}
+
+/* Compare an optimized pelFilterChroma primitive against the reference using
+ * srcStep = 64 and offset = 1. The reference runs on pixel_test_buff and the
+ * optimized version on a private copy taken at entry; the complete buffers are
+ * compared after every call. Returns false on the first mismatch, true if all
+ * ITERS iterations agree. */
+bool PixelHarness::check_pelFilterChroma_V(pelFilterChroma_t ref, pelFilterChroma_t opt)
+{
+    intptr_t srcStep = 64, offset = 1;
+    int32_t maskP, maskQ, tc;
+    // Start offset into the buffers, advanced by INCR after every iteration.
+    int j = 0;
+
+    // Private copy for the optimized primitive so both sides start from the
+    // same pixels.
+    pixel pixel_test_buff1[TEST_CASES][BUFFSIZE];
+    for (int i = 0; i < TEST_CASES; i++)
+        memcpy(pixel_test_buff1[i], pixel_test_buff[i], sizeof(pixel)* BUFFSIZE);
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        // tc lies in [0, PIXEL_MAX - 1]; the masks lie in [-1, PIXEL_MAX - 2].
+        tc = rand() % PIXEL_MAX;
+        maskP = (rand() % PIXEL_MAX) - 1;
+        maskQ = (rand() % PIXEL_MAX) - 1;
+
+        // Pick one of the first three test buffers at random.
+        int index = rand() % 3;
+
+        // The pointer is placed 4 elements into the buffer.
+        // NOTE(review): ref is handed the shared pixel_test_buff directly, so
+        // any pixels it modifies stay modified after this check - confirm
+        // that this is intended.
+        ref(pixel_test_buff[index] + 4 + j, srcStep, offset, tc, maskP, maskQ);
+        checked(opt, pixel_test_buff1[index] + 4 + j, srcStep, offset, tc, maskP, maskQ);
+
+        // Compare the whole buffer, not only the filtered region.
+        if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel)* BUFFSIZE))
+            return false;
+
+        // NOTE(review): reportfail() is a macro defined outside this view; it
+        // is invoked here without a trailing semicolon.
+        reportfail()
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     if (opt.pu[part].satd)

x265_1.9.tar.gz/source/test/pixelharness.h -> x265_2.0.tar.gz/source/test/pixelharness.h Changed

@@ -113,6 +113,8 @@
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
     bool check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt);
+    bool check_cutree_fix8_pack(cutree_fix8_pack ref, cutree_fix8_pack opt);
+    bool check_cutree_fix8_unpack(cutree_fix8_unpack ref, cutree_fix8_unpack opt);
     bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
     bool check_calSign(sign_t ref, sign_t opt);
     bool check_scanPosLast(scanPosLast_t ref, scanPosLast_t opt);
@@ -120,9 +122,10 @@
     bool check_costCoeffNxN(costCoeffNxN_t ref, costCoeffNxN_t opt);
     bool check_costCoeffRemain(costCoeffRemain_t ref, costCoeffRemain_t opt);
     bool check_costC1C2Flag(costC1C2Flag_t ref, costC1C2Flag_t opt);
-    bool check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt);
     bool check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
     bool check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
+    bool check_pelFilterChroma_V(pelFilterChroma_t ref, pelFilterChroma_t opt);
+    bool check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t opt);
 
 public:

x265_1.9.tar.gz/source/test/rate-control-tests.txt -> x265_2.0.tar.gz/source/test/rate-control-tests.txt Changed

@@ -25,6 +25,11 @@
 
 
 # multi-pass rate control tests
+sita_1920x1080_30.yuv, --preset ultrafast --crf 20 --no-cutree --no-scenecut --keyint 50 --no-open-gop --pass 1 --vbv-bufsize 7000 --vbv-maxrate 5000, --preset ultrafast --crf 20 --no-cutree --no-scenecut --keyint 50 --no-open-gop --pass 2 --vbv-bufsize 7000 --vbv-maxrate 5000
+sita_1920x1080_30.yuv, --preset medium --crf 20 --no-cutree --no-scenecut --keyint 50 --no-open-gop --pass 1 --vbv-bufsize 7000 --vbv-maxrate 5000, --preset medium --crf 20 --no-cutree --no-scenecut --keyint 50 --no-open-gop --pass 2 --vbv-bufsize 7000 --vbv-maxrate 5000
+sintel_trailer_2k_480p24.y4m, --preset medium --crf 18 --no-cutree --no-scenecut --no-open-gop --keyint 50 --vbv-bufsize 1200 --vbv-maxrate 1000 --pass 1, --preset medium --crf 18 --no-cutree --no-scenecut --no-open-gop --keyint 50 --vbv-bufsize 1200 --vbv-maxrate 1000 --pass 2
+sintel_trailer_2k_480p24.y4m, --preset veryslow --crf 18 --no-cutree --no-scenecut --no-open-gop --keyint 50 --vbv-bufsize 1200 --vbv-maxrate 1000 --pass 1, --preset veryslow --crf 18 --no-cutree --no-scenecut --no-open-gop --keyint 50 --vbv-bufsize 1200 --vbv-maxrate 1000 --pass 2
+ten_teaser_3840x2160_50_10bit.yuv, --preset medium --crf 25 --no-cutree --no-open-gop --no-scenecut --keyint 50 --vbv-maxrate 10000 --vbv-bufsize 12000 --pass 1, --preset medium --crf 25 --no-cutree --no-open-gop --no-scenecut --keyint 50 --vbv-maxrate 10000 --vbv-bufsize 12000 --pass 2
 big_buck_bunny_360p24.y4m,--preset slow --crf 40 --pass 1 -f 5000,--preset slow --bitrate 200 --pass 2 -f 5000
 big_buck_bunny_360p24.y4m,--preset medium --bitrate 700 --pass 1 -F4 --slow-firstpass -f 5000 ,--preset medium --bitrate 700 --vbv-bufsize 900 --vbv-maxrate 700 --pass 2 -F4 -f 5000
 112_1920x1080_25.yuv,--preset fast --bitrate 1000 --vbv-maxrate 1000 --vbv-bufsize 1000 --strict-cbr --pass 1 -F4,--preset fast --bitrate 1000 --vbv-maxrate 3000 --vbv-bufsize 3000 --pass 2 -F4

x265_1.9.tar.gz/source/test/regression-tests.txt -> x265_2.0.tar.gz/source/test/regression-tests.txt Changed

x265_1.9.tar.gz/source/test/testbench.cpp -> x265_2.0.tar.gz/source/test/testbench.cpp Changed

@@ -169,6 +169,9 @@
         { "XOP", X265_CPU_XOP },
         { "AVX2", X265_CPU_AVX2 },
         { "BMI2", X265_CPU_AVX2 | X265_CPU_BMI1 | X265_CPU_BMI2 },
+        { "ARMv6", X265_CPU_ARMV6 },
+        { "NEON", X265_CPU_NEON },
+        { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
         { "", 0 },
     };
 
@@ -182,6 +185,7 @@
         else
             continue;
 
+#if X265_ARCH_X86
         EncoderPrimitives vecprim;
         memset(&vecprim, 0, sizeof(vecprim));
         setupInstrinsicPrimitives(vecprim, test_arch[i].flag);
@@ -197,6 +201,7 @@
                 return -1;
             }
         }
+#endif
 
         EncoderPrimitives asmprim;
         memset(&asmprim, 0, sizeof(asmprim));
@@ -220,7 +225,9 @@
 
     EncoderPrimitives optprim;
     memset(&optprim, 0, sizeof(optprim));
+#if X265_ARCH_X86
     setupInstrinsicPrimitives(optprim, cpuid);
+#endif
     setupAssemblyPrimitives(optprim, cpuid);
 
     /* Note that we do not setup aliases for performance tests, that would be

x265_1.9.tar.gz/source/test/testharness.h -> x265_2.0.tar.gz/source/test/testharness.h Changed

@@ -32,7 +32,6 @@
 #pragma warning(disable: 4324) // structure was padded due to __declspec(align())
 #endif
 
-#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
 #define PIXEL_MIN 0
 #define SHORT_MAX  32767
 #define SHORT_MIN -32767
@@ -75,10 +74,17 @@
 {
     uint32_t a = 0;
 
+#if X265_ARCH_X86
     asm volatile("rdtsc" : "=a" (a) ::"edx");
+#elif X265_ARCH_ARM
+    // TO-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
+    // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
+
+    // TO-DO: replace clock() function with appropriate ARM cpu instructions
+    a = clock();
+#endif
     return a;
 }
-
 #endif // ifdef _MSC_VER
 
 #define BENCH_RUNS 1000
@@ -125,7 +131,7 @@
  * needs an explicit asm check because it only sometimes crashes in normal use. */
 intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
 float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
-#else
+#elif X265_ARCH_ARM == 0
 #define PFX(stack_pagealign)(func, align) func()
 #endif

x265_1.9.tar.gz/source/x265-extras.cpp -> x265_2.0.tar.gz/source/x265-extras.cpp Changed

@@ -46,17 +46,17 @@
         return NULL;
     }
 
-    FILE *csvfp = fopen(fname, "r");
+    FILE *csvfp = x265_fopen(fname, "r");
     if (csvfp)
     {
         /* file already exists, re-open for append */
         fclose(csvfp);
-        return fopen(fname, "ab");
+        return x265_fopen(fname, "ab");
     }
     else
     {
         /* new CSV file, write header */
-        csvfp = fopen(fname, "wb");
+        csvfp = x265_fopen(fname, "wb");
         if (csvfp)
         {
             if (level)
@@ -280,9 +280,9 @@
     fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, api.version_str);
 }
 
-/* The dithering algorithm is based on Sierra-2-4A error diffusion. */
-static void ditherPlane(pixel *dst, int dstStride, uint16_t *src, int srcStride,
-                        int width, int height, int16_t *errors, int bitDepth)
+/* The dithering algorithm is based on Sierra-2-4A error diffusion.
+ * We convert planes in place (without allocating a new buffer). */
+static void ditherPlane(uint16_t *src, int srcStride, int width, int height, int16_t *errors, int bitDepth)
 {
     const int lShift = 16 - bitDepth;
     const int rShift = 16 - bitDepth + 2;
@@ -290,15 +290,34 @@
     const int pixelMax = (1 << bitDepth) - 1;
 
     memset(errors, 0, (width + 1) * sizeof(int16_t));
-    int pitch = 1;
-    for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
+
+    if (bitDepth == 8)
     {
-        int16_t err = 0;
-        for (int x = 0; x < width; x++)
+        for (int y = 0; y < height; y++, src += srcStride)
         {
-            err = err * 2 + errors[x] + errors[x + 1];
-            dst[x * pitch] = (pixel)x265_clip3(0, pixelMax, ((src[x * 1] << 2) + err + half) >> rShift);
-            errors[x] = err = src[x * pitch] - (dst[x * pitch] << lShift);
+            uint8_t* dst = (uint8_t *)src;
+            int16_t err = 0;
+            for (int x = 0; x < width; x++)
+            {
+                err = err * 2 + errors[x] + errors[x + 1];
+                int tmpDst = x265_clip3(0, pixelMax, ((src[x] << 2) + err + half) >> rShift);
+                errors[x] = err = (int16_t)(src[x] - (tmpDst << lShift));
+                dst[x] = (uint8_t)tmpDst;
+            }
+        }
+    }
+    else
+    {
+        for (int y = 0; y < height; y++, src += srcStride)
+        {
+            int16_t err = 0;
+            for (int x = 0; x < width; x++)
+            {
+                err = err * 2 + errors[x] + errors[x + 1];
+                int tmpDst = x265_clip3(0, pixelMax, ((src[x] << 2) + err + half) >> rShift);
+                errors[x] = err = (int16_t)(src[x] - (tmpDst << lShift));
+                src[x] = (uint16_t)tmpDst;
+            }
         }
     }
 }
@@ -317,10 +336,16 @@
         return;
     }
 
+    if (picIn.bitDepth == bitDepth)
+    {
+        fprintf(stderr, "extras[error]: dither support enabled only if encoder depth is different from picture depth\n");
+        return;
+    }
+
     /* This portion of code is from readFrame in x264. */
     for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++)
     {
-        if ((picIn.bitDepth & 7) && (picIn.bitDepth != 16))
+        if (picIn.bitDepth < 16)
         {
             /* upconvert non 16bit high depth planes to 16bit */
             uint16_t *plane = (uint16_t*)picIn.planes[i];
@@ -332,14 +357,10 @@
             for (uint32_t j = 0; j < pixelCount; j++)
                 plane[j] = plane[j] << lShift;
         }
-    }
 
-    for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++)
-    {
         int height = (int)(picHeight >> x265_cli_csps[picIn.colorSpace].height[i]);
         int width = (int)(picWidth >> x265_cli_csps[picIn.colorSpace].width[i]);
 
-        ditherPlane(((pixel*)picIn.planes[i]), picIn.stride[i] / sizeof(pixel), ((uint16_t*)picIn.planes[i]),
-                    picIn.stride[i] / 2, width, height, errorBuf, bitDepth);
+        ditherPlane(((uint16_t*)picIn.planes[i]), picIn.stride[i] / 2, width, height, errorBuf, bitDepth);
     }
 }

x265_1.9.tar.gz/source/x265.cpp -> x265_2.0.tar.gz/source/x265.cpp Changed

@@ -29,14 +29,10 @@
 #include "x265-extras.h"
 #include "x265cli.h"
 
-#include "common.h"
 #include "input/input.h"
 #include "output/output.h"
 #include "output/reconplay.h"
 
-#include "param.h"
-#include "cpu.h"
-
 #if HAVE_VLD
 /* Visual Leak Detector */
 #include <vld.h>
@@ -312,12 +308,9 @@
             OPT("recon-y4m-exec") reconPlayCmd = optarg;
             OPT("qpfile")
             {
-                this->qpfile = fopen(optarg, "rb");
+                this->qpfile = x265_fopen(optarg, "rb");
                 if (!this->qpfile)
-                {
-                    x265_log(param, X265_LOG_ERROR, "%s qpfile not found or error in opening qp file\n", optarg);
-                    return false;
-                }
+                    x265_log_file(param, X265_LOG_ERROR, "%s qpfile not found or error in opening qp file\n", optarg);
             }
             else
                 bError |= !!api->param_parse(param, long_options[long_options_index].name, optarg);
@@ -378,7 +371,7 @@
     this->input = InputFile::open(info, this->bForceY4m);
     if (!this->input || this->input->isFail())
     {
-        x265_log(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn);
+        x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn);
         return true;
     }
 
@@ -455,10 +448,10 @@
     this->output = OutputFile::open(outputfn, info);
     if (this->output->isFail())
     {
-        x265_log(param, X265_LOG_ERROR, "failed to open output file <%s> for writing\n", outputfn);
+        x265_log_file(param, X265_LOG_ERROR, "failed to open output file <%s> for writing\n", outputfn);
         return true;
     }
-    general_log(param, this->output->getName(), X265_LOG_INFO, "output file: %s\n", outputfn);
+    general_log_file(param, this->output->getName(), X265_LOG_INFO, "output file: %s\n", outputfn);
     return false;
 }
 
@@ -497,6 +490,39 @@
     return 1;
 }
 
+#ifdef _WIN32
+/* Copy of x264 code, which allows for Unicode characters in the command line.
+ * Retrieve command line arguments as UTF-8.
+ *
+ * On success returns 1 and stores the argument count in *argc_ptr and a
+ * NULL-terminated argument vector in *argv_ptr. The pointer table and all
+ * strings live in one malloc() block, so the caller releases everything with
+ * a single free() of *argv_ptr.
+ *
+ * On failure returns 0 and leaves *argc_ptr and *argv_ptr untouched, so a
+ * caller that ignores the return value keeps its original arguments instead
+ * of ending up with a NULL argv. */
+static int get_argv_utf8(int *argc_ptr, char ***argv_ptr)
+{
+    int ret = 0;
+    /* Parse into a local count; the caller's value is only replaced once the
+     * whole conversion has succeeded. */
+    int argc = 0;
+    wchar_t **argv_utf16 = CommandLineToArgvW(GetCommandLineW(), &argc);
+    if (argv_utf16)
+    {
+        /* The block starts with argc + 1 pointers (including the terminating
+         * NULL), followed by the UTF-8 strings themselves. */
+        int offset = (argc + 1) * sizeof(char*);
+        int size = offset;
+
+        /* First pass: with a NULL output buffer WideCharToMultiByte returns
+         * the number of bytes each converted argument needs. */
+        for (int i = 0; i < argc; i++)
+            size += WideCharToMultiByte(CP_UTF8, 0, argv_utf16[i], -1, NULL, 0, NULL, NULL);
+
+        char **argv = (char**)malloc(size);
+        if (argv)
+        {
+            /* Second pass: convert each argument into the string area and
+             * point its table entry at it. */
+            for (int i = 0; i < argc; i++)
+            {
+                argv[i] = (char*)argv + offset;
+                offset += WideCharToMultiByte(CP_UTF8, 0, argv_utf16[i], -1, argv[i], size - offset, NULL, NULL);
+            }
+            argv[argc] = NULL;
+
+            /* Publish the results only now that nothing can fail anymore. */
+            *argc_ptr = argc;
+            *argv_ptr = argv;
+            ret = 1;
+        }
+        LocalFree(argv_utf16);
+    }
+    return ret;
+}
+#endif
+
 /* CLI return codes:
  *
  * 0 - encode successful
@@ -517,6 +543,10 @@
 
     GetConsoleTitle(orgConsoleTitle, CONSOLE_TITLE_SIZE);
     SetThreadExecutionState(ES_CONTINUOUS | ES_SYSTEM_REQUIRED | ES_AWAYMODE_REQUIRED);
+#if _WIN32
+    char** orgArgv = argv;
+    get_argv_utf8(&argc, &argv);
+#endif
 
     ReconPlay* reconPlay = NULL;
     CLIOptions cliopt;
@@ -560,7 +590,7 @@
         cliopt.csvfpt = x265_csvlog_open(*api, *param, cliopt.csvfn, cliopt.csvLogLevel);
         if (!cliopt.csvfpt)
         {
-            x265_log(param, X265_LOG_ERROR, "Unable to open CSV log file <%s>, aborting\n", cliopt.csvfn);
+            x265_log_file(param, X265_LOG_ERROR, "Unable to open CSV log file <%s>, aborting\n", cliopt.csvfn);
             cliopt.destroy();
             if (cliopt.api)
                 cliopt.api->param_free(cliopt.param);
@@ -747,6 +777,14 @@
     SetConsoleTitle(orgConsoleTitle);
     SetThreadExecutionState(ES_CONTINUOUS);
 
+#if _WIN32
+    if (argv != orgArgv)
+    {
+        free(argv);
+        argv = orgArgv;
+    }
+#endif
+
 #if HAVE_VLD
     assert(VLDReportLeaks() == 0);
 #endif

x265_1.9.tar.gz/source/x265.h -> x265_2.0.tar.gz/source/x265.h Changed

@@ -98,9 +98,9 @@
     uint32_t         sliceType;
     uint32_t         numCUsInFrame;
     uint32_t         numPartitions;
+    int              bScenecut;
     void*            interData;
     void*            intraData;
-    int              bScenecut;
 } x265_analysis_data;
 
 /* cu statistics */
@@ -221,6 +221,14 @@
     /* Frame level statistics */
     x265_frame_stats frameData;
 
+    /* Ratecontrol statistics for collecting the ratecontrol information.
+     * It is not used for collecting the last pass ratecontrol data in 
+     * multi pass ratecontrol mode. */
+    void*  rcData;
+
+    uint64_t framesize;
+
+    int    height;
 } x265_picture;
 
 typedef enum
@@ -587,6 +595,11 @@
      * Main (0) and High (1) tier. Default is Main tier (0) */
     int       bHighTier;
 
+    /* Enable UHD Blu-ray compatibility support. If specified, the encoder will
+     * attempt to modify/set the encode specifications. If the encoder is unable 
+     * to do so, this option will be turned OFF. */
+    int       uhdBluray;
+
     /* The maximum number of L0 references a P or B slice may use. This
      * influences the size of the decoded picture buffer. The higher this
      * number, the more reference frames there will be available for motion
@@ -764,7 +777,7 @@
      * enabled). At level 2 rate-distortion cost is used to make decimate decisions
      * on each 4x4 coding group (including the cost of signaling the group within
      * the group bitmap).  Psy-rdoq is less effective at preserving energy when
-     * RDOQ is at level 2 */
+     * RDOQ is at level 2. Default: 0 */
     int       rdoqLevel;
 
     /* Enable the implicit signaling of the sign bit of the last coefficient of
@@ -896,23 +909,27 @@
     /* Note: when deblocking and SAO are both enabled, the loop filter CU lag is
      * only one row, as they operate in series on the same row. */
 
-    /* Select the method in which SAO deals with deblocking boundary pixels.  If
+    /* Select the method in which SAO deals with deblocking boundary pixels. If
      * disabled the right and bottom boundary areas are skipped. If enabled,
      * non-deblocked pixels are used entirely. Default is disabled */
     int       bSaoNonDeblocked;
 
     /*== Analysis tools ==*/
 
-    /* A value between X265_NO_RDO_NO_RDOQ and X265_RDO_LEVEL which determines
-     * the level of rate distortion optimizations to perform during mode
-     * decisions and quantization. The more RDO the better the compression
-     * efficiency at a major cost of performance. Default is no RDO (0) */
+    /* A value between 1 and 6 (both inclusive) which determines the level of 
+     * rate distortion optimizations to perform during mode and depth decisions.
+     * The more RDO the better the compression efficiency at a major cost of 
+     * performance. Default is 3 */
     int       rdLevel;
 
-    /* Enable early skip decisions to avoid intra and inter analysis in likely
+    /* Enable early skip decisions to avoid analysing additional modes in likely
      * skip blocks. Default is disabled */
     int       bEnableEarlySkip;
 
+    /* Enable early CU size decisions to avoid recursing to higher depths. 
+     * Default is enabled */
+    int bEnableRecursionSkip;
+
     /* Use a faster search method to find the best intra mode. Default is 0 */
     int       bEnableFastIntra;
 
@@ -947,10 +964,16 @@
     double    psyRd;
 
     /* Strength of psycho-visual optimizations in quantization. Only has an
-     * effect in presets which use RDOQ (rd-levels 4 and 5).  The value must be
-     * between 0 and 50, 1.0 is typical. Default 1.0 */
+     * effect when RDOQ is enabled (presets slow, slower and veryslow). The 
+     * value must be between 0 and 50, 1.0 is typical. Default 0 */
     double    psyRdoq;
 
+    /* Perform quantisation parameter based RD refinement. RD cost is calculated
+     * on the best CU partitions, chosen after the CU analysis, for a range of QPs
+     * to find the optimal rounding effect. Only effective at rd-levels 5 and 6.
+     * Default disabled */
+    int       bEnableRdRefine;
+
     /* If X265_ANALYSIS_SAVE, write per-frame analysis information into analysis
      * buffers.  if X265_ANALYSIS_LOAD, read analysis information into analysis
      * buffer and use this analysis information to reduce the amount of work
@@ -1083,6 +1106,9 @@
          * (QG) size. Allowed values are 64, 32, 16 provided it falls within the
         * inclusive range [maxCUSize, minCUSize]. Experimental, default: maxCUSize */
         uint32_t qgSize;
+
+        /* internally enable if tune grain is set */
+        int      bEnableGrain;
     } rc;
 
     /*== Video Usability Information ==*/

x265_1.9.tar.gz/source/x265cli.h -> x265_2.0.tar.gz/source/x265cli.h Changed

@@ -53,6 +53,7 @@
     { "profile",        required_argument, NULL, 'P' },
     { "level-idc",      required_argument, NULL, 0 },
     { "high-tier",            no_argument, NULL, 0 },
+    { "uhd-bd",               no_argument, NULL, 0 },
     { "no-high-tier",         no_argument, NULL, 0 },
     { "allow-non-conformance",no_argument, NULL, 0 },
     { "no-allow-non-conformance",no_argument, NULL, 0 },
@@ -96,6 +97,8 @@
     { "amp",                  no_argument, NULL, 0 },
     { "no-early-skip",        no_argument, NULL, 0 },
     { "early-skip",           no_argument, NULL, 0 },
+    { "no-rskip",             no_argument, NULL, 0 },
+    { "rskip",                no_argument, NULL, 0 },
     { "no-fast-cbf",          no_argument, NULL, 0 },
     { "fast-cbf",             no_argument, NULL, 0 },
     { "no-tskip",             no_argument, NULL, 0 },
@@ -143,6 +146,8 @@
     { "qp",             required_argument, NULL, 'q' },
     { "aq-mode",        required_argument, NULL, 0 },
     { "aq-strength",    required_argument, NULL, 0 },
+    { "rc-grain",             no_argument, NULL, 0 },
+    { "no-rc-grain",          no_argument, NULL, 0 },
     { "ipratio",        required_argument, NULL, 0 },
     { "pbratio",        required_argument, NULL, 0 },
     { "qcomp",          required_argument, NULL, 0 },
@@ -159,6 +164,8 @@
     { "psy-rdoq",       required_argument, NULL, 0 },
     { "no-psy-rd",            no_argument, NULL, 0 },
     { "no-psy-rdoq",          no_argument, NULL, 0 },
+    { "rd-refine",            no_argument, NULL, 0 },
+    { "no-rd-refine",         no_argument, NULL, 0 },
     { "scaling-list",   required_argument, NULL, 0 },
     { "lossless",             no_argument, NULL, 0 },
     { "no-lossless",          no_argument, NULL, 0 },
@@ -279,6 +286,7 @@
     H0("-P/--profile <string>            Enforce an encode profile: main, main10, mainstillpicture\n");
     H0("   --level-idc <integer|float>   Force a minimum required decoder level (as '5.0' or '50')\n");
     H0("   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level\n");
+    H0("   --uhd-bd                      Enable UHD Bluray compatibility support\n");
     H0("   --[no-]allow-non-conformance  Allow the encoder to generate profile NONE bitstreams. Default %s\n", OPT(param->bAllowNonConformance));
     H0("\nThreading, performance:\n");
     H0("   --pools <integer,...>         Comma separated thread count per thread pool (pool per NUMA node)\n");
@@ -300,11 +308,13 @@
     H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
     H0("\nAnalysis:\n");
-    H0("   --rd <0..6>                   Level of RDO in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
+    H0("   --rd <1..6>                   Level of RDO in mode decision 1:least....6:full RDO. Default %d\n", param->rdLevel);
     H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
     H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
     H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
+    H0("   --[no-]rd-refine              Enable QP based RD refinement for rd levels 5 and 6. Default %s\n", OPT(param->bEnableRdRefine));
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
+    H0("   --[no-]rskip                  Enable early exit from recursion. Default %s\n", OPT(param->bEnableRecursionSkip));
     H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
     H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
     H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
@@ -373,6 +383,7 @@
     H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
     H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16). Default %d\n", param->rc.qgSize);
     H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
+    H0("   --[no-]rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
     H1("   --ipratio <float>             QP factor between I and P. Default %.2f\n", param->rc.ipFactor);
     H1("   --pbratio <float>             QP factor between P and B. Default %.2f\n", param->rc.pbFactor);
     H1("   --qcomp <float>               Weight given to predicted complexity. Default %.2f\n", param->rc.qCompress);