Packman Build Service PMBS

We truncated the diff of some files because they were too big. If you want to see the full diff for every file, click here.

Changes of Revision 19

x265.changes Changed

@@ -1,4 +1,34 @@
 -------------------------------------------------------------------
+Sun Jan  1 20:32:07 UTC 2017 - idonmez@suse.com
+
+-  Update to version 2.2
+   Encode enhancements
+   * Enhancements to TU selection algorithm with early-outs for
+     improved speed; use --limit-tu to exercise.
+   * New motion search method SEA (Successive Elimination Algorithm)
+     supported now as option --me 4
+   * Bit-stream optimizations to improve fields in PPS and SPS for
+     bit-rate savings through --[no-]opt-qp-pps, 
+     --[no-]opt-ref-list-length-pps, and --[no-]multi-pass-opt-rps.
+   * Enabled using VBV constraints when encoding without WPP.
+   * All param options dumped in SEI packet in bitstream when info
+     selected.
+   API changes
+   * Options to disable SEI and optional-VUI messages from bitstream
+     made more descriptive.
+   * New option --scenecut-bias to enable controlling bias to mark
+     scene-cuts via cli.
+   * Support mono and mono16 color spaces for y4m input.
+   * --min-cu-size of 64 no-longer supported for reasons of
+     visual quality.
+   * API for CSV now expects version string for better integration
+     of x265 into other applications.
+   Bug fixes
+   * Several fixes to slice-based encoding.
+   * --log2-max-poc-lsb's range limited according to HEVC spec.
+   * Restrict MVs to within legal boundaries when encoding.
+
+-------------------------------------------------------------------
 Thu Dec 22 12:59:47 UTC 2016 - scarabeus@opensuse.org
 
 - Add conditional for the numa-devel again; it was not meant to be dropped

​x
 
@@ -1,4 +1,34 @@
 -------------------------------------------------------------------
+Sun Jan  1 20:32:07 UTC 2017 - idonmez@suse.com
+
+-  Update to version 2.2
+   Encode enhancements
+   * Enhancements to TU selection algorithm with early-outs for
+     improved speed; use --limit-tu to exercise.
+   * New motion search method SEA (Successive Elimination Algorithm)
+     supported now as option --me 4
+   * Bit-stream optimizations to improve fields in PPS and SPS for
+     bit-rate savings through --[no-]opt-qp-pps, 
+     --[no-]opt-ref-list-length-pps, and --[no-]multi-pass-opt-rps.
+   * Enabled using VBV constraints when encoding without WPP.
+   * All param options dumped in SEI packet in bitstream when info
+     selected.
+   API changes
+   * Options to disable SEI and optional-VUI messages from bitstream
+     made more descriptive.
+   * New option --scenecut-bias to enable controlling bias to mark
+     scene-cuts via cli.
+   * Support mono and mono16 color spaces for y4m input.
+   * --min-cu-size of 64 no-longer supported for reasons of
+     visual quality.
+   * API for CSV now expects version string for better integration
+     of x265 into other applications.
+   Bug fixes
+   * Several fixes to slice-based encoding.
+   * --log2-max-poc-lsb's range limited according to HEVC spec.
+   * Restrict MVs to within legal boundaries when encoding.
+
+-------------------------------------------------------------------
 Thu Dec 22 12:59:47 UTC 2016 - scarabeus@opensuse.org
 
 - Add conditional for the numa-devel again; it was not meant to be dropped
​

x265.spec Changed

 
@@ -1,10 +1,10 @@
 # based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
 
 Name:           x265
-%define soname  95
+%define soname  102
 %define libname lib%{name}
 %define libsoname %{libname}-%{soname}
-Version:        2.1
+Version:        2.2
 Release:        0
 License:        GPL-2.0+
 Summary:        A free h265/HEVC encoder - encoder binary
​

arm.patch Changed

@@ -1,11 +1,11 @@
-Index: x265_2.1/source/CMakeLists.txt
+Index: x265_2.2/source/CMakeLists.txt
 ===================================================================
---- x265_2.1.orig/source/CMakeLists.txt
-+++ x265_2.1/source/CMakeLists.txt
-@@ -60,15 +60,22 @@ elseif(POWERMATCH GREATER "-1")
-     message(STATUS "Detected POWER target processor")
-     set(POWER 1)
-     add_definitions(-DX265_ARCH_POWER=1)
+--- x265_2.2.orig/source/CMakeLists.txt
++++ x265_2.2/source/CMakeLists.txt
+@@ -65,15 +65,22 @@ elseif(POWERMATCH GREATER "-1")
+         add_definitions(-DPPC64=1)
+         message(STATUS "Detected POWER PPC64 target processor")
+     endif()
 -elseif(ARMMATCH GREATER "-1")
 -    if(CROSS_COMPILE_ARM)
 -        message(STATUS "Cross compiling for ARM arch")
@@ -34,7 +34,7 @@
  else()
      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
-@@ -190,18 +197,9 @@ if(GCC)
+@@ -208,18 +215,9 @@ if(GCC)
              endif()
          endif()
      endif()
@@ -55,10 +55,10 @@
      if(FPROFILE_GENERATE)
          if(INTEL_CXX)
              add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
-Index: x265_2.1/source/common/cpu.cpp
+Index: x265_2.2/source/common/cpu.cpp
 ===================================================================
---- x265_2.1.orig/source/common/cpu.cpp
-+++ x265_2.1/source/common/cpu.cpp
+--- x265_2.2.orig/source/common/cpu.cpp
++++ x265_2.2/source/common/cpu.cpp
 @@ -37,7 +37,7 @@
  #include <machine/cpu.h>
  #endif
@@ -68,7 +68,7 @@
  #include <signal.h>
  #include <setjmp.h>
  static sigjmp_buf jmpbuf;
-@@ -340,7 +340,6 @@ uint32_t cpu_detect(void)
+@@ -344,7 +344,6 @@ uint32_t cpu_detect(void)
      }
  
      canjump = 1;
@@ -76,7 +76,7 @@
      canjump = 0;
      signal(SIGILL, oldsig);
  #endif // if !HAVE_NEON
-@@ -356,7 +355,7 @@ uint32_t cpu_detect(void)
+@@ -360,7 +359,7 @@ uint32_t cpu_detect(void)
      // which may result in incorrect detection and the counters stuck enabled.
      // right now Apple does not seem to support performance counters for this test
  #ifndef __MACH__

 
@@ -1,11 +1,11 @@
-Index: x265_2.1/source/CMakeLists.txt
+Index: x265_2.2/source/CMakeLists.txt
 ===================================================================
---- x265_2.1.orig/source/CMakeLists.txt
-+++ x265_2.1/source/CMakeLists.txt
-@@ -60,15 +60,22 @@ elseif(POWERMATCH GREATER "-1")
-     message(STATUS "Detected POWER target processor")
-     set(POWER 1)
-     add_definitions(-DX265_ARCH_POWER=1)
+--- x265_2.2.orig/source/CMakeLists.txt
++++ x265_2.2/source/CMakeLists.txt
+@@ -65,15 +65,22 @@ elseif(POWERMATCH GREATER "-1")
+         add_definitions(-DPPC64=1)
+         message(STATUS "Detected POWER PPC64 target processor")
+     endif()
 -elseif(ARMMATCH GREATER "-1")
 -    if(CROSS_COMPILE_ARM)
 -        message(STATUS "Cross compiling for ARM arch")
@@ -34,7 +34,7 @@
  else()
      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
-@@ -190,18 +197,9 @@ if(GCC)
+@@ -208,18 +215,9 @@ if(GCC)
              endif()
          endif()
      endif()
@@ -55,10 +55,10 @@
      if(FPROFILE_GENERATE)
          if(INTEL_CXX)
              add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
-Index: x265_2.1/source/common/cpu.cpp
+Index: x265_2.2/source/common/cpu.cpp
 ===================================================================
---- x265_2.1.orig/source/common/cpu.cpp
-+++ x265_2.1/source/common/cpu.cpp
+--- x265_2.2.orig/source/common/cpu.cpp
++++ x265_2.2/source/common/cpu.cpp
 @@ -37,7 +37,7 @@
  #include <machine/cpu.h>
  #endif
@@ -68,7 +68,7 @@
  #include <signal.h>
  #include <setjmp.h>
  static sigjmp_buf jmpbuf;
-@@ -340,7 +340,6 @@ uint32_t cpu_detect(void)
+@@ -344,7 +344,6 @@ uint32_t cpu_detect(void)
      }
  
      canjump = 1;
@@ -76,7 +76,7 @@
      canjump = 0;
      signal(SIGILL, oldsig);
  #endif // if !HAVE_NEON
-@@ -356,7 +355,7 @@ uint32_t cpu_detect(void)
+@@ -360,7 +359,7 @@ uint32_t cpu_detect(void)
      // which may result in incorrect detection and the counters stuck enabled.
      // right now Apple does not seem to support performance counters for this test
  #ifndef __MACH__
​

baselibs.conf Changed

 
@@ -1,1 +1,1 @@
-libx265-95
+libx265-102
​

x265_2.1.tar.gz/.hg_archival.txt -> x265_2.2.tar.gz/.hg_archival.txt Changed

 
@@ -1,6 +1,4 @@
 repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
-node: 3e8ce3b26319dbd53ab6369e4c4e986bf30f1315
+node: be14a7e9755e54f0fd34911c72bdfa66981220bc
 branch: stable
-latesttag: 2.1
-latesttagdistance: 1
-changessincelatesttag: 1
+tag: 2.2
​

x265_2.1.tar.gz/doc/reST/cli.rst -> x265_2.2.tar.gz/doc/reST/cli.rst Changed

@@ -662,7 +662,7 @@
 	and less frame parallelism as well. Because of this the faster
 	presets use a CU size of 32. Default: 64
 
-.. option:: --min-cu-size <64|32|16|8>
+.. option:: --min-cu-size <32|16|8>
 
 	Minimum CU size (width and height). By using 16 or 32 the encoder
 	will not analyze the cost of CUs below that minimum threshold,
@@ -869,6 +869,24 @@
 	partitions, in which case a TU split is implied and thus the
 	residual quad-tree begins one layer below the CU quad-tree.
 
+.. option:: --limit-tu <0..4>
+
+	Enables early exit from TU depth recursion, for inter coded blocks.
+	Level 1 - decides to recurse to next higher depth based on cost 
+	comparison of full size TU and split TU.
+	
+	Level 2 - based on first split subTU's depth, limits recursion of
+	other split subTUs.
+	
+	Level 3 - based on the average depth of the co-located and the neighbor
+	CUs' TU depth, limits recursion of the current CU.
+	
+	Level 4 - uses the depth of the neighbouring/ co-located CUs TU depth 
+	to limit the 1st subTU depth. The 1st subTU depth is taken as the 
+	limiting depth for the other subTUs.
+
+	Default: 0
+
 .. option:: --nr-intra <integer>, --nr-inter <integer>
 
 	Noise reduction - an adaptive deadzone applied after DCT
@@ -949,13 +967,17 @@
 	encoder: a star-pattern search followed by an optional radix scan
 	followed by an optional star-search refinement. Full is an
 	exhaustive search; an order of magnitude slower than all other
-	searches but not much better than umh or star.
+	searches but not much better than umh or star. SEA is similar to
+	FULL search; a three step motion search adopted from x264: DC 
+	calculation followed by ADS calculation followed by SAD of the
+	passed motion vector candidates, hence faster than Full search. 
 
 	0. dia
 	1. hex **(default)**
 	2. umh
 	3. star
-	4. full
+	4. sea
+	5. full
 
 .. option:: --subme, -m <0..7>
 
@@ -1153,6 +1175,13 @@
 	:option:`--scenecut` 0 or :option:`--no-scenecut` disables adaptive
 	I frame placement. Default 40
 
+.. option:: --scenecut-bias <0..100.0>
+
+	This value represents the percentage difference between the inter cost and
+	intra cost of a frame used in scenecut detection. For example, a value of 5 indicates,
+	if the inter cost of a frame is greater than or equal to 95 percent of the intra cost of the frame,
+	then detect this frame as scenecut. Values between 5 and 15 are recommended. Default 5.	
+	
 .. option:: --intra-refresh
 
 	Enables Periodic Intra Refresh(PIR) instead of keyframe insertion.
@@ -1304,7 +1333,7 @@
 	slices using param->rc.ipFactor and param->rc.pbFactor unless QP 0
 	is specified, in which case QP 0 is used for all slice types.  Note
 	that QP 0 does not cause lossless encoding, it only disables
-	quantization. Default disabled (CRF)
+	quantization. Default disabled.
 
 	**Range of values:** an integer from 0 to 51
 
@@ -1824,7 +1853,7 @@
 	enhancement layer. A decoder may choose to drop the enhancement layer 
 	and only decode and display the base layer slices.
 	
-	If used with a fixed GOP (:option:`b-adapt` 0) and :option:`bframes`
+	If used with a fixed GOP (:option:`--b-adapt` 0) and :option:`--bframes`
 	3 then the two layers evenly split the frame rate, with a cadence of
 	PbBbP. You probably also want :option:`--no-scenecut` and a keyframe
 	interval that is a multiple of 4.
@@ -1833,15 +1862,29 @@
 
   Maximum of the picture order count. Default 8
 
-.. option:: --discard-sei
+.. option:: --[no-]vui-timing-info
 
-  Discard SEI messages generated from the final bitstream. HDR-related SEI
-  messages are always dumped, immaterial of this option. Default disabled.
-	
-.. option:: --discard-vui
+	Emit VUI timing info in bitstream. Default enabled.
+
+.. option:: --[no-]vui-hrd-info
+
+	Emit VUI HRD info in  bitstream. Default enabled when
+	:option:`--hrd` is enabled.
+
+.. option:: --[no-]opt-qp-pps
+
+	Optimize QP in PPS (instead of default value of 26) based on the QP values
+	observed in last GOP. Default enabled.
+
+.. option:: --[no-]opt-ref-list-length-pps
+
+	Optimize L0 and L1 ref list length in PPS (instead of default value of 0)
+	based on the lengths observed in the last GOP. Default enabled.
+
+.. option:: --[no-]multi-pass-opt-rps
+
+	Enable storing commonly used RPS in SPS in multi pass mode. Default disabled.
 
-	Discard optional VUI information (timing, HRD info) from the
-	bitstream. Default disabled.
 
 Debugging options
 =================

 
@@ -662,7 +662,7 @@
    and less frame parallelism as well. Because of this the faster
    presets use a CU size of 32. Default: 64
 
-.. option:: --min-cu-size <64|32|16|8>
+.. option:: --min-cu-size <32|16|8>
 
    Minimum CU size (width and height). By using 16 or 32 the encoder
    will not analyze the cost of CUs below that minimum threshold,
@@ -869,6 +869,24 @@
    partitions, in which case a TU split is implied and thus the
    residual quad-tree begins one layer below the CU quad-tree.
 
+.. option:: --limit-tu <0..4>
+
+   Enables early exit from TU depth recursion, for inter coded blocks.
+   Level 1 - decides to recurse to next higher depth based on cost 
+   comparison of full size TU and split TU.
+   
+   Level 2 - based on first split subTU's depth, limits recursion of
+   other split subTUs.
+   
+   Level 3 - based on the average depth of the co-located and the neighbor
+   CUs' TU depth, limits recursion of the current CU.
+   
+   Level 4 - uses the depth of the neighbouring/ co-located CUs TU depth 
+   to limit the 1st subTU depth. The 1st subTU depth is taken as the 
+   limiting depth for the other subTUs.
+
+   Default: 0
+
 .. option:: --nr-intra <integer>, --nr-inter <integer>
 
    Noise reduction - an adaptive deadzone applied after DCT
@@ -949,13 +967,17 @@
    encoder: a star-pattern search followed by an optional radix scan
    followed by an optional star-search refinement. Full is an
    exhaustive search; an order of magnitude slower than all other
-   searches but not much better than umh or star.
+   searches but not much better than umh or star. SEA is similar to
+   FULL search; a three step motion search adopted from x264: DC 
+   calculation followed by ADS calculation followed by SAD of the
+   passed motion vector candidates, hence faster than Full search. 
 
    0. dia
    1. hex **(default)**
    2. umh
    3. star
-   4. full
+   4. sea
+   5. full
 
 .. option:: --subme, -m <0..7>
 
@@ -1153,6 +1175,13 @@
    :option:`--scenecut` 0 or :option:`--no-scenecut` disables adaptive
    I frame placement. Default 40
 
+.. option:: --scenecut-bias <0..100.0>
+
+   This value represents the percentage difference between the inter cost and
+   intra cost of a frame used in scenecut detection. For example, a value of 5 indicates,
+   if the inter cost of a frame is greater than or equal to 95 percent of the intra cost of the frame,
+   then detect this frame as scenecut. Values between 5 and 15 are recommended. Default 5. 
+   
 .. option:: --intra-refresh
 
    Enables Periodic Intra Refresh(PIR) instead of keyframe insertion.
@@ -1304,7 +1333,7 @@
    slices using param->rc.ipFactor and param->rc.pbFactor unless QP 0
    is specified, in which case QP 0 is used for all slice types.  Note
    that QP 0 does not cause lossless encoding, it only disables
-   quantization. Default disabled (CRF)
+   quantization. Default disabled.
 
    **Range of values:** an integer from 0 to 51
 
@@ -1824,7 +1853,7 @@
    enhancement layer. A decoder may choose to drop the enhancement layer 
    and only decode and display the base layer slices.
    
-   If used with a fixed GOP (:option:`b-adapt` 0) and :option:`bframes`
+   If used with a fixed GOP (:option:`--b-adapt` 0) and :option:`--bframes`
    3 then the two layers evenly split the frame rate, with a cadence of
    PbBbP. You probably also want :option:`--no-scenecut` and a keyframe
    interval that is a multiple of 4.
@@ -1833,15 +1862,29 @@
 
   Maximum of the picture order count. Default 8
 
-.. option:: --discard-sei
+.. option:: --[no-]vui-timing-info
 
-  Discard SEI messages generated from the final bitstream. HDR-related SEI
-  messages are always dumped, immaterial of this option. Default disabled.
-   
-.. option:: --discard-vui
+   Emit VUI timing info in bitstream. Default enabled.
+
+.. option:: --[no-]vui-hrd-info
+
+   Emit VUI HRD info in  bitstream. Default enabled when
+   :option:`--hrd` is enabled.
+
+.. option:: --[no-]opt-qp-pps
+
+   Optimize QP in PPS (instead of default value of 26) based on the QP values
+   observed in last GOP. Default enabled.
+
+.. option:: --[no-]opt-ref-list-length-pps
+
+   Optimize L0 and L1 ref list length in PPS (instead of default value of 0)
+   based on the lengths observed in the last GOP. Default enabled.
+
+.. option:: --[no-]multi-pass-opt-rps
+
+   Enable storing commonly used RPS in SPS in multi pass mode. Default disabled.
 
-   Discard optional VUI information (timing, HRD info) from the
-   bitstream. Default disabled.
 
 Debugging options
 =================
​

x265_2.1.tar.gz/doc/reST/index.rst -> x265_2.2.tar.gz/doc/reST/index.rst Changed

 
@@ -9,3 +9,4 @@
    threading
    presets
    lossless
+   releasenotes
​

x265_2.2.tar.gz/doc/reST/releasenotes.rst Added

@@ -0,0 +1,141 @@
+*************
+Release Notes
+*************
+
+Version 2.2
+===========
+
+Release date - 26th December, 2016.
+
+Encoder enhancements
+--------------------
+1. Enhancements to TU selection algorithm with early-outs for improved speed; use :option:`--limit-tu` to exercise.
+2. New motion search method SEA (Successive Elimination Algorithm) supported now as :option:`--me` 4
+3. Bit-stream optimizations to improve fields in PPS and SPS for bit-rate savings through :option:`--[no-]opt-qp-pps`, :option:`--[no-]opt-ref-list-length-pps`, and :option:`--[no-]multi-pass-opt-rps`.
+4. Enabled using VBV constraints when encoding without WPP.
+5. All param options dumped in SEI packet in bitstream when info selected.
+6. x265 now supports POWERPC-based systems. Several key functions also have optimized ALTIVEC kernels.
+
+API changes
+-----------
+1. Options to disable SEI and optional-VUI messages from bitstream made more descriptive.
+2. New option :option:`--scenecut-bias` to enable controlling bias to mark scene-cuts via cli.
+3. Support mono and mono16 color spaces for y4m input.
+4. :option:`--min-cu-size` of 64 no-longer supported for reasons of visual quality (was crashing earlier anyways.)
+5. API for CSV now expects version string for better integration of x265 into other applications.
+
+Bug fixes
+---------
+1. Several fixes to slice-based encoding.
+2. :option:`--log2-max-poc-lsb`'s range limited according to HEVC spec.
+3. Restrict MVs to within legal boundaries when encoding.
+
+Version 2.1
+===========
+
+Release date - 27th September, 2016
+
+Encoder enhancements
+--------------------
+1. Support for qg-size of 8
+2. Support for inserting non-IDR I-frames at scenecuts and when running with settings for fixed-GOP (min-keyint = max-keyint)
+3. Experimental support for slice-parallelism.
+
+API changes
+-----------
+1. Encode user-define SEI messages passed in through x265_picture object.
+2. Disable SEI and VUI messages from the bitstream
+3. Specify qpmin and qpmax
+4. Control number of bits to encode POC.
+
+Bug fixes
+---------
+1. QP fluctuation fix for first B-frame in mini-GOP for 2-pass encoding with tune-grain.
+2. Assembly fix for crashes in 32-bit from dct_sse4.
+3. Threadpool creation fix in windows platform.
+
+Version 2.0
+===========
+
+Release date - 13th July, 2016
+
+New Features
+------------
+
+1. uhd-bd: Enable Ultra-HD Bluray support
+2. rskip: Enables skipping recursion to analyze lower CU sizes using heuristics at different rd-levels. Provides good visual quality gains at the highest quality presets. 
+3. rc-grain: Enables a new ratecontrol mode specifically for grainy content. Strictly prevents QP oscillations within and between frames to avoid grain fluctuations.
+4. tune grain: A fully refactored and improved option to encode film grain content including QP control as well as analysis options.
+5. asm: ARM assembly is now enabled by default, native or cross compiled builds supported on armv6 and later systems.
+
+API and Key Behaviour Changes
+-----------------------------
+
+1. x265_rc_stats added to x265_picture, containing all RC decision points for that frame
+2. PTL: high tier is now allowed by default, chosen only if necessary
+3. multi-pass: First pass now uses slow-firstpass by default, enabling better RC decisions in future passes 
+4. pools: fix behaviour on multi-socketed Windows systems, provide more flexibility in determining thread and pool counts
+5. ABR: improve bits allocation in the first few frames, abr reset, vbv and cutree improved
+
+Misc
+----
+1. An SSIM calculation bug was corrected
+
+Version 1.9
+===========
+
+Release date - 29th January, 2016
+
+New Features
+------------
+
+1. Quant offsets: This feature allows block level quantization offsets to be specified for every frame. An API-only feature.
+2. --intra-refresh: Keyframes can be replaced by a moving column of intra blocks in non-keyframes.
+3. --limit-modes: Intelligently restricts mode analysis. 
+4. --max-luma and --min-luma for luma clipping, optional for HDR use-cases
+5. Emergency denoising is now enabled by default in very low bitrate, VBV encodes
+
+API Changes
+-----------
+
+1. x265_frame_stats returns many additional fields: maxCLL, maxFALL, residual energy, scenecut  and latency logging
+2. --qpfile now supports frametype 'K'
+3. x265 now allows CRF ratecontrol in pass N (N greater than or equal to 2)
+4. Chroma subsampling format YUV 4:0:0 is now fully supported and tested
+
+Presets and Performance
+-----------------------
+
+1. Recently added features lookahead-slices, limit-modes, limit-refs have been enabled by default for applicable presets.
+2. The default psy-rd strength has been increased to 2.0
+3. Multi-socket machines now use a single pool of threads that can work cross-socket.
+
+Version 1.8
+===========
+
+Release date - 10th August, 2015
+
+API Changes
+-----------
+1. Experimental support for Main12 is now enabled. Partial assembly support exists. 
+2. Main12 and Intra/Still picture profiles are now supported. Still picture profile is detected based on x265_param::totalFrames.
+3. Three classes of encoding statistics are now available through the API. 
+a) x265_stats - contains encoding statistics, available through x265_encoder_get_stats()
+b) x265_frame_stats and x265_cu_stats - contains frame encoding statistics, available through recon x265_picture
+4. --csv
+a) x265_encoder_log() is now deprecated
+b) x265_param::csvfn is also deprecated
+5. --log-level now controls only console logging, frame level console logging has been removed.
+6. Support added for new color transfer characteristic ARIB STD-B67
+
+New Features
+------------
+1. limit-refs: This feature limits the references analysed for individual CUs. Provides a nice tradeoff between efficiency and performance.
+2. aq-mode 3: A new aq-mode that provides additional biasing for low-light conditions.
+3. An improved scene cut detection logic that allows ratecontrol to manage visual quality at fade-ins and fade-outs better.
+
+Preset and Tune Options
+-----------------------
+
+1. tune grain: Increases psyRdoq strength to 10.0, and rdoq-level to 2.
+2. qg-size: Default value changed to 32.

 
@@ -0,0 +1,141 @@
+*************
+Release Notes
+*************
+
+Version 2.2
+===========
+
+Release date - 26th December, 2016.
+
+Encoder enhancements
+--------------------
+1. Enhancements to TU selection algorithm with early-outs for improved speed; use :option:`--limit-tu` to exercise.
+2. New motion search method SEA (Successive Elimination Algorithm) supported now as :option:`--me` 4
+3. Bit-stream optimizations to improve fields in PPS and SPS for bit-rate savings through :option:`--[no-]opt-qp-pps`, :option:`--[no-]opt-ref-list-length-pps`, and :option:`--[no-]multi-pass-opt-rps`.
+4. Enabled using VBV constraints when encoding without WPP.
+5. All param options dumped in SEI packet in bitstream when info selected.
+6. x265 now supports POWERPC-based systems. Several key functions also have optimized ALTIVEC kernels.
+
+API changes
+-----------
+1. Options to disable SEI and optional-VUI messages from bitstream made more descriptive.
+2. New option :option:`--scenecut-bias` to enable controlling bias to mark scene-cuts via cli.
+3. Support mono and mono16 color spaces for y4m input.
+4. :option:`--min-cu-size` of 64 no-longer supported for reasons of visual quality (was crashing earlier anyways.)
+5. API for CSV now expects version string for better integration of x265 into other applications.
+
+Bug fixes
+---------
+1. Several fixes to slice-based encoding.
+2. :option:`--log2-max-poc-lsb`'s range limited according to HEVC spec.
+3. Restrict MVs to within legal boundaries when encoding.
+
+Version 2.1
+===========
+
+Release date - 27th September, 2016
+
+Encoder enhancements
+--------------------
+1. Support for qg-size of 8
+2. Support for inserting non-IDR I-frames at scenecuts and when running with settings for fixed-GOP (min-keyint = max-keyint)
+3. Experimental support for slice-parallelism.
+
+API changes
+-----------
+1. Encode user-define SEI messages passed in through x265_picture object.
+2. Disable SEI and VUI messages from the bitstream
+3. Specify qpmin and qpmax
+4. Control number of bits to encode POC.
+
+Bug fixes
+---------
+1. QP fluctuation fix for first B-frame in mini-GOP for 2-pass encoding with tune-grain.
+2. Assembly fix for crashes in 32-bit from dct_sse4.
+3. Threadpool creation fix in windows platform.
+
+Version 2.0
+===========
+
+Release date - 13th July, 2016
+
+New Features
+------------
+
+1. uhd-bd: Enable Ultra-HD Bluray support
+2. rskip: Enables skipping recursion to analyze lower CU sizes using heuristics at different rd-levels. Provides good visual quality gains at the highest quality presets. 
+3. rc-grain: Enables a new ratecontrol mode specifically for grainy content. Strictly prevents QP oscillations within and between frames to avoid grain fluctuations.
+4. tune grain: A fully refactored and improved option to encode film grain content including QP control as well as analysis options.
+5. asm: ARM assembly is now enabled by default, native or cross compiled builds supported on armv6 and later systems.
+
+API and Key Behaviour Changes
+-----------------------------
+
+1. x265_rc_stats added to x265_picture, containing all RC decision points for that frame
+2. PTL: high tier is now allowed by default, chosen only if necessary
+3. multi-pass: First pass now uses slow-firstpass by default, enabling better RC decisions in future passes 
+4. pools: fix behaviour on multi-socketed Windows systems, provide more flexibility in determining thread and pool counts
+5. ABR: improve bits allocation in the first few frames, abr reset, vbv and cutree improved
+
+Misc
+----
+1. An SSIM calculation bug was corrected
+
+Version 1.9
+===========
+
+Release date - 29th January, 2016
+
+New Features
+------------
+
+1. Quant offsets: This feature allows block level quantization offsets to be specified for every frame. An API-only feature.
+2. --intra-refresh: Keyframes can be replaced by a moving column of intra blocks in non-keyframes.
+3. --limit-modes: Intelligently restricts mode analysis. 
+4. --max-luma and --min-luma for luma clipping, optional for HDR use-cases
+5. Emergency denoising is now enabled by default in very low bitrate, VBV encodes
+
+API Changes
+-----------
+
+1. x265_frame_stats returns many additional fields: maxCLL, maxFALL, residual energy, scenecut  and latency logging
+2. --qpfile now supports frametype 'K'
+3. x265 now allows CRF ratecontrol in pass N (N greater than or equal to 2)
+4. Chroma subsampling format YUV 4:0:0 is now fully supported and tested
+
+Presets and Performance
+-----------------------
+
+1. Recently added features lookahead-slices, limit-modes, limit-refs have been enabled by default for applicable presets.
+2. The default psy-rd strength has been increased to 2.0
+3. Multi-socket machines now use a single pool of threads that can work cross-socket.
+
+Version 1.8
+===========
+
+Release date - 10th August, 2015
+
+API Changes
+-----------
+1. Experimental support for Main12 is now enabled. Partial assembly support exists. 
+2. Main12 and Intra/Still picture profiles are now supported. Still picture profile is detected based on x265_param::totalFrames.
+3. Three classes of encoding statistics are now available through the API. 
+a) x265_stats - contains encoding statistics, available through x265_encoder_get_stats()
+b) x265_frame_stats and x265_cu_stats - contains frame encoding statistics, available through recon x265_picture
+4. --csv
+a) x265_encoder_log() is now deprecated
+b) x265_param::csvfn is also deprecated
+5. --log-level now controls only console logging, frame level console logging has been removed.
+6. Support added for new color transfer characteristic ARIB STD-B67
+
+New Features
+------------
+1. limit-refs: This feature limits the references analysed for individual CUs. Provides a nice tradeoff between efficiency and performance.
+2. aq-mode 3: A new aq-mode that provides additional biasing for low-light conditions.
+3. An improved scene cut detection logic that allows ratecontrol to manage visual quality at fade-ins and fade-outs better.
+
+Preset and Tune Options
+-----------------------
+
+1. tune grain: Increases psyRdoq strength to 10.0, and rdoq-level to 2.
+2. qg-size: Default value changed to 32.
​

x265_2.1.tar.gz/source/CMakeLists.txt -> x265_2.2.tar.gz/source/CMakeLists.txt Changed

@@ -30,7 +30,7 @@
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 95)
+set(X265_BUILD 102)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -60,6 +60,11 @@
     message(STATUS "Detected POWER target processor")
     set(POWER 1)
     add_definitions(-DX265_ARCH_POWER=1)
+    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
+        set(PPC64 1)
+        add_definitions(-DPPC64=1)
+        message(STATUS "Detected POWER PPC64 target processor")
+    endif()
 elseif(ARMMATCH GREATER "-1")
     if(CROSS_COMPILE_ARM)
         message(STATUS "Cross compiling for ARM arch")
@@ -167,6 +172,19 @@
 elseif(CMAKE_COMPILER_IS_GNUCXX)
     set(GCC 1)
 endif()
+
+if(CC STREQUAL "xlc")
+    message(STATUS "Use XLC compiler")
+    set(XLC 1)
+    set(GCC 0)
+    #set(CMAKE_C_COMPILER "/usr/bin/xlc")
+    #set(CMAKE_CXX_COMPILER "/usr/bin/xlc++")
+    add_definitions(-D__XLC__=1)
+    add_definitions(-O3 -qstrict -qhot -qaltivec)
+    add_definitions(-qinline=level=10 -qpath=IL:/data/video_files/latest.tpo/)
+endif()
+
+
 if(GCC)
     add_definitions(-Wall -Wextra -Wshadow)
     add_definitions(-D__STDC_LIMIT_MACROS=1)
@@ -396,6 +414,22 @@
     endif(WINXP_SUPPORT)
 endif()
 
+if(POWER)
+    # IBM Power8
+    # ENABLE_ALTIVEC: when ON, define HAVE_ALTIVEC=1 and add the AltiVec
+    # compiler flags; when OFF, define HAVE_ALTIVEC=0.
+    option(ENABLE_ALTIVEC "Enable ALTIVEC optimizations" ON)
+    if(ENABLE_ALTIVEC)
+        add_definitions(-DHAVE_ALTIVEC=1 -maltivec -mabi=altivec)
+        add_definitions(-flax-vector-conversions -fpermissive)
+    else()
+        add_definitions(-DHAVE_ALTIVEC=0)
+    endif()
+
+    # CPU_POWER8: when ON, target the POWER8 CPU (-mcpu=power8) and define
+    # X265_ARCH_POWER8=1.
+    option(CPU_POWER8 "Enable POWER8 CPU code generation" ON)
+    if(CPU_POWER8)
+        add_definitions(-mcpu=power8 -DX265_ARCH_POWER8=1)
+    endif()
+endif()
+
 include(version) # determine X265_VERSION and X265_LATEST_TAG
 include_directories(. common encoder "${PROJECT_BINARY_DIR}")

 
@@ -30,7 +30,7 @@
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 95)
+set(X265_BUILD 102)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -60,6 +60,11 @@
     message(STATUS "Detected POWER target processor")
     set(POWER 1)
     add_definitions(-DX265_ARCH_POWER=1)
+    # The pointer size is a number, so compare it numerically with EQUAL;
+    # MATCHES performs a regex search and would accept any value containing "8".
+    if("${CMAKE_SIZEOF_VOID_P}" EQUAL 8)
+        set(PPC64 1)
+        add_definitions(-DPPC64=1)
+        message(STATUS "Detected POWER PPC64 target processor")
+    endif()
 elseif(ARMMATCH GREATER "-1")
     if(CROSS_COMPILE_ARM)
         message(STATUS "Cross compiling for ARM arch")
@@ -167,6 +172,19 @@
 elseif(CMAKE_COMPILER_IS_GNUCXX)
     set(GCC 1)
 endif()
+
+# IBM XL C/C++ is selected through the CC variable. The expansion is quoted so
+# the comparison is a plain string test whether or not CC is defined.
+if("${CC}" STREQUAL "xlc")
+    message(STATUS "Use XLC compiler")
+    set(XLC 1)
+    # Turn off the GCC flag so the GCC-only option block below is skipped.
+    set(GCC 0)
+    add_definitions(-D__XLC__=1)
+    add_definitions(-O3 -qstrict -qhot -qaltivec)
+    # NOTE(review): the -qpath value is a machine-specific absolute path;
+    # confirm it exists on the build host or make it configurable.
+    add_definitions(-qinline=level=10 -qpath=IL:/data/video_files/latest.tpo/)
+endif()
+
+
 if(GCC)
     add_definitions(-Wall -Wextra -Wshadow)
     add_definitions(-D__STDC_LIMIT_MACROS=1)
@@ -396,6 +414,22 @@
     endif(WINXP_SUPPORT)
 endif()
 
+if(POWER)
+    # IBM Power8
+    # ENABLE_ALTIVEC: when ON, define HAVE_ALTIVEC=1 and add the AltiVec
+    # compiler flags; when OFF, define HAVE_ALTIVEC=0.
+    option(ENABLE_ALTIVEC "Enable ALTIVEC optimizations" ON)
+    if(ENABLE_ALTIVEC)
+        add_definitions(-DHAVE_ALTIVEC=1 -maltivec -mabi=altivec)
+        add_definitions(-flax-vector-conversions -fpermissive)
+    else()
+        add_definitions(-DHAVE_ALTIVEC=0)
+    endif()
+
+    # CPU_POWER8: when ON, target the POWER8 CPU (-mcpu=power8) and define
+    # X265_ARCH_POWER8=1.
+    option(CPU_POWER8 "Enable POWER8 CPU code generation" ON)
+    if(CPU_POWER8)
+        add_definitions(-mcpu=power8 -DX265_ARCH_POWER8=1)
+    endif()
+endif()
+
 include(version) # determine X265_VERSION and X265_LATEST_TAG
 include_directories(. common encoder "${PROJECT_BINARY_DIR}")
 
​

x265_2.1.tar.gz/source/common/CMakeLists.txt -> x265_2.2.tar.gz/source/common/CMakeLists.txt Changed

@@ -99,6 +99,19 @@
     source_group(Assembly FILES ${ASM_PRIMITIVES})
 endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
 
+if(POWER)
+    # Define X265_VERSION when compiling version.cpp on POWER builds.
+    set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
+    if(ENABLE_ALTIVEC)
+        # The AltiVec sources live under ppc/; collect them with that prefix.
+        set(ALTIVEC_SRCS pixel_altivec.cpp dct_altivec.cpp ipfilter_altivec.cpp intrapred_altivec.cpp)
+        foreach(ALTIVEC_FILE ${ALTIVEC_SRCS})
+            list(APPEND ALTIVEC_PRIMITIVES ppc/${ALTIVEC_FILE})
+        endforeach()
+        source_group(Intrinsics_altivec FILES ${ALTIVEC_PRIMITIVES})
+        # Silence these warnings for the AltiVec sources only.
+        set_source_files_properties(${ALTIVEC_PRIMITIVES} PROPERTIES COMPILE_FLAGS "-Wno-unused  -Wno-unknown-pragmas -Wno-maybe-uninitialized")
+    endif()
+endif()
+
+
 # set_target_properties can't do list expansion
 string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}")
 set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS ${VERSION_FLAGS})
@@ -116,7 +129,7 @@
 endif(WIN32)
 
 add_library(common OBJECT
-    ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${WINXP}
+    ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${ALTIVEC_PRIMITIVES} ${WINXP}
     primitives.cpp primitives.h
     pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
     constants.cpp constants.h

 
@@ -99,6 +99,19 @@
     source_group(Assembly FILES ${ASM_PRIMITIVES})
 endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
 
+if(POWER)
+    # Define X265_VERSION when compiling version.cpp on POWER builds.
+    set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
+    if(ENABLE_ALTIVEC)
+        # The AltiVec sources live under ppc/; collect them with that prefix.
+        set(ALTIVEC_SRCS pixel_altivec.cpp dct_altivec.cpp ipfilter_altivec.cpp intrapred_altivec.cpp)
+        foreach(ALTIVEC_FILE ${ALTIVEC_SRCS})
+            list(APPEND ALTIVEC_PRIMITIVES ppc/${ALTIVEC_FILE})
+        endforeach()
+        source_group(Intrinsics_altivec FILES ${ALTIVEC_PRIMITIVES})
+        # Silence these warnings for the AltiVec sources only.
+        set_source_files_properties(${ALTIVEC_PRIMITIVES} PROPERTIES COMPILE_FLAGS "-Wno-unused  -Wno-unknown-pragmas -Wno-maybe-uninitialized")
+    endif()
+endif()
+
+
 # set_target_properties can't do list expansion
 string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}")
 set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS ${VERSION_FLAGS})
@@ -116,7 +129,7 @@
 endif(WIN32)
 
 add_library(common OBJECT
-    ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${WINXP}
+    ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${ALTIVEC_PRIMITIVES} ${WINXP}
     primitives.cpp primitives.h
     pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
     constants.cpp constants.h
​

x265_2.1.tar.gz/source/common/bitstream.h -> x265_2.2.tar.gz/source/common/bitstream.h Changed

 
@@ -71,6 +71,7 @@
     uint32_t getNumberOfWrittenBytes() const { return m_byteOccupancy; }
     uint32_t getNumberOfWrittenBits()  const { return m_byteOccupancy * 8 + m_partialByteBits; }
     const uint8_t* getFIFO() const           { return m_fifo; }
+    /* Copy the write position (byte occupancy, pending partial byte and its
+     * bit count) from another Bitstream; only these three members are assigned. */
+    void     copyBits(Bitstream* stream)
+    {
+        m_byteOccupancy   = stream->m_byteOccupancy;
+        m_partialByte     = stream->m_partialByte;
+        m_partialByteBits = stream->m_partialByteBits;
+    }
 
     void     write(uint32_t val, uint32_t numBits);
     void     writeByte(uint32_t val);
​

x265_2.1.tar.gz/source/common/common.h -> x265_2.2.tar.gz/source/common/common.h Changed

 
@@ -176,7 +176,7 @@
 
 #define X265_MIN(a, b) ((a) < (b) ? (a) : (b))
 #define X265_MAX(a, b) ((a) > (b) ? (a) : (b))
-#define COPY1_IF_LT(x, y) if ((y) < (x)) (x) = (y);
+#define COPY1_IF_LT(x, y) {if ((y) < (x)) (x) = (y);}
 #define COPY2_IF_LT(x, y, a, b) \
     if ((y) < (x)) \
     { \
@@ -312,6 +312,7 @@
 
 #define MAX_NUM_REF_PICS            16 // max. number of pictures used for reference
 #define MAX_NUM_REF                 16 // max. number of entries in picture reference list
+#define MAX_NUM_SHORT_TERM_RPS      64 // max. number of short term reference picture set in SPS
 
 #define REF_NOT_VALID               -1
 
@@ -327,6 +328,8 @@
 
 #define PIXEL_MAX ((1 << X265_DEPTH) - 1)
 
+#define INTEGRAL_PLANE_NUM          12 // 12 integral planes for 32x32, 32x24, 32x8, 24x32, 16x16, 16x12, 16x4, 12x16, 8x32, 8x8, 4x16 and 4x4.
+
 namespace X265_NS {
 
 enum { SAO_NUM_OFFSET = 4 };
​

x265_2.1.tar.gz/source/common/cpu.cpp -> x265_2.2.tar.gz/source/common/cpu.cpp Changed

 
@@ -99,6 +99,10 @@
     { "ARMv6",           X265_CPU_ARMV6 },
     { "NEON",            X265_CPU_NEON },
     { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },
+
+#elif X265_ARCH_POWER8
+    { "Altivec",         X265_CPU_ALTIVEC },
+
 #endif // if X265_ARCH_X86
     { "", 0 },
 };
@@ -363,7 +367,18 @@
     return flags;
 }
 
-#else // if X265_ARCH_X86
+#elif X265_ARCH_POWER8
+
+/* POWER8 build: capabilities are decided at compile time. Reports
+ * X265_CPU_ALTIVEC when HAVE_ALTIVEC is non-zero, otherwise no flags. */
+uint32_t cpu_detect(void)
+{
+    uint32_t flags = 0;
+#if HAVE_ALTIVEC
+    flags |= X265_CPU_ALTIVEC;
+#endif
+    return flags;
+}
+
+#else // if X265_ARCH_POWER8
 
 uint32_t cpu_detect(void)
 {
​

x265_2.1.tar.gz/source/common/cudata.cpp -> x265_2.2.tar.gz/source/common/cudata.cpp Changed

 
@@ -296,6 +296,9 @@
     /* initialize the remaining CU data in one memset */
     memset(m_cuDepth, 0, (frame.m_param->internalCsp == X265_CSP_I400 ? BytesPerPartition - 11 : BytesPerPartition - 7) * m_numPartitions);
 
+    for (int8_t i = 0; i < NUM_TU_DEPTH; i++)
+        m_refTuDepth[i] = -1;
+
     uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
     m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : NULL;
     m_cuAbove = (m_cuAddr >= widthInCU) && !m_bFirstRowInSlice ? m_encData->getPicCTU(m_cuAddr - widthInCU) : NULL;
​

x265_2.1.tar.gz/source/common/cudata.h -> x265_2.2.tar.gz/source/common/cudata.h Changed

@@ -28,6 +28,8 @@
 #include "slice.h"
 #include "mv.h"
 
+#define NUM_TU_DEPTH 21
+
 namespace X265_NS {
 // private namespace
 
@@ -204,6 +206,7 @@
     enum { BytesPerPartition = 21 };  // combined sizeof() of all per-part data
 
     coeff_t*      m_trCoeff[3];       // transformed coefficient buffer per plane
+    int8_t        m_refTuDepth[NUM_TU_DEPTH];   // TU depth of CU at depths 0, 1 and 2
 
     MV*           m_mv[2];            // array of motion vectors per list
     MV*           m_mvd[2];           // array of coded motion vector deltas per list
@@ -355,9 +358,8 @@
             CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
         }
         CHECKED_MALLOC(charMemBlock, uint8_t, numPartition * numInstances * CUData::BytesPerPartition);
-        CHECKED_MALLOC(mvMemBlock, MV, numPartition * 4 * numInstances);
+        CHECKED_MALLOC_ZERO(mvMemBlock, MV, numPartition * 4 * numInstances);
         return true;
-
     fail:
         return false;
     }

 
@@ -28,6 +28,8 @@
 #include "slice.h"
 #include "mv.h"
 
+#define NUM_TU_DEPTH 21
+
 namespace X265_NS {
 // private namespace
 
@@ -204,6 +206,7 @@
     enum { BytesPerPartition = 21 };  // combined sizeof() of all per-part data
 
     coeff_t*      m_trCoeff[3];       // transformed coefficient buffer per plane
+    int8_t        m_refTuDepth[NUM_TU_DEPTH];   // TU depth of CU at depths 0, 1 and 2
 
     MV*           m_mv[2];            // array of motion vectors per list
     MV*           m_mvd[2];           // array of coded motion vector deltas per list
@@ -355,9 +358,8 @@
             CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
         }
         CHECKED_MALLOC(charMemBlock, uint8_t, numPartition * numInstances * CUData::BytesPerPartition);
-        CHECKED_MALLOC(mvMemBlock, MV, numPartition * 4 * numInstances);
+        CHECKED_MALLOC_ZERO(mvMemBlock, MV, numPartition * 4 * numInstances);
         return true;
-
     fail:
         return false;
     }
​

x265_2.1.tar.gz/source/common/framedata.cpp -> x265_2.2.tar.gz/source/common/framedata.cpp Changed

@@ -37,6 +37,9 @@
     m_slice  = new Slice;
     m_picCTU = new CUData[sps.numCUsInFrame];
     m_picCsp = csp;
+    m_spsrpsIdx = -1;
+    if (param.rc.bStatWrite)
+        m_spsrps = const_cast<RPS*>(sps.spsrps);
 
     m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame);
     for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++)
@@ -45,6 +48,12 @@
     CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
     CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
     reinit(sps);
+    
+    for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
+    {
+        m_meBuffer[i] = NULL;
+        m_meIntegral[i] = NULL;
+    }
     return true;
 
 fail:
@@ -67,4 +76,16 @@
 
     X265_FREE(m_cuStat);
     X265_FREE(m_rowStat);
+
+    if (m_meBuffer)
+    {
+        for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
+        {
+            if (m_meBuffer[i] != NULL)
+            {
+                X265_FREE(m_meBuffer[i]);
+                m_meBuffer[i] = NULL;
+            }
+        }
+    }
 }

 
@@ -37,6 +37,9 @@
     m_slice  = new Slice;
     m_picCTU = new CUData[sps.numCUsInFrame];
     m_picCsp = csp;
+    m_spsrpsIdx = -1;
+    if (param.rc.bStatWrite)
+        m_spsrps = const_cast<RPS*>(sps.spsrps);
 
     m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame);
     for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++)
@@ -45,6 +48,12 @@
     CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
     CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
     reinit(sps);
+    
+    for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
+    {
+        m_meBuffer[i] = NULL;
+        m_meIntegral[i] = NULL;
+    }
     return true;
 
 fail:
@@ -67,4 +76,16 @@
 
     X265_FREE(m_cuStat);
     X265_FREE(m_rowStat);
+
+    if (m_meBuffer)
+    {
+        for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
+        {
+            if (m_meBuffer[i] != NULL)
+            {
+                X265_FREE(m_meBuffer[i]);
+                m_meBuffer[i] = NULL;
+            }
+        }
+    }
 }
​

x265_2.1.tar.gz/source/common/framedata.h -> x265_2.2.tar.gz/source/common/framedata.h Changed

@@ -106,6 +106,9 @@
     CUDataMemPool  m_cuMemPool;
     CUData*        m_picCTU;
 
+    RPS*           m_spsrps;
+    int            m_spsrpsIdx;
+
     /* Rate control data used during encode and by references */
     struct RCStatCU
     {
@@ -123,10 +126,10 @@
         uint32_t encodedBits;   /* sum of 'totalBits' of encoded CTUs */
         uint32_t satdForVbv;    /* sum of lowres (estimated) costs for entire row */
         uint32_t intraSatdForVbv; /* sum of lowres (estimated) intra costs for entire row */
-        uint32_t diagSatd;
-        uint32_t diagIntraSatd;
-        double   diagQp;
-        double   diagQpScale;
+        uint32_t rowSatd;
+        uint32_t rowIntraSatd;
+        double   rowQp;
+        double   rowQpScale;
         double   sumQpRc;
         double   sumQpAq;
     };
@@ -148,6 +151,9 @@
     double         m_rateFactor; /* calculated based on the Frame QP */
     int            m_picCsp;
 
+    uint32_t*              m_meIntegral[INTEGRAL_PLANE_NUM];       // 12 integral planes for 32x32, 32x24, 32x8, 24x32, 16x16, 16x12, 16x4, 12x16, 8x32, 8x8, 4x16 and 4x4.
+    uint32_t*              m_meBuffer[INTEGRAL_PLANE_NUM];
+
     FrameData();
 
     bool create(const x265_param& param, const SPS& sps, int csp);
@@ -168,7 +174,6 @@
 /* Stores inter analysis data for a single frame */
 struct analysis_inter_data
 {
-    MV*         mv;
     WeightParam* wt;
     int32_t*    ref;
     uint8_t*    depth;

 
@@ -106,6 +106,9 @@
     CUDataMemPool  m_cuMemPool;
     CUData*        m_picCTU;
 
+    RPS*           m_spsrps;
+    int            m_spsrpsIdx;
+
     /* Rate control data used during encode and by references */
     struct RCStatCU
     {
@@ -123,10 +126,10 @@
         uint32_t encodedBits;   /* sum of 'totalBits' of encoded CTUs */
         uint32_t satdForVbv;    /* sum of lowres (estimated) costs for entire row */
         uint32_t intraSatdForVbv; /* sum of lowres (estimated) intra costs for entire row */
-        uint32_t diagSatd;
-        uint32_t diagIntraSatd;
-        double   diagQp;
-        double   diagQpScale;
+        uint32_t rowSatd;
+        uint32_t rowIntraSatd;
+        double   rowQp;
+        double   rowQpScale;
         double   sumQpRc;
         double   sumQpAq;
     };
@@ -148,6 +151,9 @@
     double         m_rateFactor; /* calculated based on the Frame QP */
     int            m_picCsp;
 
+    uint32_t*              m_meIntegral[INTEGRAL_PLANE_NUM];       // 12 integral planes for 32x32, 32x24, 32x8, 24x32, 16x16, 16x12, 16x4, 12x16, 8x32, 8x8, 4x16 and 4x4.
+    uint32_t*              m_meBuffer[INTEGRAL_PLANE_NUM];
+
     FrameData();
 
     bool create(const x265_param& param, const SPS& sps, int csp);
@@ -168,7 +174,6 @@
 /* Stores inter analysis data for a single frame */
 struct analysis_inter_data
 {
-    MV*         mv;
     WeightParam* wt;
     int32_t*    ref;
     uint8_t*    depth;
​

x265_2.1.tar.gz/source/common/param.cpp -> x265_2.2.tar.gz/source/common/param.cpp Changed

@@ -149,6 +149,7 @@
     param->bBPyramid = 1;
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
     param->lookaheadSlices = 8;
+    param->scenecutBias = 5.0;
 
     /* Intra Coding Tools */
     param->bEnableConstrainedIntra = 0;
@@ -176,6 +177,7 @@
     param->maxNumReferences = 3;
     param->bEnableTemporalMvp = 1;
     param->bSourceReferenceEstimation = 0;
+    param->limitTU = 0;
 
     /* Loop Filter */
     param->bEnableLoopFilter = 1;
@@ -197,6 +199,7 @@
     param->bCULossless = 0;
     param->bEnableTemporalSubLayers = 0;
     param->bEnableRdRefine = 0;
+    param->bMultiPassOptRPS = 0;
 
     /* Rate control options */
     param->rc.vbvMaxBitrate = 0;
@@ -229,8 +232,6 @@
     param->rc.qpMin = 0;
     param->rc.qpMax = QP_MAX_MAX;
 
-    param->bDiscardOptionalVUI = 0;
-
     /* Video Usability Information (VUI) */
     param->vui.aspectRatioIdc = 0;
     param->vui.sarWidth = 0;
@@ -256,8 +257,13 @@
     param->minLuma = 0;
     param->maxLuma = PIXEL_MAX;
     param->log2MaxPocLsb = 8;
-    param->bDiscardSEI = false;
     param->maxSlices = 1;
+
+    param->bEmitVUITimingInfo   = 1;
+    param->bEmitVUIHRDInfo      = 1;
+    param->bOptQpPPS            = 1;
+    param->bOptRefListLengthPPS = 1;
+
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -901,21 +907,19 @@
     // solve "fatal error C1061: compiler limit : blocks nested too deeply"
     if (bExtraParams)
     {
-        bExtraParams = false;
-        if (0) ;
-        OPT("slices") p->maxSlices = atoi(value);
-        else
-            bExtraParams = true;
-    }
-
-    if (bExtraParams)
-    {
         if (0) ;
         OPT("qpmin") p->rc.qpMin = atoi(value);
         OPT("analyze-src-pics") p->bSourceReferenceEstimation = atobool(value);
         OPT("log2-max-poc-lsb") p->log2MaxPocLsb = atoi(value);
-        OPT("discard-sei") p->bDiscardSEI = atobool(value);
-        OPT("discard-vui") p->bDiscardOptionalVUI = atobool(value);
+        OPT("vui-timing-info") p->bEmitVUITimingInfo = atobool(value);
+        OPT("vui-hrd-info") p->bEmitVUIHRDInfo = atobool(value);
+        OPT("slices") p->maxSlices = atoi(value);
+        OPT("limit-tu") p->limitTU = atoi(value);
+        OPT("opt-qp-pps") p->bOptQpPPS = atobool(value);
+        OPT("opt-ref-list-length-pps") p->bOptRefListLengthPPS = atobool(value);
+        OPT("multi-pass-opt-rps") p->bMultiPassOptRPS = atobool(value);
+        OPT("scenecut-bias") p->scenecutBias = atof(value);
+
         else
             return X265_PARAM_BAD_NAME;
     }
@@ -1078,8 +1082,8 @@
         "Multiple-Slices mode must be enable Wavefront Parallel Processing (--wpp)");
     CHECK(param->internalBitDepth != X265_DEPTH,
           "internalBitDepth must match compiled bit depth");
-    CHECK(param->minCUSize != 64 && param->minCUSize != 32 && param->minCUSize != 16 && param->minCUSize != 8,
-          "minimim CU size must be 8, 16, 32, or 64");
+    CHECK(param->minCUSize != 32 && param->minCUSize != 16 && param->minCUSize != 8,
+          "minimim CU size must be 8, 16 or 32");
     CHECK(param->minCUSize > param->maxCUSize,
           "min CU size must be less than or equal to max CU size");
     CHECK(param->rc.qp < -6 * (param->internalBitDepth - 8) || param->rc.qp > QP_MAX_SPEC,
@@ -1088,8 +1092,8 @@
           "Frame rate numerator and denominator must be specified");
     CHECK(param->interlaceMode < 0 || param->interlaceMode > 2,
           "Interlace mode must be 0 (progressive) 1 (top-field first) or 2 (bottom field first)");
-    CHECK(param->searchMethod<0 || param->searchMethod> X265_FULL_SEARCH,
-          "Search method is not supported value (0:DIA 1:HEX 2:UMH 3:HM 5:FULL)");
+    CHECK(param->searchMethod < 0 || param->searchMethod > X265_FULL_SEARCH,
+          "Search method is not supported value (0:DIA 1:HEX 2:UMH 3:HM 4:SEA 5:FULL)");
     CHECK(param->searchRange < 0,
           "Search Range must be more than 0");
     CHECK(param->searchRange >= 32768,
@@ -1122,6 +1126,7 @@
           "QuadtreeTUMaxDepthInter must be less than or equal to the difference between log2(maxCUSize) and QuadtreeTULog2MinSize plus 1");
     CHECK((param->maxTUSize != 32 && param->maxTUSize != 16 && param->maxTUSize != 8 && param->maxTUSize != 4),
           "max TU size must be 4, 8, 16, or 32");
+    CHECK(param->limitTU > 4, "Invalid limit-tu option, limit-TU must be between 0 and 4");
     CHECK(param->maxNumMergeCand < 1, "MaxNumMergeCand must be 1 or greater.");
     CHECK(param->maxNumMergeCand > 5, "MaxNumMergeCand must be 5 or smaller.");
 
@@ -1217,6 +1222,8 @@
           "Valid Logging level -1:none 0:error 1:warning 2:info 3:debug 4:full");
     CHECK(param->scenecutThreshold < 0,
           "scenecutThreshold must be greater than 0");
+    CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias,
+           "scenecut-bias must be between 0 and 100");
     CHECK(param->rdPenalty < 0 || param->rdPenalty > 2,
           "Valid penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum");
     CHECK(param->keyframeMax < -1,
@@ -1247,10 +1254,12 @@
         "qpmax exceeds supported range (0 to 69)");
     CHECK(param->rc.qpMin < QP_MIN || param->rc.qpMin > QP_MAX_MAX,
         "qpmin exceeds supported range (0 to 69)");
-    CHECK(param->log2MaxPocLsb < 4,
-        "maximum of the picture order count can not be less than 4");
-    CHECK(1 > param->maxSlices || param->maxSlices > ((param->sourceHeight + param->maxCUSize - 1) / param->maxCUSize),
-        "The slices can not be more than number of rows");
+    CHECK(param->log2MaxPocLsb < 4 || param->log2MaxPocLsb > 16,
+        "Supported range for log2MaxPocLsb is 4 to 16");
+#if !X86_64
+    CHECK(param->searchMethod == X265_SEA && (param->sourceWidth > 840 || param->sourceHeight > 480),
+        "SEA motion search does not support resolutions greater than 480p in 32 bit build");
+#endif
     return check_failed;
 }
 
@@ -1338,9 +1347,8 @@
 
     x265_log(param, X265_LOG_INFO, "ME / range / subpel / merge         : %s / %d / %d / %d\n",
              x265_motion_est_names[param->searchMethod], param->searchRange, param->subpelRefine, param->maxNumMergeCand);
-
     if (param->keyframeMax != INT_MAX || param->scenecutThreshold)
-        x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut       : %d / %d / %d\n", param->keyframeMin, param->keyframeMax, param->scenecutThreshold);
+        x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut / bias: %d / %d / %d / %.2lf\n", param->keyframeMin, param->keyframeMax, param->scenecutThreshold, param->scenecutBias * 100);
     else
         x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut       : disabled\n");
 
@@ -1395,6 +1403,7 @@
     TOOLVAL(param->noiseReductionInter, "nr-inter=%d");
     TOOLOPT(param->bEnableTSkipFast, "tskip-fast");
     TOOLOPT(!param->bEnableTSkipFast && param->bEnableTransformSkip, "tskip");
+    TOOLVAL(param->limitTU , "limit-tu=%d");
     TOOLOPT(param->bCULossless, "cu-lossless");
     TOOLOPT(param->bEnableSignHiding, "signhide");
     TOOLOPT(param->bEnableTemporalMvp, "tmvp");
@@ -1423,7 +1432,7 @@
     fflush(stderr);
 }
 
-char *x265_param2string(x265_param* p)
+char *x265_param2string(x265_param* p, int padx, int pady)
 {
     char *buf, *s;
 
@@ -1434,70 +1443,92 @@
 #define BOOL(param, cliopt) \
     s += sprintf(s, " %s", (param) ? cliopt : "no-" cliopt);
 
-    s += sprintf(s, "%dx%d", p->sourceWidth,p->sourceHeight);
-    s += sprintf(s, " fps=%u/%u", p->fpsNum, p->fpsDenom);
-    s += sprintf(s, " bitdepth=%d", p->internalBitDepth);
+    s += sprintf(s, "cpuid=%d", p->cpuid);
+    s += sprintf(s, " frame-threads=%d", p->frameNumThreads);
+    if (p->numaPools)
+        s += sprintf(s, " numa-pools=%s", p->numaPools);
     BOOL(p->bEnableWavefront, "wpp");
+    BOOL(p->bDistributeModeAnalysis, "pmode");
+    BOOL(p->bDistributeMotionEstimation, "pme");
+    BOOL(p->bEnablePsnr, "psnr");
+    BOOL(p->bEnableSsim, "ssim");
+    s += sprintf(s, " log-level=%d", p->logLevel);
+    s += sprintf(s, " bitdepth=%d", p->internalBitDepth);
+    s += sprintf(s, " input-csp=%d", p->internalCsp);
+    s += sprintf(s, " fps=%u/%u", p->fpsNum, p->fpsDenom);
+    s += sprintf(s, " input-res=%dx%d", p->sourceWidth - padx, p->sourceHeight - pady);
+    s += sprintf(s, " interlace=%d", p->interlaceMode);
+    s += sprintf(s, " total-frames=%d", p->totalFrames);
+    s += sprintf(s, " level-idc=%d", p->levelIdc);
+    s += sprintf(s, " high-tier=%d", p->bHighTier);
+    s += sprintf(s, " uhd-bd=%d", p->uhdBluray);
+    s += sprintf(s, " ref=%d", p->maxNumReferences);
+    BOOL(p->bAllowNonConformance, "allow-non-conformance");
+    BOOL(p->bRepeatHeaders, "repeat-headers");
+    BOOL(p->bAnnexB, "annexb");
+    BOOL(p->bEnableAccessUnitDelimiters, "aud");
+    BOOL(p->bEmitHRDSEI, "hrd");
+    BOOL(p->bEmitInfoSEI, "info");
+    s += sprintf(s, " hash=%d", p->decodedPictureHashSEI);
+    BOOL(p->bEnableTemporalSubLayers, "temporal-layers");
+    BOOL(p->bOpenGOP, "open-gop");
+    s += sprintf(s, " min-keyint=%d", p->keyframeMin);

 
@@ -149,6 +149,7 @@
     param->bBPyramid = 1;
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
     param->lookaheadSlices = 8;
+    param->scenecutBias = 5.0;
 
     /* Intra Coding Tools */
     param->bEnableConstrainedIntra = 0;
@@ -176,6 +177,7 @@
     param->maxNumReferences = 3;
     param->bEnableTemporalMvp = 1;
     param->bSourceReferenceEstimation = 0;
+    param->limitTU = 0;
 
     /* Loop Filter */
     param->bEnableLoopFilter = 1;
@@ -197,6 +199,7 @@
     param->bCULossless = 0;
     param->bEnableTemporalSubLayers = 0;
     param->bEnableRdRefine = 0;
+    param->bMultiPassOptRPS = 0;
 
     /* Rate control options */
     param->rc.vbvMaxBitrate = 0;
@@ -229,8 +232,6 @@
     param->rc.qpMin = 0;
     param->rc.qpMax = QP_MAX_MAX;
 
-    param->bDiscardOptionalVUI = 0;
-
     /* Video Usability Information (VUI) */
     param->vui.aspectRatioIdc = 0;
     param->vui.sarWidth = 0;
@@ -256,8 +257,13 @@
     param->minLuma = 0;
     param->maxLuma = PIXEL_MAX;
     param->log2MaxPocLsb = 8;
-    param->bDiscardSEI = false;
     param->maxSlices = 1;
+
+    param->bEmitVUITimingInfo   = 1;
+    param->bEmitVUIHRDInfo      = 1;
+    param->bOptQpPPS            = 1;
+    param->bOptRefListLengthPPS = 1;
+
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -901,21 +907,19 @@
     // solve "fatal error C1061: compiler limit : blocks nested too deeply"
     if (bExtraParams)
     {
-        bExtraParams = false;
-        if (0) ;
-        OPT("slices") p->maxSlices = atoi(value);
-        else
-            bExtraParams = true;
-    }
-
-    if (bExtraParams)
-    {
         if (0) ;
         OPT("qpmin") p->rc.qpMin = atoi(value);
         OPT("analyze-src-pics") p->bSourceReferenceEstimation = atobool(value);
         OPT("log2-max-poc-lsb") p->log2MaxPocLsb = atoi(value);
-        OPT("discard-sei") p->bDiscardSEI = atobool(value);
-        OPT("discard-vui") p->bDiscardOptionalVUI = atobool(value);
+        OPT("vui-timing-info") p->bEmitVUITimingInfo = atobool(value);
+        OPT("vui-hrd-info") p->bEmitVUIHRDInfo = atobool(value);
+        OPT("slices") p->maxSlices = atoi(value);
+        OPT("limit-tu") p->limitTU = atoi(value);
+        OPT("opt-qp-pps") p->bOptQpPPS = atobool(value);
+        OPT("opt-ref-list-length-pps") p->bOptRefListLengthPPS = atobool(value);
+        OPT("multi-pass-opt-rps") p->bMultiPassOptRPS = atobool(value);
+        OPT("scenecut-bias") p->scenecutBias = atof(value);
+
         else
             return X265_PARAM_BAD_NAME;
     }
@@ -1078,8 +1082,8 @@
         "Multiple-Slices mode must be enable Wavefront Parallel Processing (--wpp)");
     CHECK(param->internalBitDepth != X265_DEPTH,
           "internalBitDepth must match compiled bit depth");
-    CHECK(param->minCUSize != 64 && param->minCUSize != 32 && param->minCUSize != 16 && param->minCUSize != 8,
-          "minimim CU size must be 8, 16, 32, or 64");
+    CHECK(param->minCUSize != 32 && param->minCUSize != 16 && param->minCUSize != 8,
+          "minimim CU size must be 8, 16 or 32");
     CHECK(param->minCUSize > param->maxCUSize,
           "min CU size must be less than or equal to max CU size");
     CHECK(param->rc.qp < -6 * (param->internalBitDepth - 8) || param->rc.qp > QP_MAX_SPEC,
@@ -1088,8 +1092,8 @@
           "Frame rate numerator and denominator must be specified");
     CHECK(param->interlaceMode < 0 || param->interlaceMode > 2,
           "Interlace mode must be 0 (progressive) 1 (top-field first) or 2 (bottom field first)");
-    CHECK(param->searchMethod<0 || param->searchMethod> X265_FULL_SEARCH,
-          "Search method is not supported value (0:DIA 1:HEX 2:UMH 3:HM 5:FULL)");
+    CHECK(param->searchMethod < 0 || param->searchMethod > X265_FULL_SEARCH,
+          "Search method is not supported value (0:DIA 1:HEX 2:UMH 3:HM 4:SEA 5:FULL)");
     CHECK(param->searchRange < 0,
           "Search Range must be more than 0");
     CHECK(param->searchRange >= 32768,
@@ -1122,6 +1126,7 @@
           "QuadtreeTUMaxDepthInter must be less than or equal to the difference between log2(maxCUSize) and QuadtreeTULog2MinSize plus 1");
     CHECK((param->maxTUSize != 32 && param->maxTUSize != 16 && param->maxTUSize != 8 && param->maxTUSize != 4),
           "max TU size must be 4, 8, 16, or 32");
+    CHECK(param->limitTU > 4, "Invalid limit-tu option, limit-TU must be between 0 and 4");
     CHECK(param->maxNumMergeCand < 1, "MaxNumMergeCand must be 1 or greater.");
     CHECK(param->maxNumMergeCand > 5, "MaxNumMergeCand must be 5 or smaller.");
 
@@ -1217,6 +1222,8 @@
           "Valid Logging level -1:none 0:error 1:warning 2:info 3:debug 4:full");
     CHECK(param->scenecutThreshold < 0,
           "scenecutThreshold must be greater than 0");
+    CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias,
+           "scenecut-bias must be between 0 and 100");
     CHECK(param->rdPenalty < 0 || param->rdPenalty > 2,
           "Valid penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum");
     CHECK(param->keyframeMax < -1,
@@ -1247,10 +1254,12 @@
         "qpmax exceeds supported range (0 to 69)");
     CHECK(param->rc.qpMin < QP_MIN || param->rc.qpMin > QP_MAX_MAX,
         "qpmin exceeds supported range (0 to 69)");
-    CHECK(param->log2MaxPocLsb < 4,
-        "maximum of the picture order count can not be less than 4");
-    CHECK(1 > param->maxSlices || param->maxSlices > ((param->sourceHeight + param->maxCUSize - 1) / param->maxCUSize),
-        "The slices can not be more than number of rows");
+    CHECK(param->log2MaxPocLsb < 4 || param->log2MaxPocLsb > 16,
+        "Supported range for log2MaxPocLsb is 4 to 16");
+#if !X86_64
+    CHECK(param->searchMethod == X265_SEA && (param->sourceWidth > 840 || param->sourceHeight > 480),
+        "SEA motion search does not support resolutions greater than 480p in 32 bit build");
+#endif
     return check_failed;
 }
 
@@ -1338,9 +1347,8 @@
 
     x265_log(param, X265_LOG_INFO, "ME / range / subpel / merge         : %s / %d / %d / %d\n",
              x265_motion_est_names[param->searchMethod], param->searchRange, param->subpelRefine, param->maxNumMergeCand);
-
     if (param->keyframeMax != INT_MAX || param->scenecutThreshold)
-        x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut       : %d / %d / %d\n", param->keyframeMin, param->keyframeMax, param->scenecutThreshold);
+        x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut / bias: %d / %d / %d / %.2lf\n", param->keyframeMin, param->keyframeMax, param->scenecutThreshold, param->scenecutBias * 100);
     else
         x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut       : disabled\n");
 
@@ -1395,6 +1403,7 @@
     TOOLVAL(param->noiseReductionInter, "nr-inter=%d");
     TOOLOPT(param->bEnableTSkipFast, "tskip-fast");
     TOOLOPT(!param->bEnableTSkipFast && param->bEnableTransformSkip, "tskip");
+    TOOLVAL(param->limitTU , "limit-tu=%d");
     TOOLOPT(param->bCULossless, "cu-lossless");
     TOOLOPT(param->bEnableSignHiding, "signhide");
     TOOLOPT(param->bEnableTemporalMvp, "tmvp");
@@ -1423,7 +1432,7 @@
     fflush(stderr);
 }
 
-char *x265_param2string(x265_param* p)
+char *x265_param2string(x265_param* p, int padx, int pady)
 {
     char *buf, *s;
 
@@ -1434,70 +1443,92 @@
 #define BOOL(param, cliopt) \
     s += sprintf(s, " %s", (param) ? cliopt : "no-" cliopt);
 
-    s += sprintf(s, "%dx%d", p->sourceWidth,p->sourceHeight);
-    s += sprintf(s, " fps=%u/%u", p->fpsNum, p->fpsDenom);
-    s += sprintf(s, " bitdepth=%d", p->internalBitDepth);
+    s += sprintf(s, "cpuid=%d", p->cpuid);
+    s += sprintf(s, " frame-threads=%d", p->frameNumThreads);
+    if (p->numaPools)
+        s += sprintf(s, " numa-pools=%s", p->numaPools);
     BOOL(p->bEnableWavefront, "wpp");
+    BOOL(p->bDistributeModeAnalysis, "pmode");
+    BOOL(p->bDistributeMotionEstimation, "pme");
+    BOOL(p->bEnablePsnr, "psnr");
+    BOOL(p->bEnableSsim, "ssim");
+    s += sprintf(s, " log-level=%d", p->logLevel);
+    s += sprintf(s, " bitdepth=%d", p->internalBitDepth);
+    s += sprintf(s, " input-csp=%d", p->internalCsp);
+    s += sprintf(s, " fps=%u/%u", p->fpsNum, p->fpsDenom);
+    s += sprintf(s, " input-res=%dx%d", p->sourceWidth - padx, p->sourceHeight - pady);
+    s += sprintf(s, " interlace=%d", p->interlaceMode);
+    s += sprintf(s, " total-frames=%d", p->totalFrames);
+    s += sprintf(s, " level-idc=%d", p->levelIdc);
+    s += sprintf(s, " high-tier=%d", p->bHighTier);
+    s += sprintf(s, " uhd-bd=%d", p->uhdBluray);
+    s += sprintf(s, " ref=%d", p->maxNumReferences);
+    BOOL(p->bAllowNonConformance, "allow-non-conformance");
+    BOOL(p->bRepeatHeaders, "repeat-headers");
+    BOOL(p->bAnnexB, "annexb");
+    BOOL(p->bEnableAccessUnitDelimiters, "aud");
+    BOOL(p->bEmitHRDSEI, "hrd");
+    BOOL(p->bEmitInfoSEI, "info");
+    s += sprintf(s, " hash=%d", p->decodedPictureHashSEI);
+    BOOL(p->bEnableTemporalSubLayers, "temporal-layers");
+    BOOL(p->bOpenGOP, "open-gop");
+    s += sprintf(s, " min-keyint=%d", p->keyframeMin);
​

x265_2.1.tar.gz/source/common/param.h -> x265_2.2.tar.gz/source/common/param.h Changed

 
@@ -31,7 +31,7 @@
 int   x265_set_globals(x265_param *param);
 void  x265_print_params(x265_param *param);
 void  x265_param_apply_fastfirstpass(x265_param *p);
-char* x265_param2string(x265_param *param);
+char* x265_param2string(x265_param *param, int padx, int pady);
 int   x265_atoi(const char *str, bool& bError);
 double x265_atof(const char *str, bool& bError);
 int   parseCpuName(const char *value, bool& bError);
​

x265_2.1.tar.gz/source/common/pixel.cpp -> x265_2.2.tar.gz/source/common/pixel.cpp Changed

@@ -117,6 +117,52 @@
     }
 }
 
+template<int lx, int ly>
+int ads_x4(int encDC[4], uint32_t *sums, int delta, uint16_t *costMvX, int16_t *mvs, int width, int thresh)
+{
+    int nmv = 0;
+    for (int16_t i = 0; i < width; i++, sums++)
+    {
+        int ads = abs(encDC[0] - long(sums[0]))
+            + abs(encDC[1] - long(sums[lx >> 1]))
+            + abs(encDC[2] - long(sums[delta]))
+            + abs(encDC[3] - long(sums[delta + (lx >> 1)]))
+            + costMvX[i];
+        if (ads < thresh)
+            mvs[nmv++] = i;
+    }
+    return nmv;
+}
+
+template<int lx, int ly>
+int ads_x2(int encDC[2], uint32_t *sums, int delta, uint16_t *costMvX, int16_t *mvs, int width, int thresh)
+{
+    int nmv = 0;
+    for (int16_t i = 0; i < width; i++, sums++)
+    {
+        int ads = abs(encDC[0] - long(sums[0]))
+            + abs(encDC[1] - long(sums[delta]))
+            + costMvX[i];
+        if (ads < thresh)
+            mvs[nmv++] = i;
+    }
+    return nmv;
+}
+
+template<int lx, int ly>
+int ads_x1(int encDC[1], uint32_t *sums, int, uint16_t *costMvX, int16_t *mvs, int width, int thresh)
+{
+    int nmv = 0;
+    for (int16_t i = 0; i < width; i++, sums++)
+    {
+        int ads = abs(encDC[0] - long(sums[0]))
+            + costMvX[i];
+        if (ads < thresh)
+            mvs[nmv++] = i;
+    }
+    return nmv;
+}
+
 template<int lx, int ly, class T1, class T2>
 sse_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
 {
@@ -991,6 +1037,32 @@
     LUMA_PU(64, 16);
     LUMA_PU(16, 64);
 
+    p.pu[LUMA_4x4].ads = ads_x1<4, 4>;
+    p.pu[LUMA_8x8].ads = ads_x1<8, 8>;
+    p.pu[LUMA_8x4].ads = ads_x2<8, 4>;
+    p.pu[LUMA_4x8].ads = ads_x2<4, 8>;
+    p.pu[LUMA_16x16].ads = ads_x4<16, 16>;
+    p.pu[LUMA_16x8].ads = ads_x2<16, 8>;
+    p.pu[LUMA_8x16].ads = ads_x2<8, 16>;
+    p.pu[LUMA_16x12].ads = ads_x1<16, 12>;
+    p.pu[LUMA_12x16].ads = ads_x1<12, 16>;
+    p.pu[LUMA_16x4].ads = ads_x1<16, 4>;
+    p.pu[LUMA_4x16].ads = ads_x1<4, 16>;
+    p.pu[LUMA_32x32].ads = ads_x4<32, 32>;
+    p.pu[LUMA_32x16].ads = ads_x2<32, 16>;
+    p.pu[LUMA_16x32].ads = ads_x2<16, 32>;
+    p.pu[LUMA_32x24].ads = ads_x4<32, 24>;
+    p.pu[LUMA_24x32].ads = ads_x4<24, 32>;
+    p.pu[LUMA_32x8].ads = ads_x4<32, 8>;
+    p.pu[LUMA_8x32].ads = ads_x4<8, 32>;
+    p.pu[LUMA_64x64].ads = ads_x4<64, 64>;
+    p.pu[LUMA_64x32].ads = ads_x2<64, 32>;
+    p.pu[LUMA_32x64].ads = ads_x2<32, 64>;
+    p.pu[LUMA_64x48].ads = ads_x4<64, 48>;
+    p.pu[LUMA_48x64].ads = ads_x4<48, 64>;
+    p.pu[LUMA_64x16].ads = ads_x4<64, 16>;
+    p.pu[LUMA_16x64].ads = ads_x4<16, 64>;
+
     p.pu[LUMA_4x4].satd   = satd_4x4;
     p.pu[LUMA_8x8].satd   = satd8<8, 8>;
     p.pu[LUMA_8x4].satd   = satd_8x4;

 
@@ -117,6 +117,52 @@
     }
 }
 
+template<int lx, int ly>
+int ads_x4(int encDC[4], uint32_t *sums, int delta, uint16_t *costMvX, int16_t *mvs, int width, int thresh)
+{
+    int nmv = 0;
+    for (int16_t i = 0; i < width; i++, sums++)
+    {
+        int ads = abs(encDC[0] - long(sums[0]))
+            + abs(encDC[1] - long(sums[lx >> 1]))
+            + abs(encDC[2] - long(sums[delta]))
+            + abs(encDC[3] - long(sums[delta + (lx >> 1)]))
+            + costMvX[i];
+        if (ads < thresh)
+            mvs[nmv++] = i;
+    }
+    return nmv;
+}
+
+template<int lx, int ly>
+int ads_x2(int encDC[2], uint32_t *sums, int delta, uint16_t *costMvX, int16_t *mvs, int width, int thresh)
+{
+    int nmv = 0;
+    for (int16_t i = 0; i < width; i++, sums++)
+    {
+        int ads = abs(encDC[0] - long(sums[0]))
+            + abs(encDC[1] - long(sums[delta]))
+            + costMvX[i];
+        if (ads < thresh)
+            mvs[nmv++] = i;
+    }
+    return nmv;
+}
+
+template<int lx, int ly>
+int ads_x1(int encDC[1], uint32_t *sums, int, uint16_t *costMvX, int16_t *mvs, int width, int thresh)
+{
+    int nmv = 0;
+    for (int16_t i = 0; i < width; i++, sums++)
+    {
+        int ads = abs(encDC[0] - long(sums[0]))
+            + costMvX[i];
+        if (ads < thresh)
+            mvs[nmv++] = i;
+    }
+    return nmv;
+}
+
 template<int lx, int ly, class T1, class T2>
 sse_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
 {
@@ -991,6 +1037,32 @@
     LUMA_PU(64, 16);
     LUMA_PU(16, 64);
 
+    p.pu[LUMA_4x4].ads = ads_x1<4, 4>;
+    p.pu[LUMA_8x8].ads = ads_x1<8, 8>;
+    p.pu[LUMA_8x4].ads = ads_x2<8, 4>;
+    p.pu[LUMA_4x8].ads = ads_x2<4, 8>;
+    p.pu[LUMA_16x16].ads = ads_x4<16, 16>;
+    p.pu[LUMA_16x8].ads = ads_x2<16, 8>;
+    p.pu[LUMA_8x16].ads = ads_x2<8, 16>;
+    p.pu[LUMA_16x12].ads = ads_x1<16, 12>;
+    p.pu[LUMA_12x16].ads = ads_x1<12, 16>;
+    p.pu[LUMA_16x4].ads = ads_x1<16, 4>;
+    p.pu[LUMA_4x16].ads = ads_x1<4, 16>;
+    p.pu[LUMA_32x32].ads = ads_x4<32, 32>;
+    p.pu[LUMA_32x16].ads = ads_x2<32, 16>;
+    p.pu[LUMA_16x32].ads = ads_x2<16, 32>;
+    p.pu[LUMA_32x24].ads = ads_x4<32, 24>;
+    p.pu[LUMA_24x32].ads = ads_x4<24, 32>;
+    p.pu[LUMA_32x8].ads = ads_x4<32, 8>;
+    p.pu[LUMA_8x32].ads = ads_x4<8, 32>;
+    p.pu[LUMA_64x64].ads = ads_x4<64, 64>;
+    p.pu[LUMA_64x32].ads = ads_x2<64, 32>;
+    p.pu[LUMA_32x64].ads = ads_x2<32, 64>;
+    p.pu[LUMA_64x48].ads = ads_x4<64, 48>;
+    p.pu[LUMA_48x64].ads = ads_x4<48, 64>;
+    p.pu[LUMA_64x16].ads = ads_x4<64, 16>;
+    p.pu[LUMA_16x64].ads = ads_x4<16, 64>;
+
     p.pu[LUMA_4x4].satd   = satd_4x4;
     p.pu[LUMA_8x8].satd   = satd8<8, 8>;
     p.pu[LUMA_8x4].satd   = satd_8x4;
​

x265_2.2.tar.gz/source/common/ppc/dct_altivec.cpp Added

@@ -0,0 +1,819 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Roger Moussalli <rmoussal@us.ibm.com>
+ *          Min Chen <min.chen@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+#include "contexts.h"   // costCoeffNxN_c
+#include "threading.h"  // CLZ
+#include "ppccommon.h"
+
+using namespace X265_NS;
+
+static uint32_t quant_altivec(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
+{
+
+    X265_CHECK(qBits >= 8, "qBits less than 8\n");
+
+    X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
+
+    int qBits8 = qBits - 8;
+    uint32_t numSig = 0;
+
+
+    int level[8] ;
+    int sign[8] ;
+    int tmplevel[8] ;
+
+    const vector signed short v_zeros = {0, 0, 0, 0, 0, 0, 0, 0} ;
+    const vector signed short v_neg1 = {-1, -1, -1, -1, -1, -1, -1, -1} ;
+    const vector signed short v_pos1_ss = {1, 1, 1, 1, 1, 1, 1, 1} ;
+    const vector signed int v_pos1_sw = {1, 1, 1, 1} ;
+
+    const vector signed int v_clip_high = {32767, 32767, 32767, 32767} ;
+    const vector signed int v_clip_low = {-32768, -32768, -32768, -32768} ;
+
+
+    vector signed short v_level_ss ;
+    vector signed int v_level_0, v_level_1 ;
+    vector signed int v_tmplevel_0, v_tmplevel_1 ;
+    vector signed short v_sign_ss ;
+    vector signed int v_sign_0, v_sign_1 ;
+    vector signed int v_quantCoeff_0, v_quantCoeff_1 ;
+
+    vector signed int v_numSig = {0, 0, 0, 0} ;
+
+    vector signed int v_add ;
+    v_add[0] = add ;
+    v_add = vec_splat(v_add, 0) ;
+
+    vector unsigned int v_qBits ;
+    v_qBits[0] = qBits ;
+    v_qBits = vec_splat(v_qBits, 0) ;
+
+    vector unsigned int v_qBits8 ;
+    v_qBits8[0] = qBits8 ;
+    v_qBits8 = vec_splat(v_qBits8, 0) ;
+
+
+    for (int blockpos_outer = 0; blockpos_outer < numCoeff; blockpos_outer+=16)
+    {
+        int blockpos = blockpos_outer ;
+
+        // for(int ii=0; ii<8; ii++) { level[ii] = coef[blockpos+ii] ;}
+        v_level_ss = vec_xl(0, &coef[blockpos]) ;
+        v_level_0 = vec_unpackh(v_level_ss) ;
+        v_level_1 = vec_unpackl(v_level_ss) ;
+
+
+        // for(int ii=0; ii<8; ii++) { sign[ii] = (level[ii] < 0 ? -1 : 1) ;}
+        vector bool short v_level_cmplt0 ;
+        v_level_cmplt0 = vec_cmplt(v_level_ss, v_zeros) ;
+        v_sign_ss = vec_sel(v_pos1_ss, v_neg1, v_level_cmplt0) ;
+        v_sign_0 = vec_unpackh(v_sign_ss) ;
+        v_sign_1 = vec_unpackl(v_sign_ss) ;
+        
+        
+
+        // for(int ii=0; ii<8; ii++) { tmplevel[ii] = abs(level[ii]) * quantCoeff[blockpos+ii] ;}
+        v_level_0 = vec_abs(v_level_0) ;
+        v_level_1 = vec_abs(v_level_1) ;
+        v_quantCoeff_0 = vec_xl(0, &quantCoeff[blockpos]) ;
+        v_quantCoeff_1 = vec_xl(16, &quantCoeff[blockpos]) ;
+        
+        asm ("vmuluwm %0,%1,%2"
+              : "=v" (v_tmplevel_0)
+              : "v"  (v_level_0) , "v" (v_quantCoeff_0)
+            ) ;
+
+        asm ("vmuluwm %0,%1,%2"
+              : "=v" (v_tmplevel_1)
+              : "v"  (v_level_1) , "v" (v_quantCoeff_1)
+            ) ;
+
+
+
+        // for(int ii=0; ii<8; ii++) { level[ii] = ((tmplevel[ii] + add) >> qBits) ;}
+        v_level_0 = vec_sra(vec_add(v_tmplevel_0, v_add), v_qBits) ;
+        v_level_1 = vec_sra(vec_add(v_tmplevel_1, v_add), v_qBits) ;
+
+        // for(int ii=0; ii<8; ii++) { deltaU[blockpos+ii] = ((tmplevel[ii] - (level[ii] << qBits)) >> qBits8) ;} 
+        vector signed int v_temp_0_sw, v_temp_1_sw ;
+        v_temp_0_sw = vec_sl(v_level_0, v_qBits) ;
+        v_temp_1_sw = vec_sl(v_level_1, v_qBits) ;
+
+        v_temp_0_sw = vec_sub(v_tmplevel_0, v_temp_0_sw) ;
+        v_temp_1_sw = vec_sub(v_tmplevel_1, v_temp_1_sw) ;
+
+        v_temp_0_sw = vec_sra(v_temp_0_sw, v_qBits8) ;
+        v_temp_1_sw = vec_sra(v_temp_1_sw, v_qBits8) ;
+
+        vec_xst(v_temp_0_sw, 0, &deltaU[blockpos]) ;
+        vec_xst(v_temp_1_sw, 16, &deltaU[blockpos]) ;
+
+
+        // for(int ii=0; ii<8; ii++) { if(level[ii]) ++numSig ; }
+        vector bool int v_level_cmpeq0 ;
+        vector signed int v_level_inc ;
+        v_level_cmpeq0 = vec_cmpeq(v_level_0, (vector signed int)v_zeros) ;
+        v_level_inc = vec_sel(v_pos1_sw, (vector signed int)v_zeros, v_level_cmpeq0) ;
+        v_numSig = vec_add(v_numSig, v_level_inc) ;
+
+        v_level_cmpeq0 = vec_cmpeq(v_level_1, (vector signed int)v_zeros) ;
+        v_level_inc = vec_sel(v_pos1_sw, (vector signed int)v_zeros, v_level_cmpeq0) ;
+        v_numSig = vec_add(v_numSig, v_level_inc) ;
+
+
+        // for(int ii=0; ii<8; ii++) { level[ii] *= sign[ii]; }
+        asm ("vmuluwm %0,%1,%2"
+              : "=v" (v_level_0)
+              : "v"  (v_level_0) , "v" (v_sign_0)
+            ) ;
+
+        asm ("vmuluwm %0,%1,%2"
+              : "=v" (v_level_1)
+              : "v"  (v_level_1) , "v" (v_sign_1)
+            ) ;
+
+
+
+        // for(int ii=0; ii<8; ii++) {qCoef[blockpos+ii] = (int16_t)x265_clip3(-32768, 32767, level[ii]);}
+        vector bool int v_level_cmp_clip_high, v_level_cmp_clip_low ;
+
+        v_level_cmp_clip_high = vec_cmpgt(v_level_0, v_clip_high) ;
+        v_level_0 = vec_sel(v_level_0, v_clip_high, v_level_cmp_clip_high) ;
+        v_level_cmp_clip_low = vec_cmplt(v_level_0, v_clip_low) ;
+        v_level_0 = vec_sel(v_level_0, v_clip_low, v_level_cmp_clip_low) ;
+
+
+        v_level_cmp_clip_high = vec_cmpgt(v_level_1, v_clip_high) ;
+        v_level_1 = vec_sel(v_level_1, v_clip_high, v_level_cmp_clip_high) ;
+        v_level_cmp_clip_low = vec_cmplt(v_level_1, v_clip_low) ;
+        v_level_1 = vec_sel(v_level_1, v_clip_low, v_level_cmp_clip_low) ;
+
+        v_level_ss = vec_pack(v_level_0, v_level_1) ;
+
+        vec_xst(v_level_ss, 0, &qCoef[blockpos]) ;
+
+
+
+
+        // UNROLL ONCE MORE (which is ok since loops for multiple of 16 times, though that is NOT obvious to the compiler)
+        blockpos += 8 ;
+
+        // for(int ii=0; ii<8; ii++) { level[ii] = coef[blockpos+ii] ;}
+        v_level_ss = vec_xl(0, &coef[blockpos]) ;
+        v_level_0 = vec_unpackh(v_level_ss) ;
+        v_level_1 = vec_unpackl(v_level_ss) ;
+
+
+        // for(int ii=0; ii<8; ii++) { sign[ii] = (level[ii] < 0 ? -1 : 1) ;}
+        v_level_cmplt0 = vec_cmplt(v_level_ss, v_zeros) ;
+        v_sign_ss = vec_sel(v_pos1_ss, v_neg1, v_level_cmplt0) ;
+        v_sign_0 = vec_unpackh(v_sign_ss) ;
+        v_sign_1 = vec_unpackl(v_sign_ss) ;
+        
+        
+
+        // for(int ii=0; ii<8; ii++) { tmplevel[ii] = abs(level[ii]) * quantCoeff[blockpos+ii] ;}
+        v_level_0 = vec_abs(v_level_0) ;

 
@@ -0,0 +1,819 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Roger Moussalli <rmoussal@us.ibm.com>
+ *          Min Chen <min.chen@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+#include "contexts.h"   // costCoeffNxN_c
+#include "threading.h"  // CLZ
+#include "ppccommon.h"
+
+using namespace X265_NS;
+
+static uint32_t quant_altivec(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
+{
+
+    X265_CHECK(qBits >= 8, "qBits less than 8\n");
+
+    X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
+
+    int qBits8 = qBits - 8;
+    uint32_t numSig = 0;
+
+
+    int level[8] ;
+    int sign[8] ;
+    int tmplevel[8] ;
+
+    const vector signed short v_zeros = {0, 0, 0, 0, 0, 0, 0, 0} ;
+    const vector signed short v_neg1 = {-1, -1, -1, -1, -1, -1, -1, -1} ;
+    const vector signed short v_pos1_ss = {1, 1, 1, 1, 1, 1, 1, 1} ;
+    const vector signed int v_pos1_sw = {1, 1, 1, 1} ;
+
+    const vector signed int v_clip_high = {32767, 32767, 32767, 32767} ;
+    const vector signed int v_clip_low = {-32768, -32768, -32768, -32768} ;
+
+
+    vector signed short v_level_ss ;
+    vector signed int v_level_0, v_level_1 ;
+    vector signed int v_tmplevel_0, v_tmplevel_1 ;
+    vector signed short v_sign_ss ;
+    vector signed int v_sign_0, v_sign_1 ;
+    vector signed int v_quantCoeff_0, v_quantCoeff_1 ;
+
+    vector signed int v_numSig = {0, 0, 0, 0} ;
+
+    vector signed int v_add ;
+    v_add[0] = add ;
+    v_add = vec_splat(v_add, 0) ;
+
+    vector unsigned int v_qBits ;
+    v_qBits[0] = qBits ;
+    v_qBits = vec_splat(v_qBits, 0) ;
+
+    vector unsigned int v_qBits8 ;
+    v_qBits8[0] = qBits8 ;
+    v_qBits8 = vec_splat(v_qBits8, 0) ;
+
+
+    for (int blockpos_outer = 0; blockpos_outer < numCoeff; blockpos_outer+=16)
+    {
+        int blockpos = blockpos_outer ;
+
+        // for(int ii=0; ii<8; ii++) { level[ii] = coef[blockpos+ii] ;}
+        v_level_ss = vec_xl(0, &coef[blockpos]) ;
+        v_level_0 = vec_unpackh(v_level_ss) ;
+        v_level_1 = vec_unpackl(v_level_ss) ;
+
+
+        // for(int ii=0; ii<8; ii++) { sign[ii] = (level[ii] < 0 ? -1 : 1) ;}
+        vector bool short v_level_cmplt0 ;
+        v_level_cmplt0 = vec_cmplt(v_level_ss, v_zeros) ;
+        v_sign_ss = vec_sel(v_pos1_ss, v_neg1, v_level_cmplt0) ;
+        v_sign_0 = vec_unpackh(v_sign_ss) ;
+        v_sign_1 = vec_unpackl(v_sign_ss) ;
+        
+        
+
+        // for(int ii=0; ii<8; ii++) { tmplevel[ii] = abs(level[ii]) * quantCoeff[blockpos+ii] ;}
+        v_level_0 = vec_abs(v_level_0) ;
+        v_level_1 = vec_abs(v_level_1) ;
+        v_quantCoeff_0 = vec_xl(0, &quantCoeff[blockpos]) ;
+        v_quantCoeff_1 = vec_xl(16, &quantCoeff[blockpos]) ;
+        
+        asm ("vmuluwm %0,%1,%2"
+              : "=v" (v_tmplevel_0)
+              : "v"  (v_level_0) , "v" (v_quantCoeff_0)
+            ) ;
+
+        asm ("vmuluwm %0,%1,%2"
+              : "=v" (v_tmplevel_1)
+              : "v"  (v_level_1) , "v" (v_quantCoeff_1)
+            ) ;
+
+
+
+        // for(int ii=0; ii<8; ii++) { level[ii] = ((tmplevel[ii] + add) >> qBits) ;}
+        v_level_0 = vec_sra(vec_add(v_tmplevel_0, v_add), v_qBits) ;
+        v_level_1 = vec_sra(vec_add(v_tmplevel_1, v_add), v_qBits) ;
+
+        // for(int ii=0; ii<8; ii++) { deltaU[blockpos+ii] = ((tmplevel[ii] - (level[ii] << qBits)) >> qBits8) ;} 
+        vector signed int v_temp_0_sw, v_temp_1_sw ;
+        v_temp_0_sw = vec_sl(v_level_0, v_qBits) ;
+        v_temp_1_sw = vec_sl(v_level_1, v_qBits) ;
+
+        v_temp_0_sw = vec_sub(v_tmplevel_0, v_temp_0_sw) ;
+        v_temp_1_sw = vec_sub(v_tmplevel_1, v_temp_1_sw) ;
+
+        v_temp_0_sw = vec_sra(v_temp_0_sw, v_qBits8) ;
+        v_temp_1_sw = vec_sra(v_temp_1_sw, v_qBits8) ;
+
+        vec_xst(v_temp_0_sw, 0, &deltaU[blockpos]) ;
+        vec_xst(v_temp_1_sw, 16, &deltaU[blockpos]) ;
+
+
+        // for(int ii=0; ii<8; ii++) { if(level[ii]) ++numSig ; }
+        vector bool int v_level_cmpeq0 ;
+        vector signed int v_level_inc ;
+        v_level_cmpeq0 = vec_cmpeq(v_level_0, (vector signed int)v_zeros) ;
+        v_level_inc = vec_sel(v_pos1_sw, (vector signed int)v_zeros, v_level_cmpeq0) ;
+        v_numSig = vec_add(v_numSig, v_level_inc) ;
+
+        v_level_cmpeq0 = vec_cmpeq(v_level_1, (vector signed int)v_zeros) ;
+        v_level_inc = vec_sel(v_pos1_sw, (vector signed int)v_zeros, v_level_cmpeq0) ;
+        v_numSig = vec_add(v_numSig, v_level_inc) ;
+
+
+        // for(int ii=0; ii<8; ii++) { level[ii] *= sign[ii]; }
+        asm ("vmuluwm %0,%1,%2"
+              : "=v" (v_level_0)
+              : "v"  (v_level_0) , "v" (v_sign_0)
+            ) ;
+
+        asm ("vmuluwm %0,%1,%2"
+              : "=v" (v_level_1)
+              : "v"  (v_level_1) , "v" (v_sign_1)
+            ) ;
+
+
+
+        // for(int ii=0; ii<8; ii++) {qCoef[blockpos+ii] = (int16_t)x265_clip3(-32768, 32767, level[ii]);}
+        vector bool int v_level_cmp_clip_high, v_level_cmp_clip_low ;
+
+        v_level_cmp_clip_high = vec_cmpgt(v_level_0, v_clip_high) ;
+        v_level_0 = vec_sel(v_level_0, v_clip_high, v_level_cmp_clip_high) ;
+        v_level_cmp_clip_low = vec_cmplt(v_level_0, v_clip_low) ;
+        v_level_0 = vec_sel(v_level_0, v_clip_low, v_level_cmp_clip_low) ;
+
+
+        v_level_cmp_clip_high = vec_cmpgt(v_level_1, v_clip_high) ;
+        v_level_1 = vec_sel(v_level_1, v_clip_high, v_level_cmp_clip_high) ;
+        v_level_cmp_clip_low = vec_cmplt(v_level_1, v_clip_low) ;
+        v_level_1 = vec_sel(v_level_1, v_clip_low, v_level_cmp_clip_low) ;
+
+        v_level_ss = vec_pack(v_level_0, v_level_1) ;
+
+        vec_xst(v_level_ss, 0, &qCoef[blockpos]) ;
+
+
+
+
+        // UNROLL ONCE MORE (which is ok since loops for multiple of 16 times, though that is NOT obvious to the compiler)
+        blockpos += 8 ;
+
+        // for(int ii=0; ii<8; ii++) { level[ii] = coef[blockpos+ii] ;}
+        v_level_ss = vec_xl(0, &coef[blockpos]) ;
+        v_level_0 = vec_unpackh(v_level_ss) ;
+        v_level_1 = vec_unpackl(v_level_ss) ;
+
+
+        // for(int ii=0; ii<8; ii++) { sign[ii] = (level[ii] < 0 ? -1 : 1) ;}
+        v_level_cmplt0 = vec_cmplt(v_level_ss, v_zeros) ;
+        v_sign_ss = vec_sel(v_pos1_ss, v_neg1, v_level_cmplt0) ;
+        v_sign_0 = vec_unpackh(v_sign_ss) ;
+        v_sign_1 = vec_unpackl(v_sign_ss) ;
+        
+        
+
+        // for(int ii=0; ii<8; ii++) { tmplevel[ii] = abs(level[ii]) * quantCoeff[blockpos+ii] ;}
+        v_level_0 = vec_abs(v_level_0) ;
​

x265_2.2.tar.gz/source/common/ppc/intrapred_altivec.cpp Added

@@ -0,0 +1,30809 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Roger Moussalli <rmoussal@us.ibm.com>
+ *          Min Chen <min.chen@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include <iostream>
+#include <vector>
+#include <assert.h>
+#include <math.h>
+#include <cmath>
+#include <linux/types.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include <string.h>
+
+#include "common.h"
+#include "primitives.h"
+#include "x265.h"
+#include "ppccommon.h"
+
+//using namespace std ;
+namespace X265_NS {
+
+/* INTRA Prediction - altivec implementation */
+template<int width, int dirMode>
+void intra_pred(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter){};
+
+template<>
+void intra_pred<4, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
+{
+    if(dstStride == 4) {	
+        const vec_u8_t srcV = vec_xl(10, srcPix0); /* offset = width2+2 = width<<1 + 2*/
+        const vec_u8_t mask = {0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03,0x04, 0x02, 0x03,0x04,0x05, 0x03,0x04,0x05, 0x06}; 
+        vec_u8_t vout = vec_perm(srcV, srcV, mask);
+        vec_xst(vout, 0, dst); 
+    }
+    else if(dstStride%16 == 0){
+        vec_u8_t v0 = vec_xl(10, srcPix0);
+        vec_ste((vec_u32_t)v0, 0, (unsigned int*)dst);
+        vec_u8_t v1 = vec_xl(11, srcPix0);
+        vec_ste((vec_u32_t)v1, 0, (unsigned int*)(dst+dstStride));
+        vec_u8_t v2 = vec_xl(12, srcPix0);
+        vec_ste((vec_u32_t)v2, 0, (unsigned int*)(dst+dstStride*2));
+        vec_u8_t v3 = vec_xl(13, srcPix0);
+        vec_ste((vec_u32_t)v3, 0, (unsigned int*)(dst+dstStride*3));
+    }
+    else{
+        const vec_u8_t srcV = vec_xl(10, srcPix0); /* offset = width2+2 = width<<1 + 2*/
+        const vec_u8_t mask_0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
+        const vec_u8_t mask_1 = {0x01, 0x02, 0x03, 0x04, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
+        const vec_u8_t mask_2 = {0x02, 0x03, 0x04, 0x05, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
+        const vec_u8_t mask_3 = {0x03, 0x04, 0x05, 0x06, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
+        vec_u8_t v0 = vec_perm(srcV, vec_xl(0, dst), mask_0);
+        vec_xst(v0, 0, dst);
+        vec_u8_t v1 = vec_perm(srcV, vec_xl(dstStride, dst), mask_1);
+        vec_xst(v1, dstStride, dst);
+        vec_u8_t v2 = vec_perm(srcV, vec_xl(dstStride*2, dst), mask_2);
+        vec_xst(v2, dstStride*2, dst);
+        vec_u8_t v3 = vec_perm(srcV,  vec_xl(dstStride*3, dst), mask_3);
+        vec_xst(v3, dstStride*3, dst);
+    }
+#ifdef DEBUG
+        for (int y = 0; y < 4; y++)
+        {
+            for (int x = 0; x < 4; x++)
+            {
+                printf("%d ",dst[y * dstStride + x] );			
+            }
+            printf("\n");			
+        }
+        printf("\n\n");			
+#endif    	
+}
+
+template<>
+void intra_pred<8, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
+{
+    if(dstStride == 8) {	
+        const vec_u8_t srcV1 = vec_xl(18, srcPix0); /* offset = width2+2 = width<<1 + 2*/
+        const vec_u8_t mask_0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03,0x04, 0x05, 0x06, 0x07, 0x08};
+        const vec_u8_t mask_1 = {0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
+        const vec_u8_t mask_2 = {0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c};
+        const vec_u8_t mask_3 = {0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e};
+        vec_u8_t v0 = vec_perm(srcV1, srcV1, mask_0);
+        vec_u8_t v1 = vec_perm(srcV1, srcV1, mask_1);
+        vec_u8_t v2 = vec_perm(srcV1, srcV1, mask_2);
+        vec_u8_t v3 = vec_perm(srcV1, srcV1, mask_3);
+        vec_xst(v0, 0, dst);
+        vec_xst(v1, 16, dst); 
+        vec_xst(v2, 32, dst); 
+        vec_xst(v3, 48, dst); 
+    }
+    else{
+        //pixel *out = dst;	
+        const vec_u8_t srcV1 = vec_xl(18, srcPix0); /* offset = width2+2 = width<<1 + 2*/
+        const vec_u8_t mask_0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+        const vec_u8_t mask_1 = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+        const vec_u8_t mask_2 = {0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+        const vec_u8_t mask_3 = {0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+        const vec_u8_t mask_4 = {0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+        const vec_u8_t mask_5 = {0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+        const vec_u8_t mask_6 = {0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+        const vec_u8_t mask_7 = {0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+        vec_u8_t v0 = vec_perm(srcV1, vec_xl(0, dst), mask_0);
+        vec_xst(v0, 0, dst);
+        vec_u8_t v1 = vec_perm(srcV1, vec_xl(dstStride, dst), mask_1);
+        vec_xst(v1, dstStride, dst);
+        vec_u8_t v2 = vec_perm(srcV1, vec_xl(dstStride*2, dst), mask_2);
+        vec_xst(v2, dstStride*2, dst);
+        vec_u8_t v3 = vec_perm(srcV1,  vec_xl(dstStride*3, dst), mask_3);
+        vec_xst(v3, dstStride*3, dst);
+        vec_u8_t v4 = vec_perm(srcV1,  vec_xl(dstStride*4, dst), mask_4);
+        vec_xst(v4, dstStride*4, dst);
+        vec_u8_t v5 = vec_perm(srcV1,  vec_xl(dstStride*5, dst), mask_5);
+        vec_xst(v5, dstStride*5, dst);
+        vec_u8_t v6 = vec_perm(srcV1,  vec_xl(dstStride*6, dst), mask_6);
+        vec_xst(v6, dstStride*6, dst);
+        vec_u8_t v7 = vec_perm(srcV1,  vec_xl(dstStride*7, dst), mask_7);
+        vec_xst(v7, dstStride*7, dst);
+    }
+	
+#ifdef DEBUG
+        for (int y = 0; y < 8; y++)
+        {
+            for (int x = 0; x < 8; x++)
+            {
+                printf("%d ",dst[y * dstStride + x] );			
+            }
+            printf("\n");			
+        }
+        printf("\n\n");			
+#endif    	
+}
+
+template<>
+void intra_pred<16, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
+{
+    int i;
+    //int off = dstStride;	
+    //const pixel *srcPix = srcPix0;
+    for(i=0; i<16; i++){
+        vec_xst(	vec_xl(34+i, srcPix0), i*dstStride, dst); /* first offset = width2+2 = width<<1 + 2*/
+    }
+#ifdef DEBUG
+        for (int y = 0; y < 16; y++)
+        {
+            for (int x = 0; x <16; x++)
+            {
+                printf("%d ",dst[y * dstStride + x] );			
+            }
+            printf("\n");			
+        }
+        printf("\n\n");			
+#endif    	
+}
+
+template<>
+void intra_pred<32, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
+{
+    int i;
+    int off = dstStride;	
+    //const pixel *srcPix = srcPix0;
+    for(i=0; i<32; i++){
+        off = i*dstStride;		
+        vec_xst(	vec_xl(66+i, srcPix0), off, dst); /* first offset = width2+2 = width<<1 + 2*/
+        vec_xst(	vec_xl(82+i, srcPix0), off+16, dst); /* first offset = width2+2 = width<<1 + 2*/
+    }
+#ifdef DEBUG
+        for (int y = 0; y < 32; y++)
+        {
+            for (int x = 0; x <32; x++)
+            {
+                printf("%d ",dst[y * dstStride + x] );			
+            }
+            printf("\n");			
+        }
+        printf("\n\n");			
+#endif

 
@@ -0,0 +1,30809 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Roger Moussalli <rmoussal@us.ibm.com>
+ *          Min Chen <min.chen@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include <iostream>
+#include <vector>
+#include <assert.h>
+#include <math.h>
+#include <cmath>
+#include <linux/types.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include <string.h>
+
+#include "common.h"
+#include "primitives.h"
+#include "x265.h"
+#include "ppccommon.h"
+
+//using namespace std ;
+namespace X265_NS {
+
+/* INTRA Prediction - altivec implementation
+ *
+ * Primary template: an empty no-op. The explicit specializations
+ * intra_pred<width, dirMode> that follow provide the real predictors; a
+ * <width, dirMode> pair without a specialization resolves to this body and
+ * leaves dst untouched.
+ *
+ *   dst       - destination block (written by the specializations)
+ *   dstStride - offset between consecutive destination rows
+ *   srcPix0   - reference samples the prediction is built from
+ *   bFilter   - not used by this primary template
+ */
+template<int width, int dirMode>
+void intra_pred(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
+{
+}
+
+/* Intra prediction, direction mode 2, 4x4 block.
+ * Row y of dst receives the 4 bytes of srcPix0 found at byte offset
+ * 10 + y (10 = 2 * width + 2). Three store strategies are used, chosen by
+ * dstStride. bFilter is not referenced by this specialization. */
+template<>
+void intra_pred<4, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
+{
+    const int refStart = 10; /* 2 * width + 2 */
+
+    if (dstStride == 4)
+    {
+        /* The four rows are contiguous: assemble all 16 output bytes with
+         * one permute and write them with a single store. */
+        const vec_u8_t ref = vec_xl(refStart, srcPix0);
+        const vec_u8_t blockMask = {0x00, 0x01, 0x02, 0x03,
+                                    0x01, 0x02, 0x03, 0x04,
+                                    0x02, 0x03, 0x04, 0x05,
+                                    0x03, 0x04, 0x05, 0x06};
+        vec_xst(vec_perm(ref, ref, blockMask), 0, dst);
+    }
+    else if (dstStride % 16 == 0)
+    {
+        /* One 32-bit element store per row.
+         * NOTE(review): vec_ste picks the element to store from the target
+         * address, so this presumably expects dst to be 16-byte aligned --
+         * confirm against the callers. */
+        for (int row = 0; row < 4; ++row)
+        {
+            const vec_u8_t ref = vec_xl(refStart + row, srcPix0);
+            vec_ste((vec_u32_t)ref, 0, (unsigned int*)(dst + dstStride * row));
+        }
+    }
+    else
+    {
+        /* Read-modify-write: load 16 bytes at each row start, replace the
+         * first 4 with reference bytes row..row+3 and keep bytes 4..15 of
+         * the loaded dst data (permute indices 0x14..0x1f), then store all
+         * 16 bytes back. Rows are processed strictly top to bottom.
+         * NOTE(review): this touches 12 bytes past each 4-pixel row;
+         * presumably the destination buffer is large enough -- confirm. */
+        const vec_u8_t ref = vec_xl(refStart, srcPix0);
+        const vec_u8_t rowMask[4] = {
+            {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f},
+            {0x01, 0x02, 0x03, 0x04, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f},
+            {0x02, 0x03, 0x04, 0x05, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f},
+            {0x03, 0x04, 0x05, 0x06, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}
+        };
+        for (int row = 0; row < 4; ++row)
+        {
+            const intptr_t rowOff = dstStride * row;
+            const vec_u8_t merged = vec_perm(ref, vec_xl(rowOff, dst), rowMask[row]);
+            vec_xst(merged, rowOff, dst);
+        }
+    }
+
+#ifdef DEBUG
+    /* dump the predicted block, one text line per row */
+    for (int row = 0; row < 4; ++row)
+    {
+        for (int col = 0; col < 4; ++col)
+        {
+            printf("%d ", dst[row * dstStride + col]);
+        }
+        printf("\n");
+    }
+    printf("\n\n");
+#endif
+}
+
+/* Intra prediction, direction mode 2, 8x8 block.
+ * Row y of dst receives the 8 bytes of srcPix0 found at byte offset
+ * 18 + y (18 = 2 * width + 2). bFilter is not referenced by this
+ * specialization. */
+template<>
+void intra_pred<8, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
+{
+    const int refStart = 18; /* 2 * width + 2 */
+    const vec_u8_t ref = vec_xl(refStart, srcPix0);
+
+    if (dstStride == 8)
+    {
+        /* Rows are contiguous, so one 16-byte vector holds two rows:
+         * pairMask[k] gathers rows 2k and 2k+1, stored at byte 16 * k. */
+        const vec_u8_t pairMask[4] = {
+            {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08},
+            {0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a},
+            {0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c},
+            {0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e}
+        };
+        for (int pair = 0; pair < 4; ++pair)
+        {
+            vec_xst(vec_perm(ref, ref, pairMask[pair]), 16 * pair, dst);
+        }
+    }
+    else
+    {
+        /* Read-modify-write: load 16 bytes at each row start, replace the
+         * first 8 with reference bytes row..row+7 and keep bytes 8..15 of
+         * the loaded dst data (permute indices 0x18..0x1f), then store all
+         * 16 bytes back. Rows are processed strictly top to bottom.
+         * NOTE(review): this touches 8 bytes past each 8-pixel row;
+         * presumably the destination buffer is large enough -- confirm. */
+        const vec_u8_t rowMask[8] = {
+            {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f},
+            {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f},
+            {0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f},
+            {0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f},
+            {0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f},
+            {0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f},
+            {0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f},
+            {0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}
+        };
+        for (int row = 0; row < 8; ++row)
+        {
+            const intptr_t rowOff = dstStride * row;
+            const vec_u8_t merged = vec_perm(ref, vec_xl(rowOff, dst), rowMask[row]);
+            vec_xst(merged, rowOff, dst);
+        }
+    }
+
+#ifdef DEBUG
+    /* dump the predicted block, one text line per row */
+    for (int row = 0; row < 8; ++row)
+    {
+        for (int col = 0; col < 8; ++col)
+        {
+            printf("%d ", dst[row * dstStride + col]);
+        }
+        printf("\n");
+    }
+    printf("\n\n");
+#endif
+}
+
+/* Intra prediction, direction mode 2, 16x16 block.
+ * Row y of dst is filled with the 16 bytes read from srcPix0 at byte offset
+ * 34 + y (34 = 2 * width + 2), i.e. every row is the previous one advanced
+ * by one reference byte. bFilter is not referenced by this specialization.
+ * NOTE(review): offsets are byte offsets, so this presumably relies on an
+ * 8-bit pixel type -- confirm. */
+template<>
+void intra_pred<16, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
+{
+    const int blkSize = 16;
+    const int refStart = 2 * blkSize + 2; /* = 34 */
+
+    for (int row = 0; row < blkSize; ++row)
+    {
+        const vec_u8_t refRow = vec_xl(refStart + row, srcPix0);
+        vec_xst(refRow, row * dstStride, dst);
+    }
+
+#ifdef DEBUG
+    /* dump the predicted block, one text line per row */
+    for (int row = 0; row < blkSize; ++row)
+    {
+        for (int col = 0; col < blkSize; ++col)
+        {
+            printf("%d ", dst[row * dstStride + col]);
+        }
+        printf("\n");
+    }
+    printf("\n\n");
+#endif
+}
+
+template<>
+void intra_pred<32, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
+{
+    int i;
+    int off = dstStride;   
+    //const pixel *srcPix = srcPix0;
+    for(i=0; i<32; i++){
+        off = i*dstStride;     
+        vec_xst(   vec_xl(66+i, srcPix0), off, dst); /* first offset = width2+2 = width<<1 + 2*/
+        vec_xst(   vec_xl(82+i, srcPix0), off+16, dst); /* first offset = width2+2 = width<<1 + 2*/
+    }
+#ifdef DEBUG
+        for (int y = 0; y < 32; y++)
+        {
+            for (int x = 0; x <32; x++)
+            {
+                printf("%d ",dst[y * dstStride + x] );         
+            }
+            printf("\n");          
+        }
+        printf("\n\n");            
+#endif     
​

x265_2.2.tar.gz/source/common/ppc/ipfilter_altivec.cpp Added

@@ -0,0 +1,1522 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Roger Moussalli <rmoussal@us.ibm.com>
+ *          Min Chen <min.chen@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include <iostream>
+#include "common.h"
+#include "primitives.h"
+#include "ppccommon.h"
+
+using namespace X265_NS;
+
+// multiply_pixel_coeff: vector form of the scalar loop
+//     for(col=0; col<16; col++) {sum[col] = src[src_offset+col] * c;}
+// Arguments:
+//     v_sum_0..v_sum_3 : vector int outputs, overwritten with the products
+//     src, src_offset  : pixel pointer and byte offset of the 16 pixels to load
+//     v_coeff          : vector signed short holding the filter coefficient
+// The 16 pixels are widened to 16 bits; v_sum_0/v_sum_1 take the vec_mule/
+// vec_mulo products of the vec_unpackh half, v_sum_2/v_sum_3 those of the
+// vec_unpackl half.
+#define multiply_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, src_offset, v_coeff) \
+{ \
+    /* 0x00FF per lane: strips the sign extension added by vec_unpackh/l */ \
+    const vector signed short mpc_low_byte = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ; \
+\
+    /* fetch 16 pixels */ \
+    const vector unsigned char mpc_pixels = vec_xl(src_offset, src) ; \
+\
+    /* widen 8-bit pixels to 16 bits, keeping them unsigned in value */ \
+    const vector signed short mpc_half_h = vec_and(vec_unpackh((vector signed char)mpc_pixels), mpc_low_byte) ; \
+    const vector signed short mpc_half_l = vec_and(vec_unpackl((vector signed char)mpc_pixels), mpc_low_byte) ; \
+\
+    /* 16x16 -> 32-bit products, even and odd lanes separately */ \
+    v_sum_0 = vec_mule(mpc_half_h, v_coeff) ; \
+    v_sum_1 = vec_mulo(mpc_half_h, v_coeff) ; \
+    v_sum_2 = vec_mule(mpc_half_l, v_coeff) ; \
+    v_sum_3 = vec_mulo(mpc_half_l, v_coeff) ; \
+} // end multiply_pixel_coeff()
+
+
+// multiply_accumulate_pixel_coeff: vector form of the scalar loop
+//     for(col=0; col<16; col++) {sum[col] += src[src_offset+col] * c;}
+// Arguments:
+//     v_sum_0..v_sum_3 : vector int accumulators, updated in place
+//     src, src_offset  : pixel pointer and byte offset of the 16 pixels to load
+//     v_coeff          : vector signed short holding the filter coefficient
+// Lane layout matches multiply_pixel_coeff: v_sum_0/v_sum_1 accumulate the
+// vec_mule/vec_mulo products of the vec_unpackh half, v_sum_2/v_sum_3 those
+// of the vec_unpackl half.
+#define multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, src_offset, v_coeff) \
+{ \
+    /* 0x00FF per lane: strips the sign extension added by vec_unpackh/l */ \
+    const vector signed short mac_low_byte = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ; \
+\
+    /* fetch 16 pixels */ \
+    const vector unsigned char mac_pixels = vec_xl(src_offset, src) ; \
+\
+    /* widen 8-bit pixels to 16 bits, keeping them unsigned in value */ \
+    const vector signed short mac_half_h = vec_and(vec_unpackh((vector signed char)mac_pixels), mac_low_byte) ; \
+    const vector signed short mac_half_l = vec_and(vec_unpackl((vector signed char)mac_pixels), mac_low_byte) ; \
+\
+    /* multiply by the coefficient and add to the running sums */ \
+    v_sum_0 = vec_add(v_sum_0, vec_mule(mac_half_h, v_coeff)) ; \
+    v_sum_1 = vec_add(v_sum_1, vec_mulo(mac_half_h, v_coeff)) ; \
+    v_sum_2 = vec_add(v_sum_2, vec_mule(mac_half_l, v_coeff)) ; \
+    v_sum_3 = vec_add(v_sum_3, vec_mulo(mac_half_l, v_coeff)) ; \
+} // end multiply_accumulate_pixel_coeff()
+
+
+
+#if 0
+//ORIGINAL
+// Works with the following values:
+// N = 8
+// width >= 16 (multiple of 16)
+// any height
+template<int N, int width, int height>
+void interp_vert_pp_altivec(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
+{
+
+
+    const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    const int shift = IF_FILTER_PREC;
+    const int offset = 1 << (shift - 1);
+    const uint16_t maxVal = (1 << X265_DEPTH) - 1;
+
+    src -= (N / 2 - 1) * srcStride;
+
+
+    // Vector to hold replicated shift amount
+    const vector unsigned int v_shift = {shift, shift, shift, shift} ;
+
+    // Vector to hold replicated offset
+    const vector int v_offset = {offset, offset, offset, offset} ;
+
+    // Vector to hold replicated maxVal
+    const vector signed short v_maxVal = {maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal} ;
+
+
+    // Vector to hold replicated coefficients (one coefficient replicated per vector)
+    vector signed short v_coeff_0, v_coeff_1, v_coeff_2, v_coeff_3, v_coeff_4, v_coeff_5, v_coeff_6, v_coeff_7 ;
+    vector signed short v_coefficients = vec_xl(0, c) ; // load all coefficients into one vector
+    
+    // Replicate the coefficients into respective vectors
+    v_coeff_0 = vec_splat(v_coefficients, 0) ;
+    v_coeff_1 = vec_splat(v_coefficients, 1) ;
+    v_coeff_2 = vec_splat(v_coefficients, 2) ;
+    v_coeff_3 = vec_splat(v_coefficients, 3) ;
+    v_coeff_4 = vec_splat(v_coefficients, 4) ;
+    v_coeff_5 = vec_splat(v_coefficients, 5) ;
+    v_coeff_6 = vec_splat(v_coefficients, 6) ;
+    v_coeff_7 = vec_splat(v_coefficients, 7) ;
+
+    
+
+    int row, ocol, col;
+    for (row = 0; row < height; row++)
+    {
+        for (ocol = 0; ocol < width; ocol+=16)
+        {
+
+
+           // int sum[16] ;
+           // int16_t val[16] ;
+
+           // --> for(col=0; col<16; col++) {sum[col]  = src[ocol+col + 1 * srcStride] * c[0];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 1 * srcStride] * c[1];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 2 * srcStride] * c[2];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 3 * srcStride] * c[3];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 4 * srcStride] * c[4];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 5 * srcStride] * c[5];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 6 * srcStride] * c[6];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 7 * srcStride] * c[7];}
+
+
+	        vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;
+            vector signed short v_val_0, v_val_1 ;
+
+
+
+            multiply_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol, v_coeff_0) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 1 * srcStride, v_coeff_1) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 2 * srcStride, v_coeff_2) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 3 * srcStride, v_coeff_3) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 4 * srcStride, v_coeff_4) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 5 * srcStride, v_coeff_5) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 6 * srcStride, v_coeff_6) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 7 * srcStride, v_coeff_7) ;
+
+
+
+
+
+            // --> for(col=0; col<16; col++) {val[col] = (int16_t)((sum[col] + offset) >> shift);}
+            // Add offset
+            v_sum_0 = vec_add(v_sum_0, v_offset) ;
+            v_sum_1 = vec_add(v_sum_1, v_offset) ;
+            v_sum_2 = vec_add(v_sum_2, v_offset) ;
+            v_sum_3 = vec_add(v_sum_3, v_offset) ;
+            // Shift right by "shift"
+            v_sum_0 = vec_sra(v_sum_0, v_shift) ;
+            v_sum_1 = vec_sra(v_sum_1, v_shift) ;
+            v_sum_2 = vec_sra(v_sum_2, v_shift) ;
+            v_sum_3 = vec_sra(v_sum_3, v_shift) ;
+
+            // Pack into 16-bit numbers
+            v_val_0 = vec_pack(v_sum_0, v_sum_2) ;
+            v_val_1 = vec_pack(v_sum_1, v_sum_3) ;
+
+
+            
+            // --> for(col=0; col<16; col++) {val[col] = (val[col] < 0) ? 0 : val[col];}
+            vector bool short v_comp_zero_0, v_comp_zero_1 ;
+            vector signed short v_max_masked_0, v_max_masked_1 ;
+            vector signed short zeros16 = {0,0,0,0,0,0,0,0} ;
+            // Compute less than 0
+            v_comp_zero_0 = vec_cmplt(v_val_0, zeros16) ;
+            v_comp_zero_1 = vec_cmplt(v_val_1, zeros16) ;
+            // Keep values that are greater or equal to 0
+            v_val_0 = vec_andc(v_val_0, v_comp_zero_0) ;
+            v_val_1 = vec_andc(v_val_1, v_comp_zero_1) ;

 
@@ -0,0 +1,1522 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Roger Moussalli <rmoussal@us.ibm.com>
+ *          Min Chen <min.chen@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include <iostream>
+#include "common.h"
+#include "primitives.h"
+#include "ppccommon.h"
+
+using namespace X265_NS;
+
+// multiply_pixel_coeff: vector form of the scalar loop
+//     for(col=0; col<16; col++) {sum[col] = src[src_offset+col] * c;}
+// Arguments:
+//     v_sum_0..v_sum_3 : vector int outputs, overwritten with the products
+//     src, src_offset  : pixel pointer and byte offset of the 16 pixels to load
+//     v_coeff          : vector signed short holding the filter coefficient
+// The 16 pixels are widened to 16 bits; v_sum_0/v_sum_1 take the vec_mule/
+// vec_mulo products of the vec_unpackh half, v_sum_2/v_sum_3 those of the
+// vec_unpackl half.
+#define multiply_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, src_offset, v_coeff) \
+{ \
+    /* 0x00FF per lane: strips the sign extension added by vec_unpackh/l */ \
+    const vector signed short mpc_low_byte = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ; \
+\
+    /* fetch 16 pixels */ \
+    const vector unsigned char mpc_pixels = vec_xl(src_offset, src) ; \
+\
+    /* widen 8-bit pixels to 16 bits, keeping them unsigned in value */ \
+    const vector signed short mpc_half_h = vec_and(vec_unpackh((vector signed char)mpc_pixels), mpc_low_byte) ; \
+    const vector signed short mpc_half_l = vec_and(vec_unpackl((vector signed char)mpc_pixels), mpc_low_byte) ; \
+\
+    /* 16x16 -> 32-bit products, even and odd lanes separately */ \
+    v_sum_0 = vec_mule(mpc_half_h, v_coeff) ; \
+    v_sum_1 = vec_mulo(mpc_half_h, v_coeff) ; \
+    v_sum_2 = vec_mule(mpc_half_l, v_coeff) ; \
+    v_sum_3 = vec_mulo(mpc_half_l, v_coeff) ; \
+} // end multiply_pixel_coeff()
+
+
+// multiply_accumulate_pixel_coeff: vector form of the scalar loop
+//     for(col=0; col<16; col++) {sum[col] += src[src_offset+col] * c;}
+// Arguments:
+//     v_sum_0..v_sum_3 : vector int accumulators, updated in place
+//     src, src_offset  : pixel pointer and byte offset of the 16 pixels to load
+//     v_coeff          : vector signed short holding the filter coefficient
+// Lane layout matches multiply_pixel_coeff: v_sum_0/v_sum_1 accumulate the
+// vec_mule/vec_mulo products of the vec_unpackh half, v_sum_2/v_sum_3 those
+// of the vec_unpackl half.
+#define multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, src_offset, v_coeff) \
+{ \
+    /* 0x00FF per lane: strips the sign extension added by vec_unpackh/l */ \
+    const vector signed short mac_low_byte = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ; \
+\
+    /* fetch 16 pixels */ \
+    const vector unsigned char mac_pixels = vec_xl(src_offset, src) ; \
+\
+    /* widen 8-bit pixels to 16 bits, keeping them unsigned in value */ \
+    const vector signed short mac_half_h = vec_and(vec_unpackh((vector signed char)mac_pixels), mac_low_byte) ; \
+    const vector signed short mac_half_l = vec_and(vec_unpackl((vector signed char)mac_pixels), mac_low_byte) ; \
+\
+    /* multiply by the coefficient and add to the running sums */ \
+    v_sum_0 = vec_add(v_sum_0, vec_mule(mac_half_h, v_coeff)) ; \
+    v_sum_1 = vec_add(v_sum_1, vec_mulo(mac_half_h, v_coeff)) ; \
+    v_sum_2 = vec_add(v_sum_2, vec_mule(mac_half_l, v_coeff)) ; \
+    v_sum_3 = vec_add(v_sum_3, vec_mulo(mac_half_l, v_coeff)) ; \
+} // end multiply_accumulate_pixel_coeff()
+
+
+
+#if 0
+//ORIGINAL
+// Works with the following values:
+// N = 8
+// width >= 16 (multiple of 16)
+// any height
+template<int N, int width, int height>
+void interp_vert_pp_altivec(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
+{
+
+
+    const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
+    const int shift = IF_FILTER_PREC;
+    const int offset = 1 << (shift - 1);
+    const uint16_t maxVal = (1 << X265_DEPTH) - 1;
+
+    src -= (N / 2 - 1) * srcStride;
+
+
+    // Vector to hold replicated shift amount
+    const vector unsigned int v_shift = {shift, shift, shift, shift} ;
+
+    // Vector to hold replicated offset
+    const vector int v_offset = {offset, offset, offset, offset} ;
+
+    // Vector to hold replicated maxVal
+    const vector signed short v_maxVal = {maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal} ;
+
+
+    // Vector to hold replicated coefficients (one coefficient replicated per vector)
+    vector signed short v_coeff_0, v_coeff_1, v_coeff_2, v_coeff_3, v_coeff_4, v_coeff_5, v_coeff_6, v_coeff_7 ;
+    vector signed short v_coefficients = vec_xl(0, c) ; // load all coefficients into one vector
+    
+    // Replicate the coefficients into respective vectors
+    v_coeff_0 = vec_splat(v_coefficients, 0) ;
+    v_coeff_1 = vec_splat(v_coefficients, 1) ;
+    v_coeff_2 = vec_splat(v_coefficients, 2) ;
+    v_coeff_3 = vec_splat(v_coefficients, 3) ;
+    v_coeff_4 = vec_splat(v_coefficients, 4) ;
+    v_coeff_5 = vec_splat(v_coefficients, 5) ;
+    v_coeff_6 = vec_splat(v_coefficients, 6) ;
+    v_coeff_7 = vec_splat(v_coefficients, 7) ;
+
+    
+
+    int row, ocol, col;
+    for (row = 0; row < height; row++)
+    {
+        for (ocol = 0; ocol < width; ocol+=16)
+        {
+
+
+           // int sum[16] ;
+           // int16_t val[16] ;
+
+           // --> for(col=0; col<16; col++) {sum[col]  = src[ocol+col + 1 * srcStride] * c[0];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 1 * srcStride] * c[1];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 2 * srcStride] * c[2];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 3 * srcStride] * c[3];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 4 * srcStride] * c[4];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 5 * srcStride] * c[5];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 6 * srcStride] * c[6];}
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 7 * srcStride] * c[7];}
+
+
+           vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;
+            vector signed short v_val_0, v_val_1 ;
+
+
+
+            multiply_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol, v_coeff_0) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 1 * srcStride, v_coeff_1) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 2 * srcStride, v_coeff_2) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 3 * srcStride, v_coeff_3) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 4 * srcStride, v_coeff_4) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 5 * srcStride, v_coeff_5) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 6 * srcStride, v_coeff_6) ;
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 7 * srcStride, v_coeff_7) ;
+
+
+
+
+
+            // --> for(col=0; col<16; col++) {val[col] = (int16_t)((sum[col] + offset) >> shift);}
+            // Add offset
+            v_sum_0 = vec_add(v_sum_0, v_offset) ;
+            v_sum_1 = vec_add(v_sum_1, v_offset) ;
+            v_sum_2 = vec_add(v_sum_2, v_offset) ;
+            v_sum_3 = vec_add(v_sum_3, v_offset) ;
+            // Shift right by "shift"
+            v_sum_0 = vec_sra(v_sum_0, v_shift) ;
+            v_sum_1 = vec_sra(v_sum_1, v_shift) ;
+            v_sum_2 = vec_sra(v_sum_2, v_shift) ;
+            v_sum_3 = vec_sra(v_sum_3, v_shift) ;
+
+            // Pack into 16-bit numbers
+            v_val_0 = vec_pack(v_sum_0, v_sum_2) ;
+            v_val_1 = vec_pack(v_sum_1, v_sum_3) ;
+
+
+            
+            // --> for(col=0; col<16; col++) {val[col] = (val[col] < 0) ? 0 : val[col];}
+            vector bool short v_comp_zero_0, v_comp_zero_1 ;
+            vector signed short v_max_masked_0, v_max_masked_1 ;
+            vector signed short zeros16 = {0,0,0,0,0,0,0,0} ;
+            // Compute less than 0
+            v_comp_zero_0 = vec_cmplt(v_val_0, zeros16) ;
+            v_comp_zero_1 = vec_cmplt(v_val_1, zeros16) ;
+            // Keep values that are greater or equal to 0
+            v_val_0 = vec_andc(v_val_0, v_comp_zero_0) ;
+            v_val_1 = vec_andc(v_val_1, v_comp_zero_1) ;
​

x265_2.2.tar.gz/source/common/ppc/pixel_altivec.cpp Added

@@ -0,0 +1,4321 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Mandar Gurav <mandar@multicorewareinc.com>
+ *          Mahesh Pittala <mahesh@multicorewareinc.com>
+ *          Min Chen <min.chen@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+#include "x265.h"
+#include "ppccommon.h"
+
+#include <cstdlib> // abs()
+
+//using namespace X265_NS;
+
+namespace X265_NS {
+// NOTE: the definitions below are placed directly in X265_NS; no anonymous namespace is opened here
+
+ /* Null vector: LOAD_ZERO declares a local all-zero byte vector named zerov.
+  * The zero_* macros below reinterpret zerov as another vector type, so
+  * they can only be used in a scope where LOAD_ZERO has been expanded. */
+#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
+
+#define zero_u8v  (vec_u8_t)  zerov
+#define zero_s8v  (vec_s8_t)  zerov
+#define zero_u16v (vec_u16_t) zerov
+#define zero_s16v (vec_s16_t) zerov
+#define zero_u32v (vec_u32_t) zerov
+#define zero_s32v (vec_s32_t) zerov
+
+ /* 8 <-> 16 bits conversions: interleave the bytes of v with zero bytes
+  * (vec_mergeh for the _h variants, vec_mergel for the _l variants) and
+  * reinterpret the result as 16-bit elements. The operand order of the
+  * merge is swapped between the WORDS_BIGENDIAN build and the other build.
+  * These macros expand zero_u8v, so they also need LOAD_ZERO in scope. */
+#ifdef WORDS_BIGENDIAN
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
+#else
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
+#endif
+ /* The unsuffixed forms are aliases for the _h variants. */
+#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
+#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
+ /* Alignment helpers: declare a variable var of type T aligned to 8, 16 or
+  * 32 bytes. Only the __GNUC__ and _MSC_VER spellings are provided; with
+  * any other compiler the ALIGN_VAR_* macros stay undefined. */
+#if defined(__GNUC__)
+#define ALIGN_VAR_8(T, var)  T var __attribute__((aligned(8)))
+#define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
+#define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
+#define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
+#define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
+#endif // if defined(__GNUC__)
+ /* NOTE(review): pixel is pinned to uint8_t here; presumably this has to
+  * agree with the project-wide pixel type -- confirm for builds that use a
+  * wider pixel. sum_t / sum2_t are 16- and 32-bit unsigned accumulator
+  * types, and BITS_PER_SUM is the width of sum_t in bits. */
+typedef uint8_t  pixel;
+typedef uint32_t sum2_t ;
+typedef uint16_t sum_t ;
+#define BITS_PER_SUM (8 * sizeof(sum_t))
+
+/***********************************************************************
+ * SAD routines - altivec implementation
+ **********************************************************************/
+/* Reduction step of the SAD routines, primary template.
+ * This generic version is an empty no-op: it ignores sumv and does not
+ * write *sum. The explicit <lx, ly> specializations supply the actual
+ * reduction of the partial sums in sumv into *sum. */
+template<int lx, int ly>
+inline void sum_columns_altivec(vec_s32_t sumv, int* sum)
+{
+}
+
+/* Sum of absolute differences between two lx x ly blocks of 8-bit pixels.
+ *
+ *   pix1, stride_pix1 - first block and the step between its rows
+ *   pix2, stride_pix2 - second block and the step between its rows
+ *   returns           - the value written by sum_columns_altivec<lx, ly>
+ *
+ * Every row is processed as one full 16-byte vector (hence lx <= 16);
+ * sum_columns_altivec<lx, ly> then reduces the four 32-bit partial sums to
+ * the final value. The loads always read 16 bytes per row regardless of lx.
+ */
+template<int lx, int ly>
+int inline sad16_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+{
+    assert(lx <=16);
+    LOAD_ZERO;
+    vec_s32_t sumv = zero_s32v;
+    /* Start from 0: the primary sum_columns_altivec template is an empty
+     * no-op, so without this the function would return an indeterminate
+     * value for any <lx, ly> that has no specialization. */
+    ALIGN_VAR_16(int, sum ) = 0;
+
+    for( int y = 0; y < ly; y++ )
+    {
+        const vec_u8_t pix1v = vec_xl( 0, pix1);
+        const vec_u8_t pix2v = vec_xl( 0, pix2);
+
+        /* |a - b| on unsigned bytes, computed as max(a, b) - min(a, b) */
+        const vec_u8_t absv = (vector unsigned char)vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));
+
+        /* add each group of four byte differences into its 32-bit lane */
+        sumv = (vec_s32_t) vec_sum4s( absv, (vec_u32_t) sumv);
+
+        pix1 += stride_pix1;
+        pix2 += stride_pix2;
+    }
+
+    sum_columns_altivec<lx, ly>(sumv, &sum);
+    return sum;
+}
+
+template<int lx, int ly> //to be implemented later
+int sad16_altivec(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
+{
+    int sum = 0;
+    return sum;
+}
+
+template<int lx, int ly>//to be implemented later
+int sad_altivec(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
+{
+    int sum = 0;
+    return sum;
+}
+
+/* Shared reduction for the 16-pixel-wide SAD blocks: add up all four 32-bit
+ * partial sums held in sumv and store the total in *sum.
+ *
+ *   vec_sums  - adds the four lanes; the result is then taken from lane 3
+ *   vec_splat - copies lane 3 into every lane
+ *   vec_ste   - stores one 32-bit element to sum; since all lanes are equal
+ *               after the splat, the same total is written whichever lane
+ *               vec_ste selects
+ */
+static inline void sum_all_lanes_altivec(vec_s32_t sumv, int* sum)
+{
+    LOAD_ZERO;
+    sumv = vec_sums( sumv, zero_s32v );
+    sumv = vec_splat( sumv, 3 );
+    vec_ste( sumv, 0, sum );
+}
+
+/* sum_columns_altivec<16, ly>: for a 16-wide block every lane of sumv is
+ * part of the result, so each height below uses the same full reduction. */
+template<>
+void inline sum_columns_altivec<16, 4>(vec_s32_t sumv, int* sum)
+{
+    sum_all_lanes_altivec(sumv, sum);
+}
+
+template<>
+void inline sum_columns_altivec<16, 8>(vec_s32_t sumv, int* sum)
+{
+    sum_all_lanes_altivec(sumv, sum);
+}
+
+template<>
+void inline sum_columns_altivec<16, 12>(vec_s32_t sumv, int* sum)
+{
+    sum_all_lanes_altivec(sumv, sum);
+}
+
+template<>
+void inline sum_columns_altivec<16, 16>(vec_s32_t sumv, int* sum)
+{
+    sum_all_lanes_altivec(sumv, sum);
+}
+
+template<>
+void inline sum_columns_altivec<16, 24>(vec_s32_t sumv, int* sum)
+{
+    sum_all_lanes_altivec(sumv, sum);
+}
+
+template<>
+void inline sum_columns_altivec<16, 32>(vec_s32_t sumv, int* sum)
+{
+    sum_all_lanes_altivec(sumv, sum);
+}
+
+template<>
+void inline sum_columns_altivec<16, 48>(vec_s32_t sumv, int* sum)

 
@@ -0,0 +1,4321 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Mandar Gurav <mandar@multicorewareinc.com>
+ *          Mahesh Pittala <mahesh@multicorewareinc.com>
+ *          Min Chen <min.chen@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+#include "x265.h"
+#include "ppccommon.h"
+
+#include <cstdlib> // abs()
+
+//using namespace X265_NS;
+
+namespace X265_NS {
+// place functions in anonymous namespace (file static)
+
+ /* Null vector */
+#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
+
+#define zero_u8v  (vec_u8_t)  zerov
+#define zero_s8v  (vec_s8_t)  zerov
+#define zero_u16v (vec_u16_t) zerov
+#define zero_s16v (vec_s16_t) zerov
+#define zero_u32v (vec_u32_t) zerov
+#define zero_s32v (vec_s32_t) zerov
+
+ /* 8 <-> 16 bits conversions */
+#ifdef WORDS_BIGENDIAN
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
+#else
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
+#endif
+
+#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
+#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
+
+#if defined(__GNUC__)
+#define ALIGN_VAR_8(T, var)  T var __attribute__((aligned(8)))
+#define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
+#define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
+#define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
+#define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
+#endif // if defined(__GNUC__)
+
+typedef uint8_t  pixel;
+typedef uint32_t sum2_t ;
+typedef uint16_t sum_t ;
+#define BITS_PER_SUM (8 * sizeof(sum_t))
+
+/***********************************************************************
+ * SAD routines - altivec implementation
+ **********************************************************************/
+template<int lx, int ly>
+void inline sum_columns_altivec(vec_s32_t sumv, int* sum){}
+
+template<int lx, int ly>
+int inline sad16_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+{
+    assert(lx <=16);
+    LOAD_ZERO;
+    vec_u8_t  pix1v, pix2v;
+    vec_u8_t  absv = zero_u8v;
+    vec_s32_t sumv = zero_s32v;
+    ALIGN_VAR_16(int, sum );
+
+    for( int y = 0; y < ly; y++ )
+    {
+        pix1v = /*vec_vsx_ld*/vec_xl( 0, pix1);
+        pix2v = /*vec_vsx_ld*/vec_xl( 0, pix2);
+        //print_vec_u8("pix1v", &pix1v);
+        //print_vec_u8("pix2v", &pix2v);
+
+        absv = (vector unsigned char)vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v)); 
+        //print_vec_u8("abs sub", &absv);
+
+        sumv = (vec_s32_t) vec_sum4s( absv, (vec_u32_t) sumv);
+        //print_vec_i("vec_sum4s 0", &sumv);
+
+        pix1 += stride_pix1;
+        pix2 += stride_pix2;
+    }
+
+    sum_columns_altivec<lx, ly>(sumv, &sum);
+    //printf("<%d %d>%d\n", lx, ly, sum);
+    return sum;
+}
+
+template<int lx, int ly> //to be implemented later
+int sad16_altivec(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
+{
+    int sum = 0;
+    return sum;
+}
+
+template<int lx, int ly>//to be implemented later
+int sad_altivec(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
+{
+    int sum = 0;
+    return sum;
+}
+
+template<>
+void inline sum_columns_altivec<16, 4>(vec_s32_t sumv, int* sum)
+{
+    LOAD_ZERO;
+    sumv = vec_sums( sumv, zero_s32v );
+    //print_vec_i("vec_sums", &sumv);
+    sumv = vec_splat( sumv, 3 );
+    //print_vec_i("vec_splat 3", &sumv);
+    vec_ste( sumv, 0, sum );
+}
+
+template<>
+void inline sum_columns_altivec<16, 8>(vec_s32_t sumv, int* sum)
+{
+    LOAD_ZERO;
+    sumv = vec_sums( sumv, zero_s32v );
+    //print_vec_i("vec_sums", &sumv);
+    sumv = vec_splat( sumv, 3 );
+    //print_vec_i("vec_splat 3", &sumv);
+    vec_ste( sumv, 0, sum );
+}
+
+template<>
+void inline sum_columns_altivec<16, 12>(vec_s32_t sumv, int* sum)
+{
+    LOAD_ZERO;
+    sumv = vec_sums( sumv, zero_s32v );
+    //print_vec_i("vec_sums", &sumv);
+    sumv = vec_splat( sumv, 3 );
+    //print_vec_i("vec_splat 3", &sumv);
+    vec_ste( sumv, 0, sum );
+}
+
+template<>
+void inline sum_columns_altivec<16, 16>(vec_s32_t sumv, int* sum)
+{
+    LOAD_ZERO;
+    sumv = vec_sums( sumv, zero_s32v );
+    //print_vec_i("vec_sums", &sumv);
+    sumv = vec_splat( sumv, 3 );
+    //print_vec_i("vec_splat 3", &sumv);
+    vec_ste( sumv, 0, sum );
+}
+
+template<>
+void inline sum_columns_altivec<16, 24>(vec_s32_t sumv, int* sum)
+{
+    LOAD_ZERO;
+    sumv = vec_sums( sumv, zero_s32v );
+    //print_vec_i("vec_sums", &sumv);
+    sumv = vec_splat( sumv, 3 );
+    //print_vec_i("vec_splat 3", &sumv);
+    vec_ste( sumv, 0, sum );
+}
+
+template<>
+void inline sum_columns_altivec<16, 32>(vec_s32_t sumv, int* sum)
+{
+    LOAD_ZERO;
+    sumv = vec_sums( sumv, zero_s32v );
+    //print_vec_i("vec_sums", &sumv);
+    sumv = vec_splat( sumv, 3 );
+    //print_vec_i("vec_splat 3", &sumv);
+    vec_ste( sumv, 0, sum );
+}
+
+template<>
+void inline sum_columns_altivec<16, 48>(vec_s32_t sumv, int* sum)
​

x265_2.2.tar.gz/source/common/ppc/ppccommon.h Added

@@ -0,0 +1,91 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Min Chen <min.chen@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_PPCCOMMON_H
+#define X265_PPCCOMMON_H
+
+
+#if HAVE_ALTIVEC
+#include <altivec.h>
+
+#define vec_u8_t  vector unsigned char
+#define vec_s8_t  vector signed char
+#define vec_u16_t vector unsigned short
+#define vec_s16_t vector signed short
+#define vec_u32_t vector unsigned int
+#define vec_s32_t vector signed int
+
+//copy from x264
+#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
+
+#define zero_u8v  (vec_u8_t)  zerov
+#define zero_s8v  (vec_s8_t)  zerov
+#define zero_u16v (vec_u16_t) zerov
+#define zero_s16v (vec_s16_t) zerov
+#define zero_u32v (vec_u32_t) zerov
+#define zero_s32v (vec_s32_t) zerov
+
+/***********************************************************************
+ * 8 <-> 16 bits conversions
+ **********************************************************************/
+#ifdef WORDS_BIGENDIAN
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
+#else
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
+#endif
+
+#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
+#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
+
+#ifdef WORDS_BIGENDIAN
+#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
+#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
+#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
+#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
+#else
+#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( (vec_u16_t) v, zero_u16v )
+#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( (vec_u16_t) v, zero_u16v )
+#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( (vec_u16_t) v, zero_u16v )
+#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( (vec_u16_t) v, zero_u16v )
+#endif
+
+#define vec_u16_to_u32(v) vec_u16_to_u32_h(v)
+#define vec_u16_to_s32(v) vec_u16_to_s32_h(v)
+
+#define vec_u32_to_u16(v) vec_pack( v, zero_u32v )
+#define vec_s32_to_u16(v) vec_packsu( v, zero_s32v )
+
+#define BITS_PER_SUM (8 * sizeof(sum_t))
+
+#endif /* HAVE_ALTIVEC */
+
+#endif /* X265_PPCCOMMON_H */
+
+
+

 
@@ -0,0 +1,91 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Min Chen <min.chen@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_PPCCOMMON_H
+#define X265_PPCCOMMON_H
+
+
+#if HAVE_ALTIVEC
+#include <altivec.h>
+
+#define vec_u8_t  vector unsigned char
+#define vec_s8_t  vector signed char
+#define vec_u16_t vector unsigned short
+#define vec_s16_t vector signed short
+#define vec_u32_t vector unsigned int
+#define vec_s32_t vector signed int
+
+//copy from x264
+#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
+
+#define zero_u8v  (vec_u8_t)  zerov
+#define zero_s8v  (vec_s8_t)  zerov
+#define zero_u16v (vec_u16_t) zerov
+#define zero_s16v (vec_s16_t) zerov
+#define zero_u32v (vec_u32_t) zerov
+#define zero_s32v (vec_s32_t) zerov
+
+/***********************************************************************
+ * 8 <-> 16 bits conversions
+ **********************************************************************/
+#ifdef WORDS_BIGENDIAN
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
+#else
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
+#endif
+
+#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
+#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
+
+#ifdef WORDS_BIGENDIAN
+#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
+#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
+#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
+#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
+#else
+#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( (vec_u16_t) v, zero_u16v )
+#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( (vec_u16_t) v, zero_u16v )
+#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( (vec_u16_t) v, zero_u16v )
+#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( (vec_u16_t) v, zero_u16v )
+#endif
+
+#define vec_u16_to_u32(v) vec_u16_to_u32_h(v)
+#define vec_u16_to_s32(v) vec_u16_to_s32_h(v)
+
+#define vec_u32_to_u16(v) vec_pack( v, zero_u32v )
+#define vec_s32_to_u16(v) vec_packsu( v, zero_s32v )
+
+#define BITS_PER_SUM (8 * sizeof(sum_t))
+
+#endif /* HAVE_ALTIVEC */
+
+#endif /* X265_PPCCOMMON_H */
+
+
+
​

x265_2.1.tar.gz/source/common/primitives.cpp -> x265_2.2.tar.gz/source/common/primitives.cpp Changed

 
@@ -243,6 +243,15 @@
 #endif
         setupAssemblyPrimitives(primitives, param->cpuid);
 #endif
+#if HAVE_ALTIVEC
+        if (param->cpuid & X265_CPU_ALTIVEC)
+        {
+            setupPixelPrimitives_altivec(primitives);       // pixel_altivec.cpp, overwrite the initialization for altivec optimizated functions
+            setupDCTPrimitives_altivec(primitives);         // dct_altivec.cpp, overwrite the initialization for altivec optimizated functions
+            setupFilterPrimitives_altivec(primitives);      // ipfilter.cpp, overwrite the initialization for altivec optimizated functions
+            setupIntraPrimitives_altivec(primitives);       // intrapred_altivec.cpp, overwrite the initialization for altivec optimizated functions
+        }
+#endif
 
         setupAliasPrimitives(primitives);
     }
​

x265_2.1.tar.gz/source/common/primitives.h -> x265_2.2.tar.gz/source/common/primitives.h Changed

@@ -115,6 +115,7 @@
 typedef sse_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
 typedef sse_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
 typedef sse_t (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
+typedef int(*pixelcmp_ads_t)(int encDC[], uint32_t *sums, int delta, uint16_t *costMvX, int16_t *mvs, int width, int thresh);
 typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
 typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
 typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
@@ -217,6 +218,7 @@
         pixelcmp_t     sad;         // Sum of Absolute Differences
         pixelcmp_x3_t  sad_x3;      // Sum of Absolute Differences, 3 mv offsets at once
         pixelcmp_x4_t  sad_x4;      // Sum of Absolute Differences, 4 mv offsets at once
+        pixelcmp_ads_t ads;         // Absolute Differences sum
         pixelcmp_t     satd;        // Sum of Absolute Transformed Differences (4x4 Hadamard)
 
         filter_pp_t    luma_hpp;    // 8-tap luma motion compensation interpolation filters
@@ -402,6 +404,22 @@
     return part;
 }
 
+/* Computes the size of the LumaPU for a given LumaPU enum */
+inline void sizesFromPartition(int part, int *width, int *height)
+{
+    X265_CHECK(part >= 0 && part <= 24, "Invalid part %d \n", part);
+    extern const uint8_t lumaPartitionMapTable[];
+    int index = 0;
+    for (int i = 0; i < 256;i++)
+        if (part == lumaPartitionMapTable[i])
+        {
+            index = i;
+            break;
+        }
+    *width = 4 * ((index >> 4) + 1);
+    *height = 4 * ((index % 16) + 1);
+}
+
 inline int partitionFromLog2Size(int log2Size)
 {
     X265_CHECK(2 <= log2Size && log2Size <= 6, "Invalid block size\n");
@@ -412,6 +430,12 @@
 void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAliasPrimitives(EncoderPrimitives &p);
+#if HAVE_ALTIVEC
+void setupPixelPrimitives_altivec(EncoderPrimitives &p);
+void setupDCTPrimitives_altivec(EncoderPrimitives &p);
+void setupFilterPrimitives_altivec(EncoderPrimitives &p);
+void setupIntraPrimitives_altivec(EncoderPrimitives &p);
+#endif
 }
 
 #if !EXPORT_C_API

 
@@ -115,6 +115,7 @@
 typedef sse_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
 typedef sse_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
 typedef sse_t (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
+typedef int(*pixelcmp_ads_t)(int encDC[], uint32_t *sums, int delta, uint16_t *costMvX, int16_t *mvs, int width, int thresh);
 typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
 typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
 typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
@@ -217,6 +218,7 @@
         pixelcmp_t     sad;         // Sum of Absolute Differences
         pixelcmp_x3_t  sad_x3;      // Sum of Absolute Differences, 3 mv offsets at once
         pixelcmp_x4_t  sad_x4;      // Sum of Absolute Differences, 4 mv offsets at once
+        pixelcmp_ads_t ads;         // Absolute Differences sum
         pixelcmp_t     satd;        // Sum of Absolute Transformed Differences (4x4 Hadamard)
 
         filter_pp_t    luma_hpp;    // 8-tap luma motion compensation interpolation filters
@@ -402,6 +404,22 @@
     return part;
 }
 
+/* Computes the size of the LumaPU for a given LumaPU enum */
+inline void sizesFromPartition(int part, int *width, int *height)
+{
+    X265_CHECK(part >= 0 && part <= 24, "Invalid part %d \n", part);
+    extern const uint8_t lumaPartitionMapTable[];
+    int index = 0;
+    for (int i = 0; i < 256;i++)
+        if (part == lumaPartitionMapTable[i])
+        {
+            index = i;
+            break;
+        }
+    *width = 4 * ((index >> 4) + 1);
+    *height = 4 * ((index % 16) + 1);
+}
+
 inline int partitionFromLog2Size(int log2Size)
 {
     X265_CHECK(2 <= log2Size && log2Size <= 6, "Invalid block size\n");
@@ -412,6 +430,12 @@
 void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAliasPrimitives(EncoderPrimitives &p);
+#if HAVE_ALTIVEC
+void setupPixelPrimitives_altivec(EncoderPrimitives &p);
+void setupDCTPrimitives_altivec(EncoderPrimitives &p);
+void setupFilterPrimitives_altivec(EncoderPrimitives &p);
+void setupIntraPrimitives_altivec(EncoderPrimitives &p);
+#endif
 }
 
 #if !EXPORT_C_API
​

x265_2.1.tar.gz/source/common/scalinglist.cpp -> x265_2.2.tar.gz/source/common/scalinglist.cpp Changed

@@ -29,64 +29,6 @@
 // file-anonymous namespace
 
 /* Strings for scaling list file parsing */
-const char MatrixType[4][6][20] =
-{
-    {
-        "INTRA4X4_LUMA",
-        "INTRA4X4_CHROMAU",
-        "INTRA4X4_CHROMAV",
-        "INTER4X4_LUMA",
-        "INTER4X4_CHROMAU",
-        "INTER4X4_CHROMAV"
-    },
-    {
-        "INTRA8X8_LUMA",
-        "INTRA8X8_CHROMAU",
-        "INTRA8X8_CHROMAV",
-        "INTER8X8_LUMA",
-        "INTER8X8_CHROMAU",
-        "INTER8X8_CHROMAV"
-    },
-    {
-        "INTRA16X16_LUMA",
-        "INTRA16X16_CHROMAU",
-        "INTRA16X16_CHROMAV",
-        "INTER16X16_LUMA",
-        "INTER16X16_CHROMAU",
-        "INTER16X16_CHROMAV"
-    },
-    {
-        "INTRA32X32_LUMA",
-        "",
-        "",
-        "INTER32X32_LUMA",
-        "",
-        "",
-    },
-};
-const char MatrixType_DC[4][12][22] =
-{
-    {
-    },
-    {
-    },
-    {
-        "INTRA16X16_LUMA_DC",
-        "INTRA16X16_CHROMAU_DC",
-        "INTRA16X16_CHROMAV_DC",
-        "INTER16X16_LUMA_DC",
-        "INTER16X16_CHROMAU_DC",
-        "INTER16X16_CHROMAV_DC"
-    },
-    {
-        "INTRA32X32_LUMA_DC",
-        "",
-        "",
-        "INTER32X32_LUMA_DC",
-        "",
-        "",
-    },
-};
 
 static int quantTSDefault4x4[16] =
 {
@@ -124,6 +66,64 @@
 
 namespace X265_NS {
 // private namespace
+    const char ScalingList::MatrixType[4][6][20] =
+    {
+        {
+            "INTRA4X4_LUMA",
+            "INTRA4X4_CHROMAU",
+            "INTRA4X4_CHROMAV",
+            "INTER4X4_LUMA",
+            "INTER4X4_CHROMAU",
+            "INTER4X4_CHROMAV"
+        },
+        {
+            "INTRA8X8_LUMA",
+            "INTRA8X8_CHROMAU",
+            "INTRA8X8_CHROMAV",
+            "INTER8X8_LUMA",
+            "INTER8X8_CHROMAU",
+            "INTER8X8_CHROMAV"
+        },
+        {
+            "INTRA16X16_LUMA",
+            "INTRA16X16_CHROMAU",
+            "INTRA16X16_CHROMAV",
+            "INTER16X16_LUMA",
+            "INTER16X16_CHROMAU",
+            "INTER16X16_CHROMAV"
+        },
+        {
+            "INTRA32X32_LUMA",
+            "",
+            "",
+            "INTER32X32_LUMA",
+            "",
+            "",
+        },
+    };
+    const char ScalingList::MatrixType_DC[4][12][22] =
+    {
+        {
+        },
+        {
+        },
+        {
+            "INTRA16X16_LUMA_DC",
+            "INTRA16X16_CHROMAU_DC",
+            "INTRA16X16_CHROMAV_DC",
+            "INTER16X16_LUMA_DC",
+            "INTER16X16_CHROMAU_DC",
+            "INTER16X16_CHROMAV_DC"
+        },
+        {
+            "INTRA32X32_LUMA_DC",
+            "",
+            "",
+            "INTER32X32_LUMA_DC",
+            "",
+            "",
+        },
+    };
 
 const int     ScalingList::s_numCoefPerSize[NUM_SIZES] = { 16, 64, 256, 1024 };
 const int32_t ScalingList::s_quantScales[NUM_REM] = { 26214, 23302, 20560, 18396, 16384, 14564 };
@@ -312,6 +312,22 @@
                 m_scalingListDC[sizeIdc][listIdc] = data;
             }
         }
+        if (sizeIdc == 3)
+        {
+            for (int listIdc = 1; listIdc < NUM_LISTS; listIdc++)
+            {
+                if (listIdc % 3 != 0)
+                {
+                    src = m_scalingListCoef[sizeIdc][listIdc];
+                    const int *srcNextSmallerSize = m_scalingListCoef[sizeIdc - 1][listIdc];
+                    for (int i = 0; i < size; i++)
+                    {
+                        src[i] = srcNextSmallerSize[i];
+                    }
+                    m_scalingListDC[sizeIdc][listIdc] = m_scalingListDC[sizeIdc - 1][listIdc];
+                }
+            }
+        }
     }
 
     fclose(fp);

 
@@ -29,64 +29,6 @@
 // file-anonymous namespace
 
 /* Strings for scaling list file parsing */
-const char MatrixType[4][6][20] =
-{
-    {
-        "INTRA4X4_LUMA",
-        "INTRA4X4_CHROMAU",
-        "INTRA4X4_CHROMAV",
-        "INTER4X4_LUMA",
-        "INTER4X4_CHROMAU",
-        "INTER4X4_CHROMAV"
-    },
-    {
-        "INTRA8X8_LUMA",
-        "INTRA8X8_CHROMAU",
-        "INTRA8X8_CHROMAV",
-        "INTER8X8_LUMA",
-        "INTER8X8_CHROMAU",
-        "INTER8X8_CHROMAV"
-    },
-    {
-        "INTRA16X16_LUMA",
-        "INTRA16X16_CHROMAU",
-        "INTRA16X16_CHROMAV",
-        "INTER16X16_LUMA",
-        "INTER16X16_CHROMAU",
-        "INTER16X16_CHROMAV"
-    },
-    {
-        "INTRA32X32_LUMA",
-        "",
-        "",
-        "INTER32X32_LUMA",
-        "",
-        "",
-    },
-};
-const char MatrixType_DC[4][12][22] =
-{
-    {
-    },
-    {
-    },
-    {
-        "INTRA16X16_LUMA_DC",
-        "INTRA16X16_CHROMAU_DC",
-        "INTRA16X16_CHROMAV_DC",
-        "INTER16X16_LUMA_DC",
-        "INTER16X16_CHROMAU_DC",
-        "INTER16X16_CHROMAV_DC"
-    },
-    {
-        "INTRA32X32_LUMA_DC",
-        "",
-        "",
-        "INTER32X32_LUMA_DC",
-        "",
-        "",
-    },
-};
 
 static int quantTSDefault4x4[16] =
 {
@@ -124,6 +66,64 @@
 
 namespace X265_NS {
 // private namespace
+    const char ScalingList::MatrixType[4][6][20] =
+    {
+        {
+            "INTRA4X4_LUMA",
+            "INTRA4X4_CHROMAU",
+            "INTRA4X4_CHROMAV",
+            "INTER4X4_LUMA",
+            "INTER4X4_CHROMAU",
+            "INTER4X4_CHROMAV"
+        },
+        {
+            "INTRA8X8_LUMA",
+            "INTRA8X8_CHROMAU",
+            "INTRA8X8_CHROMAV",
+            "INTER8X8_LUMA",
+            "INTER8X8_CHROMAU",
+            "INTER8X8_CHROMAV"
+        },
+        {
+            "INTRA16X16_LUMA",
+            "INTRA16X16_CHROMAU",
+            "INTRA16X16_CHROMAV",
+            "INTER16X16_LUMA",
+            "INTER16X16_CHROMAU",
+            "INTER16X16_CHROMAV"
+        },
+        {
+            "INTRA32X32_LUMA",
+            "",
+            "",
+            "INTER32X32_LUMA",
+            "",
+            "",
+        },
+    };
+    const char ScalingList::MatrixType_DC[4][12][22] =
+    {
+        {
+        },
+        {
+        },
+        {
+            "INTRA16X16_LUMA_DC",
+            "INTRA16X16_CHROMAU_DC",
+            "INTRA16X16_CHROMAV_DC",
+            "INTER16X16_LUMA_DC",
+            "INTER16X16_CHROMAU_DC",
+            "INTER16X16_CHROMAV_DC"
+        },
+        {
+            "INTRA32X32_LUMA_DC",
+            "",
+            "",
+            "INTER32X32_LUMA_DC",
+            "",
+            "",
+        },
+    };
 
 const int     ScalingList::s_numCoefPerSize[NUM_SIZES] = { 16, 64, 256, 1024 };
 const int32_t ScalingList::s_quantScales[NUM_REM] = { 26214, 23302, 20560, 18396, 16384, 14564 };
@@ -312,6 +312,22 @@
                 m_scalingListDC[sizeIdc][listIdc] = data;
             }
         }
+        if (sizeIdc == 3)
+        {
+            for (int listIdc = 1; listIdc < NUM_LISTS; listIdc++)
+            {
+                if (listIdc % 3 != 0)
+                {
+                    src = m_scalingListCoef[sizeIdc][listIdc];
+                    const int *srcNextSmallerSize = m_scalingListCoef[sizeIdc - 1][listIdc];
+                    for (int i = 0; i < size; i++)
+                    {
+                        src[i] = srcNextSmallerSize[i];
+                    }
+                    m_scalingListDC[sizeIdc][listIdc] = m_scalingListDC[sizeIdc - 1][listIdc];
+                }
+            }
+        }
     }
 
     fclose(fp);
​

x265_2.1.tar.gz/source/common/scalinglist.h -> x265_2.2.tar.gz/source/common/scalinglist.h Changed

 
@@ -42,6 +42,8 @@
     static const int     s_numCoefPerSize[NUM_SIZES];
     static const int32_t s_invQuantScales[NUM_REM];
     static const int32_t s_quantScales[NUM_REM];
+    static const char MatrixType[4][6][20];
+    static const char MatrixType_DC[4][12][22];
 
     int32_t  m_scalingListDC[NUM_SIZES][NUM_LISTS];   // the DC value of the matrix coefficient for 16x16
     int32_t* m_scalingListCoef[NUM_SIZES][NUM_LISTS]; // quantization matrix
​

x265_2.1.tar.gz/source/common/slice.h -> x265_2.2.tar.gz/source/common/slice.h Changed

@@ -239,11 +239,16 @@
     uint32_t maxLatencyIncrease;
     int      numReorderPics;
 
+    RPS      spsrps[MAX_NUM_SHORT_TERM_RPS];
+    int      spsrpsNum;
+    int      numGOPBegin;
+
     bool     bUseSAO; // use param
     bool     bUseAMP; // use param
     bool     bUseStrongIntraSmoothing; // use param
     bool     bTemporalMVPEnabled;
-    bool     bDiscardOptionalVUI;
+    bool     bEmitVUITimingInfo;
+    bool     bEmitVUIHRDInfo;
 
     Window   conformanceWindow;
     VUI      vuiParameters;
@@ -282,6 +287,8 @@
 
     bool     bDeblockingFilterControlPresent;
     bool     bPicDisableDeblockingFilter;
+
+    int      numRefIdxDefault[2];
 };
 
 struct WeightParam
@@ -334,6 +341,7 @@
     int         m_sliceQp;
     int         m_poc;
     int         m_lastIDR;
+    int         m_rpsIdx;
 
     uint32_t    m_colRefIdx;       // never modified
 
@@ -347,6 +355,10 @@
     bool        m_sLFaseFlag;      // loop filter boundary flag
     bool        m_colFromL0Flag;   // collocated picture from List0 or List1 flag
 
+    int         m_iPPSQpMinus26;
+    int         numRefIdxDefault[2];
+    int         m_iNumRPSInSPS;
+
     Slice()
     {
         m_lastIDR = 0;
@@ -356,6 +368,10 @@
         memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
         memset(m_refPOCList, 0, sizeof(m_refPOCList));
         disableWeights();
+        m_iPPSQpMinus26 = 0;
+        numRefIdxDefault[0] = 1;
+        numRefIdxDefault[1] = 1;
+        m_rpsIdx = -1;
     }
 
     void disableWeights();

 
@@ -239,11 +239,16 @@
     uint32_t maxLatencyIncrease;
     int      numReorderPics;
 
+    RPS      spsrps[MAX_NUM_SHORT_TERM_RPS];
+    int      spsrpsNum;
+    int      numGOPBegin;
+
     bool     bUseSAO; // use param
     bool     bUseAMP; // use param
     bool     bUseStrongIntraSmoothing; // use param
     bool     bTemporalMVPEnabled;
-    bool     bDiscardOptionalVUI;
+    bool     bEmitVUITimingInfo;
+    bool     bEmitVUIHRDInfo;
 
     Window   conformanceWindow;
     VUI      vuiParameters;
@@ -282,6 +287,8 @@
 
     bool     bDeblockingFilterControlPresent;
     bool     bPicDisableDeblockingFilter;
+
+    int      numRefIdxDefault[2];
 };
 
 struct WeightParam
@@ -334,6 +341,7 @@
     int         m_sliceQp;
     int         m_poc;
     int         m_lastIDR;
+    int         m_rpsIdx;
 
     uint32_t    m_colRefIdx;       // never modified
 
@@ -347,6 +355,10 @@
     bool        m_sLFaseFlag;      // loop filter boundary flag
     bool        m_colFromL0Flag;   // collocated picture from List0 or List1 flag
 
+    int         m_iPPSQpMinus26;
+    int         numRefIdxDefault[2];
+    int         m_iNumRPSInSPS;
+
     Slice()
     {
         m_lastIDR = 0;
@@ -356,6 +368,10 @@
         memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
         memset(m_refPOCList, 0, sizeof(m_refPOCList));
         disableWeights();
+        m_iPPSQpMinus26 = 0;
+        numRefIdxDefault[0] = 1;
+        numRefIdxDefault[1] = 1;
+        m_rpsIdx = -1;
     }
 
     void disableWeights();
​

x265_2.1.tar.gz/source/common/version.cpp -> x265_2.2.tar.gz/source/common/version.cpp Changed

 
@@ -77,7 +77,7 @@
 #define BITS    "[32 bit]"
 #endif
 
-#if defined(ENABLE_ASSEMBLY)
+#if defined(ENABLE_ASSEMBLY) || HAVE_ALTIVEC
 #define ASM     ""
 #else
 #define ASM     "[noasm]"
​

x265_2.1.tar.gz/source/common/yuv.cpp -> x265_2.2.tar.gz/source/common/yuv.cpp Changed

 
@@ -47,6 +47,11 @@
     m_size  = size;
     m_part = partitionFromSizes(size, size);
 
+    for (int i = 0; i < 2; i++)
+        for (int j = 0; j < MAX_NUM_REF; j++)
+            for (int k = 0; k < INTEGRAL_PLANE_NUM; k++)
+                m_integral[i][j][k] = NULL;
+
     if (csp == X265_CSP_I400)
     {
         CHECKED_MALLOC(m_buf[0], pixel, size * size + 8);
​

x265_2.1.tar.gz/source/common/yuv.h -> x265_2.2.tar.gz/source/common/yuv.h Changed

 
@@ -48,6 +48,7 @@
     int      m_csp;
     int      m_hChromaShift;
     int      m_vChromaShift;
+    uint32_t *m_integral[2][MAX_NUM_REF][INTEGRAL_PLANE_NUM];
 
     Yuv();
 
​

x265_2.1.tar.gz/source/encoder/analysis.cpp -> x265_2.2.tar.gz/source/encoder/analysis.cpp Changed

@@ -203,6 +203,57 @@
     return *m_modeDepth[0].bestMode;
 }
 
+/* Predict the TU depth limit of this CU from the mean m_refTuDepth (same
+ * geomRecurId) of the co-located CTU in the first L0 reference (plus L1 for
+ * B slices) and of the available above, above-left, above-right and left
+ * CTUs. Returns 0..3, or -1 when the mean is above 2.5. */
+int32_t Analysis::loadTUDepth(CUGeom cuGeom, CUData parentCTU)
+{
+    float predDepth = 0;
+    uint8_t count = 0;
+    const int numLists = m_slice->isInterB() ? 2 : 1;
+    /* m_refTuDepth is stored per CTU (indexed by m_cuAddr), so read the
+     * co-located CTU rather than always the first CTU of the reference picture */
+    for (int list = 0; list < numLists; list++)
+    {
+        CUData* colCTU = &m_slice->m_refFrameList[list][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
+        predDepth += colCTU->m_refTuDepth[cuGeom.geomRecurId];
+        count++;
+    }
+    /* the diagonal neighbours are consulted only when the CTU above exists */
+    if (parentCTU.m_cuAbove)
+    {
+        predDepth += parentCTU.m_cuAbove->m_refTuDepth[cuGeom.geomRecurId];
+        count++;
+        if (parentCTU.m_cuAboveLeft)
+        {
+            predDepth += parentCTU.m_cuAboveLeft->m_refTuDepth[cuGeom.geomRecurId];
+            count++;
+        }
+        if (parentCTU.m_cuAboveRight)
+        {
+            predDepth += parentCTU.m_cuAboveRight->m_refTuDepth[cuGeom.geomRecurId];
+            count++;
+        }
+    }
+    if (parentCTU.m_cuLeft)
+    {
+        predDepth += parentCTU.m_cuLeft->m_refTuDepth[cuGeom.geomRecurId];
+        count++;
+    }
+    predDepth /= count; /* count >= 1: the L0 reference always contributes */
+    /* map the mean onto a limit; lower bounds are implied by the tests already failed */
+    if (predDepth == 0)
+        return 0;
+    if (predDepth < 1)
+        return 1;
+    if (predDepth <= 1.5)
+        return 2;
+    if (predDepth <= 2.5)
+        return 3;
+    return -1;
+}
+
 void Analysis::tryLossless(const CUGeom& cuGeom)
 {
     ModeDepth& md = m_modeDepth[cuGeom.depth];
@@ -394,6 +445,16 @@
         cacheCost[cuIdx] = md.bestMode->rdCost;
     }
 
+    /* Save Intra CUs TU depth only when analysis mode is OFF */
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4 && !m_param->analysisMode)
+    {
+        CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
+        int8_t maxTUDepth = -1;
+        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
+            maxTUDepth = X265_MAX(maxTUDepth, md.pred[PRED_INTRA].cu.m_tuDepth[i]);
+        ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
+    }
+
     /* Copy best data to encData CTU and recon */
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT])
@@ -883,6 +944,16 @@
     ModeDepth& md = m_modeDepth[depth];
     md.bestMode = NULL;
 
+    if (m_param->searchMethod == X265_SEA)
+    {
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
+        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
+        for (int list = 0; list < numPredDir; list++)
+            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
+                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
+                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
+    }
+
     PicYuv& reconPic = *m_frame->m_reconPic;
 
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
@@ -894,6 +965,9 @@
     bool skipRectAmp = false;
     bool chooseMerge = false;
 
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
+        m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
+
     SplitData splitData[4];
     splitData[0].initSplitCUData();
     splitData[1].initSplitCUData();
@@ -1400,6 +1474,18 @@
     if (m_param->rdLevel)
         md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
 
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
+    {
+        if (mightNotSplit)
+        {
+            CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
+            int8_t maxTUDepth = -1;
+            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
+                maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
+            ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
+        }
+    }
+
     return splitCUData;
 }
 
@@ -1409,6 +1495,16 @@
     ModeDepth& md = m_modeDepth[depth];
     md.bestMode = NULL;
 
+    if (m_param->searchMethod == X265_SEA)
+    {
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
+        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
+        for (int list = 0; list < numPredDir; list++)
+            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
+                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
+                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
+    }
+
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
     bool skipRecursion = false;
@@ -1424,6 +1520,9 @@
         md.pred[PRED_2Nx2N].rdCost = 0;
     }
 
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
+        m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
+
     SplitData splitData[4];
     splitData[0].initSplitCUData();
     splitData[1].initSplitCUData();
@@ -1751,6 +1850,18 @@
             addSplitFlagCost(*md.bestMode, cuGeom.depth);
     }
 
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
+    {
+        if (mightNotSplit)
+        {
+            CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
+            int8_t maxTUDepth = -1;
+            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
+                maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
+            ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
+        }
+    }
+
     /* compare split RD cost against best cost */
     if (mightSplit && !skipRecursion)
         checkBestMode(md.pred[PRED_SPLIT], depth);
@@ -1942,12 +2053,12 @@
             if (m_param->maxSlices > 1)
             {
                 // NOTE: First row in slice can't negative
-                if ((candMvField[i][0].mv.y < m_sliceMinY) | (candMvField[i][1].mv.y < m_sliceMinY))
+                if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
                     continue;
 
                 // Last row in slice can't reference beyond bound since it is another slice area
                 // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
-                if ((candMvField[i][0].mv.y > m_sliceMaxY) | (candMvField[i][1].mv.y > m_sliceMaxY))
+                if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
                     continue;
             }
 
@@ -2072,12 +2183,12 @@
             if (m_param->maxSlices > 1)
             {
                 // NOTE: First row in slice can't negative
-                if ((candMvField[i][0].mv.y < m_sliceMinY) | (candMvField[i][1].mv.y < m_sliceMinY))
+                if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
                     continue;
 
                 // Last row in slice can't reference beyond bound since it is another slice area
                 // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
-                if ((candMvField[i][0].mv.y > m_sliceMaxY) | (candMvField[i][1].mv.y > m_sliceMaxY))
+                if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
                     continue;
             }

 
@@ -203,6 +203,57 @@
     return *m_modeDepth[0].bestMode;
 }
 
+/* Predict the TU depth limit of this CU from the mean m_refTuDepth (same
+ * geomRecurId) of the co-located CTU in the first L0 reference (plus L1 for
+ * B slices) and of the available above, above-left, above-right and left
+ * CTUs. Returns 0..3, or -1 when the mean is above 2.5. */
+int32_t Analysis::loadTUDepth(CUGeom cuGeom, CUData parentCTU)
+{
+    float predDepth = 0;
+    uint8_t count = 0;
+    const int numLists = m_slice->isInterB() ? 2 : 1;
+    /* m_refTuDepth is stored per CTU (indexed by m_cuAddr), so read the
+     * co-located CTU rather than always the first CTU of the reference picture */
+    for (int list = 0; list < numLists; list++)
+    {
+        CUData* colCTU = &m_slice->m_refFrameList[list][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
+        predDepth += colCTU->m_refTuDepth[cuGeom.geomRecurId];
+        count++;
+    }
+    /* the diagonal neighbours are consulted only when the CTU above exists */
+    if (parentCTU.m_cuAbove)
+    {
+        predDepth += parentCTU.m_cuAbove->m_refTuDepth[cuGeom.geomRecurId];
+        count++;
+        if (parentCTU.m_cuAboveLeft)
+        {
+            predDepth += parentCTU.m_cuAboveLeft->m_refTuDepth[cuGeom.geomRecurId];
+            count++;
+        }
+        if (parentCTU.m_cuAboveRight)
+        {
+            predDepth += parentCTU.m_cuAboveRight->m_refTuDepth[cuGeom.geomRecurId];
+            count++;
+        }
+    }
+    if (parentCTU.m_cuLeft)
+    {
+        predDepth += parentCTU.m_cuLeft->m_refTuDepth[cuGeom.geomRecurId];
+        count++;
+    }
+    predDepth /= count; /* count >= 1: the L0 reference always contributes */
+    /* map the mean onto a limit; lower bounds are implied by the tests already failed */
+    if (predDepth == 0)
+        return 0;
+    if (predDepth < 1)
+        return 1;
+    if (predDepth <= 1.5)
+        return 2;
+    if (predDepth <= 2.5)
+        return 3;
+    return -1;
+}
+
 void Analysis::tryLossless(const CUGeom& cuGeom)
 {
     ModeDepth& md = m_modeDepth[cuGeom.depth];
@@ -394,6 +445,16 @@
         cacheCost[cuIdx] = md.bestMode->rdCost;
     }
 
+    /* Save Intra CUs TU depth only when analysis mode is OFF */
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4 && !m_param->analysisMode)
+    {
+        CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
+        int8_t maxTUDepth = -1;
+        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
+            maxTUDepth = X265_MAX(maxTUDepth, md.pred[PRED_INTRA].cu.m_tuDepth[i]);
+        ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
+    }
+
     /* Copy best data to encData CTU and recon */
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT])
@@ -883,6 +944,16 @@
     ModeDepth& md = m_modeDepth[depth];
     md.bestMode = NULL;
 
+    if (m_param->searchMethod == X265_SEA)
+    {
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
+        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
+        for (int list = 0; list < numPredDir; list++)
+            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
+                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
+                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
+    }
+
     PicYuv& reconPic = *m_frame->m_reconPic;
 
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
@@ -894,6 +965,9 @@
     bool skipRectAmp = false;
     bool chooseMerge = false;
 
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
+        m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
+
     SplitData splitData[4];
     splitData[0].initSplitCUData();
     splitData[1].initSplitCUData();
@@ -1400,6 +1474,18 @@
     if (m_param->rdLevel)
         md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
 
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
+    {
+        if (mightNotSplit)
+        {
+            CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
+            int8_t maxTUDepth = -1;
+            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
+                maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
+            ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
+        }
+    }
+
     return splitCUData;
 }
 
@@ -1409,6 +1495,16 @@
     ModeDepth& md = m_modeDepth[depth];
     md.bestMode = NULL;
 
+    if (m_param->searchMethod == X265_SEA)
+    {
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
+        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
+        for (int list = 0; list < numPredDir; list++)
+            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
+                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
+                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
+    }
+
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
     bool skipRecursion = false;
@@ -1424,6 +1520,9 @@
         md.pred[PRED_2Nx2N].rdCost = 0;
     }
 
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
+        m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
+
     SplitData splitData[4];
     splitData[0].initSplitCUData();
     splitData[1].initSplitCUData();
@@ -1751,6 +1850,18 @@
             addSplitFlagCost(*md.bestMode, cuGeom.depth);
     }
 
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
+    {
+        if (mightNotSplit)
+        {
+            CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
+            int8_t maxTUDepth = -1;
+            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
+                maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
+            ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
+        }
+    }
+
     /* compare split RD cost against best cost */
     if (mightSplit && !skipRecursion)
         checkBestMode(md.pred[PRED_SPLIT], depth);
@@ -1942,12 +2053,12 @@
             if (m_param->maxSlices > 1)
             {
                 // NOTE: First row in slice can't negative
-                if ((candMvField[i][0].mv.y < m_sliceMinY) | (candMvField[i][1].mv.y < m_sliceMinY))
+                if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
                     continue;
 
                 // Last row in slice can't reference beyond bound since it is another slice area
                 // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
-                if ((candMvField[i][0].mv.y > m_sliceMaxY) | (candMvField[i][1].mv.y > m_sliceMaxY))
+                if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
                     continue;
             }
 
@@ -2072,12 +2183,12 @@
             if (m_param->maxSlices > 1)
             {
                 // NOTE: First row in slice can't negative
-                if ((candMvField[i][0].mv.y < m_sliceMinY) | (candMvField[i][1].mv.y < m_sliceMinY))
+                if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
                     continue;
 
                 // Last row in slice can't reference beyond bound since it is another slice area
                 // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
-                if ((candMvField[i][0].mv.y > m_sliceMaxY) | (candMvField[i][1].mv.y > m_sliceMaxY))
+                if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
                     continue;
             }
 
​

x265_2.1.tar.gz/source/encoder/analysis.h -> x265_2.2.tar.gz/source/encoder/analysis.h Changed

 
@@ -116,6 +116,7 @@
     void destroy();
 
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
+    int32_t loadTUDepth(CUGeom cuGeom, CUData parentCTU);
 
 protected:
     /* Analysis data for save/load mode, writes/reads data based on absPartIdx */
​

x265_2.1.tar.gz/source/encoder/api.cpp -> x265_2.2.tar.gz/source/encoder/api.cpp Changed

 
@@ -141,6 +141,11 @@
         Encoder *encoder = static_cast<Encoder*>(enc);
         Entropy sbacCoder;
         Bitstream bs;
+        if (encoder->m_param->rc.bStatRead && encoder->m_param->bMultiPassOptRPS)
+        {
+            if (!encoder->computeSPSRPSIndex())
+                return -1;
+        }
         encoder->getStreamHeaders(encoder->m_nalList, sbacCoder, bs);
         *pp_nal = &encoder->m_nalList.m_nal[0];
         if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal;
​

x265_2.1.tar.gz/source/encoder/bitcost.cpp -> x265_2.2.tar.gz/source/encoder/bitcost.cpp Changed

@@ -54,16 +54,40 @@
                 s_costs[qp][i] = s_costs[qp][-i] = (uint16_t)X265_MIN(s_bitsizes[i] * lambda + 0.5f, (1 << 15) - 1);
         }
     }
-
+    for (int j = 0; j < 4; j++)
+    {
+        if (!s_fpelMvCosts[qp][j])
+        {
+            ScopedLock s(s_costCalcLock);
+            if (!s_fpelMvCosts[qp][j])
+            {
+                uint16_t* costs = X265_MALLOC(uint16_t, BC_MAX_MV + 1);
+                if (!costs) /* test the raw allocation: once offset, the pointer is never NULL */
+                {
+                    x265_log(NULL, X265_LOG_ERROR, "BitCost s_fpelMvCosts buffer allocation failure\n");
+                    return;
+                }
+                costs += BC_MAX_MV >> 1; /* centre the table so negative indices are valid */
+                for (int i = -(BC_MAX_MV >> 1); i < (BC_MAX_MV >> 1); i++)
+                    costs[i] = s_costs[qp][i * 4 + j];
+                s_fpelMvCosts[qp][j] = costs; /* publish only after the table is filled */
+            }
+        }
+    }
     m_cost = s_costs[qp];
+    for (int j = 0; j < 4; j++)
+    {
+        m_fpelMvCosts[j] = s_fpelMvCosts[qp][j];
+    }
 }
-
 /***
  * Class static data and methods
  */
 
 uint16_t *BitCost::s_costs[BC_MAX_QP];
 
+uint16_t* BitCost::s_fpelMvCosts[BC_MAX_QP][4];
+
 float *BitCost::s_bitsizes;
 
 Lock BitCost::s_costCalcLock;
@@ -96,6 +120,17 @@
             s_costs[i] = NULL;
         }
     }
+    for (int i = 0; i < BC_MAX_QP; i++)
+    {
+        for (int j = 0; j < 4; j++)
+        {
+            if (s_fpelMvCosts[i][j])
+            {
+                X265_FREE(s_fpelMvCosts[i][j] - (BC_MAX_MV >> 1));
+                s_fpelMvCosts[i][j] = NULL;
+            }
+        }
+    }
 
     if (s_bitsizes)
     {

 
@@ -54,16 +54,40 @@
                 s_costs[qp][i] = s_costs[qp][-i] = (uint16_t)X265_MIN(s_bitsizes[i] * lambda + 0.5f, (1 << 15) - 1);
         }
     }
-
+    for (int j = 0; j < 4; j++)
+    {
+        if (!s_fpelMvCosts[qp][j])
+        {
+            ScopedLock s(s_costCalcLock);
+            if (!s_fpelMvCosts[qp][j])
+            {
+                uint16_t* costs = X265_MALLOC(uint16_t, BC_MAX_MV + 1);
+                if (!costs) /* test the raw allocation: once offset, the pointer is never NULL */
+                {
+                    x265_log(NULL, X265_LOG_ERROR, "BitCost s_fpelMvCosts buffer allocation failure\n");
+                    return;
+                }
+                costs += BC_MAX_MV >> 1; /* centre the table so negative indices are valid */
+                for (int i = -(BC_MAX_MV >> 1); i < (BC_MAX_MV >> 1); i++)
+                    costs[i] = s_costs[qp][i * 4 + j];
+                s_fpelMvCosts[qp][j] = costs; /* publish only after the table is filled */
+            }
+        }
+    }
     m_cost = s_costs[qp];
+    for (int j = 0; j < 4; j++)
+    {
+        m_fpelMvCosts[j] = s_fpelMvCosts[qp][j];
+    }
 }
-
 /***
  * Class static data and methods
  */
 
 uint16_t *BitCost::s_costs[BC_MAX_QP];
 
+uint16_t* BitCost::s_fpelMvCosts[BC_MAX_QP][4];
+
 float *BitCost::s_bitsizes;
 
 Lock BitCost::s_costCalcLock;
@@ -96,6 +120,17 @@
             s_costs[i] = NULL;
         }
     }
+    for (int i = 0; i < BC_MAX_QP; i++)
+    {
+        for (int j = 0; j < 4; j++)
+        {
+            if (s_fpelMvCosts[i][j])
+            {
+                X265_FREE(s_fpelMvCosts[i][j] - (BC_MAX_MV >> 1));
+                s_fpelMvCosts[i][j] = NULL;
+            }
+        }
+    }
 
     if (s_bitsizes)
     {
​

x265_2.1.tar.gz/source/encoder/bitcost.h -> x265_2.2.tar.gz/source/encoder/bitcost.h Changed

 
@@ -67,6 +67,8 @@
 
     uint16_t *m_cost;
 
+    uint16_t *m_fpelMvCosts[4];
+
     MV        m_mvp;
 
     BitCost& operator =(const BitCost&);
@@ -84,6 +86,8 @@
 
     static uint16_t *s_costs[BC_MAX_QP];
 
+    static uint16_t *s_fpelMvCosts[BC_MAX_QP][4];
+
     static Lock s_costCalcLock;
 
     static void CalculateLogs();
​

x265_2.1.tar.gz/source/encoder/dpb.cpp -> x265_2.2.tar.gz/source/encoder/dpb.cpp Changed

 
@@ -92,6 +92,19 @@
             m_freeList.pushBack(*curFrame);
             curFrame->m_encData->m_freeListNext = m_frameDataFreeList;
             m_frameDataFreeList = curFrame->m_encData;
+
+            if (curFrame->m_encData->m_meBuffer)
+            {
+                for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
+                {
+                    if (curFrame->m_encData->m_meBuffer[i] != NULL)
+                    {
+                        X265_FREE(curFrame->m_encData->m_meBuffer[i]);
+                        curFrame->m_encData->m_meBuffer[i] = NULL;
+                    }
+                }
+            }
+
             curFrame->m_encData = NULL;
             curFrame->m_reconPic = NULL;
         }
​

x265_2.1.tar.gz/source/encoder/encoder.cpp -> x265_2.2.tar.gz/source/encoder/encoder.cpp Changed

@@ -74,6 +74,10 @@
     m_threadPool = NULL;
     m_analysisFile = NULL;
     m_offsetEmergency = NULL;
+    m_iFrameNum = 0;
+    m_iPPSQpMinus26 = 0;
+    m_iLastSliceQp = 0;
+    m_rpsInSpsCount = 0;
     for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
         m_frameEncoder[i] = NULL;
 
@@ -145,12 +149,6 @@
         p->bEnableWavefront = p->bDistributeModeAnalysis = p->bDistributeMotionEstimation = p->lookaheadSlices = 0;
     }
 
-    if (!p->bEnableWavefront && p->rc.vbvBufferSize)
-    {
-        x265_log(p, X265_LOG_ERROR, "VBV requires wavefront parallelism\n");
-        m_aborted = true;
-    }
-
     x265_log(p, X265_LOG_INFO, "Slices                              : %d\n", p->maxSlices);
 
     char buf[128];
@@ -318,6 +316,8 @@
     if (!m_lookahead->create())
         m_aborted = true;
 
+    initRefIdx();
+
     if (m_param->analysisMode)
     {
         const char* name = m_param->analysisFileName;
@@ -869,6 +869,58 @@
                 slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_4x4_PARTITIONS);
             }
 
+            if (m_param->searchMethod == X265_SEA && frameEnc->m_lowres.sliceType != X265_TYPE_B)
+            {
+                int padX = g_maxCUSize + 32;
+                int padY = g_maxCUSize + 16;
+                uint32_t numCuInHeight = (frameEnc->m_encData->m_reconPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize;
+                int maxHeight = numCuInHeight * g_maxCUSize;
+                for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
+                {
+                    frameEnc->m_encData->m_meBuffer[i] = X265_MALLOC(uint32_t, frameEnc->m_reconPic->m_stride * (maxHeight + (2 * padY)));
+                    if (frameEnc->m_encData->m_meBuffer[i])
+                    {
+                        memset(frameEnc->m_encData->m_meBuffer[i], 0, sizeof(uint32_t)* frameEnc->m_reconPic->m_stride * (maxHeight + (2 * padY)));
+                        frameEnc->m_encData->m_meIntegral[i] = frameEnc->m_encData->m_meBuffer[i] + frameEnc->m_encData->m_reconPic->m_stride * padY + padX;
+                    }
+                    else
+                        x265_log(m_param, X265_LOG_ERROR, "SEA motion search: POC %d Integral buffer[%d] unallocated\n", frameEnc->m_poc, i);
+                }
+            }
+
+            if (m_param->bOptQpPPS && frameEnc->m_lowres.bKeyframe && m_param->bRepeatHeaders)
+            {
+                ScopedLock qpLock(m_sliceQpLock);
+                if (m_iFrameNum > 0)
+                {
+                    //Search the least cost
+                    int64_t iLeastCost = m_iBitsCostSum[0];
+                    int iLeastId = 0;
+                    for (int i = 1; i < QP_MAX_MAX + 1; i++)
+                    {
+                        if (iLeastCost > m_iBitsCostSum[i])
+                        {
+                            iLeastId = i;
+                            iLeastCost = m_iBitsCostSum[i];
+                        }
+                    }
+
+                    /* If last slice Qp is close to (26 + m_iPPSQpMinus26) or outputs is all I-frame video,
+                       we don't need to change m_iPPSQpMinus26. */
+                    if ((abs(m_iLastSliceQp - (26 + m_iPPSQpMinus26)) > 1) && (m_iFrameNum > 1))
+                        m_iPPSQpMinus26 = (iLeastId + 1) - 26;
+                    m_iFrameNum = 0;
+                }
+
+                for (int i = 0; i < QP_MAX_MAX + 1; i++)
+                    m_iBitsCostSum[i] = 0;
+            }
+
+            frameEnc->m_encData->m_slice->m_iPPSQpMinus26 = m_iPPSQpMinus26;
+            frameEnc->m_encData->m_slice->numRefIdxDefault[0] = m_pps.numRefIdxDefault[0];
+            frameEnc->m_encData->m_slice->numRefIdxDefault[1] = m_pps.numRefIdxDefault[1];
+            frameEnc->m_encData->m_slice->m_iNumRPSInSPS = m_sps.spsrpsNum;
+
             curEncoder->m_rce.encodeOrder = frameEnc->m_encodeOrder = m_encodedFrameNum++;
             if (m_bframeDelay)
             {
@@ -1031,6 +1083,13 @@
 
         x265_log(m_param, X265_LOG_INFO, "lossless compression ratio %.2f::1\n", uncompressed / m_analyzeAll.m_accBits);
     }
+    if (m_param->bMultiPassOptRPS && m_param->rc.bStatRead)
+    {
+        x265_log(m_param, X265_LOG_INFO, "RPS in SPS: %d frames (%.2f%%), RPS not in SPS: %d frames (%.2f%%)\n", 
+            m_rpsInSpsCount, (float)100.0 * m_rpsInSpsCount / m_rateControl->m_numEntries, 
+            m_rateControl->m_numEntries - m_rpsInSpsCount, 
+            (float)100.0 * (m_rateControl->m_numEntries - m_rpsInSpsCount) / m_rateControl->m_numEntries);
+    }
 
     if (m_analyzeAll.m_numPics)
     {
@@ -1353,6 +1412,7 @@
         frameStats->qp = curEncData.m_avgQpAq;
         frameStats->bits = bits;
         frameStats->bScenecut = curFrame->m_lowres.bScenecut;
+        frameStats->bufferFill = m_rateControl->m_bufferFillActual;
         frameStats->frameLatency = inPoc - poc;
         if (m_param->rc.rateControlMode == X265_RC_CRF)
             frameStats->rateFactor = curEncData.m_rateFactor;
@@ -1413,6 +1473,66 @@
 #pragma warning(disable: 4127) // conditional expression is constant
 #endif
 
+/* Zero every counter in m_refIdxLastGOP.numRefIdxl0 and numRefIdxl1. */
+void Encoder::initRefIdx()
+{
+    int idx = MAX_NUM_REF_IDX;
+
+    /* visiting order is irrelevant when clearing, so count down to zero */
+    while (idx-- > 0)
+    {
+        m_refIdxLastGOP.numRefIdxl0[idx] = 0;
+        m_refIdxLastGOP.numRefIdxl1[idx] = 0;
+    }
+}
+
+/* Count one observation of numRefIdx[0] (list 0) and numRefIdx[1] (list 1)
+ * in the m_refIdxLastGOP tallies.
+ * A value is counted only when it lies strictly between 0 and
+ * MAX_NUM_REF_IDX; anything else is ignored. */
+void Encoder::analyseRefIdx(int *numRefIdx)
+{
+    const int activeL0 = numRefIdx[0];
+    const int activeL1 = numRefIdx[1];
+
+    if (activeL0 > 0 && activeL0 < MAX_NUM_REF_IDX)
+        m_refIdxLastGOP.numRefIdxl0[activeL0]++;
+
+    if (activeL1 > 0 && activeL1 < MAX_NUM_REF_IDX)
+        m_refIdxLastGOP.numRefIdxl1[activeL1]++;
+}
+
+/* For each list pick the value with the highest tally in m_refIdxLastGOP
+ * (the lowest such value on a tie, 1 when every tally is zero), store it as
+ * numRefIdxDefault in both m_refIdxLastGOP and m_pps, then clear the tallies. */
+void Encoder::updateRefIdx()
+{
+    int bestL0 = 1, bestL1 = 1; /* defaults when nothing was tallied */
+    int hitsL0 = 0, hitsL1 = 0;
+
+    for (int n = 0; n < MAX_NUM_REF_IDX; n++)
+    {
+        if (m_refIdxLastGOP.numRefIdxl0[n] > hitsL0)
+        {
+            hitsL0 = m_refIdxLastGOP.numRefIdxl0[n];
+            bestL0 = n;
+        }
+        if (m_refIdxLastGOP.numRefIdxl1[n] > hitsL1)
+        {
+            hitsL1 = m_refIdxLastGOP.numRefIdxl1[n];
+            bestL1 = n;
+        }
+    }
+
+    m_refIdxLastGOP.numRefIdxDefault[0] = bestL0;
+    m_refIdxLastGOP.numRefIdxDefault[1] = bestL1;
+    m_pps.numRefIdxDefault[0] = m_refIdxLastGOP.numRefIdxDefault[0];
+    m_pps.numRefIdxDefault[1] = m_refIdxLastGOP.numRefIdxDefault[1];
+
+    /* start the next round of analyseRefIdx() calls from empty tallies */
+    initRefIdx();
+}
+
 void Encoder::getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs)
 {
     sbacCoder.setBitstream(&bs);
@@ -1429,7 +1549,7 @@
     list.serialize(NAL_UNIT_SPS, bs);
 
     bs.resetBits();
-    sbacCoder.codePPS(m_pps, (m_param->maxSlices <= 1));
+    sbacCoder.codePPS( m_pps, (m_param->maxSlices <= 1), m_iPPSQpMinus26);
     bs.writeByteAlignment();
     list.serialize(NAL_UNIT_PPS, bs);
 
@@ -1458,9 +1578,9 @@
         list.serialize(NAL_UNIT_PREFIX_SEI, bs);
     }
 
-    if (!m_param->bDiscardSEI && m_param->bEmitInfoSEI)
+    if (m_param->bEmitInfoSEI)
     {
-        char *opts = x265_param2string(m_param);
+        char *opts = x265_param2string(m_param, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset);
         if (opts)

 
@@ -74,6 +74,10 @@
     m_threadPool = NULL;
     m_analysisFile = NULL;
     m_offsetEmergency = NULL;
+    m_iFrameNum = 0;
+    m_iPPSQpMinus26 = 0;
+    m_iLastSliceQp = 0;
+    m_rpsInSpsCount = 0;
     for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
         m_frameEncoder[i] = NULL;
 
@@ -145,12 +149,6 @@
         p->bEnableWavefront = p->bDistributeModeAnalysis = p->bDistributeMotionEstimation = p->lookaheadSlices = 0;
     }
 
-    if (!p->bEnableWavefront && p->rc.vbvBufferSize)
-    {
-        x265_log(p, X265_LOG_ERROR, "VBV requires wavefront parallelism\n");
-        m_aborted = true;
-    }
-
     x265_log(p, X265_LOG_INFO, "Slices                              : %d\n", p->maxSlices);
 
     char buf[128];
@@ -318,6 +316,8 @@
     if (!m_lookahead->create())
         m_aborted = true;
 
+    initRefIdx();
+
     if (m_param->analysisMode)
     {
         const char* name = m_param->analysisFileName;
@@ -869,6 +869,58 @@
                 slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_4x4_PARTITIONS);
             }
 
+            if (m_param->searchMethod == X265_SEA && frameEnc->m_lowres.sliceType != X265_TYPE_B)
+            {
+                int padX = g_maxCUSize + 32;
+                int padY = g_maxCUSize + 16;
+                uint32_t numCuInHeight = (frameEnc->m_encData->m_reconPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize;
+                int maxHeight = numCuInHeight * g_maxCUSize;
+                for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
+                {
+                    frameEnc->m_encData->m_meBuffer[i] = X265_MALLOC(uint32_t, frameEnc->m_reconPic->m_stride * (maxHeight + (2 * padY)));
+                    if (frameEnc->m_encData->m_meBuffer[i])
+                    {
+                        memset(frameEnc->m_encData->m_meBuffer[i], 0, sizeof(uint32_t)* frameEnc->m_reconPic->m_stride * (maxHeight + (2 * padY)));
+                        frameEnc->m_encData->m_meIntegral[i] = frameEnc->m_encData->m_meBuffer[i] + frameEnc->m_encData->m_reconPic->m_stride * padY + padX;
+                    }
+                    else
+                        x265_log(m_param, X265_LOG_ERROR, "SEA motion search: POC %d Integral buffer[%d] unallocated\n", frameEnc->m_poc, i);
+                }
+            }
+
+            if (m_param->bOptQpPPS && frameEnc->m_lowres.bKeyframe && m_param->bRepeatHeaders)
+            {
+                ScopedLock qpLock(m_sliceQpLock);
+                if (m_iFrameNum > 0)
+                {
+                    //Search the least cost
+                    int64_t iLeastCost = m_iBitsCostSum[0];
+                    int iLeastId = 0;
+                    for (int i = 1; i < QP_MAX_MAX + 1; i++)
+                    {
+                        if (iLeastCost > m_iBitsCostSum[i])
+                        {
+                            iLeastId = i;
+                            iLeastCost = m_iBitsCostSum[i];
+                        }
+                    }
+
+                    /* If last slice Qp is close to (26 + m_iPPSQpMinus26) or outputs is all I-frame video,
+                       we don't need to change m_iPPSQpMinus26. */
+                    if ((abs(m_iLastSliceQp - (26 + m_iPPSQpMinus26)) > 1) && (m_iFrameNum > 1))
+                        m_iPPSQpMinus26 = (iLeastId + 1) - 26;
+                    m_iFrameNum = 0;
+                }
+
+                for (int i = 0; i < QP_MAX_MAX + 1; i++)
+                    m_iBitsCostSum[i] = 0;
+            }
+
+            frameEnc->m_encData->m_slice->m_iPPSQpMinus26 = m_iPPSQpMinus26;
+            frameEnc->m_encData->m_slice->numRefIdxDefault[0] = m_pps.numRefIdxDefault[0];
+            frameEnc->m_encData->m_slice->numRefIdxDefault[1] = m_pps.numRefIdxDefault[1];
+            frameEnc->m_encData->m_slice->m_iNumRPSInSPS = m_sps.spsrpsNum;
+
             curEncoder->m_rce.encodeOrder = frameEnc->m_encodeOrder = m_encodedFrameNum++;
             if (m_bframeDelay)
             {
@@ -1031,6 +1083,13 @@
 
         x265_log(m_param, X265_LOG_INFO, "lossless compression ratio %.2f::1\n", uncompressed / m_analyzeAll.m_accBits);
     }
+    if (m_param->bMultiPassOptRPS && m_param->rc.bStatRead)
+    {
+        x265_log(m_param, X265_LOG_INFO, "RPS in SPS: %d frames (%.2f%%), RPS not in SPS: %d frames (%.2f%%)\n", 
+            m_rpsInSpsCount, (float)100.0 * m_rpsInSpsCount / m_rateControl->m_numEntries, 
+            m_rateControl->m_numEntries - m_rpsInSpsCount, 
+            (float)100.0 * (m_rateControl->m_numEntries - m_rpsInSpsCount) / m_rateControl->m_numEntries);
+    }
 
     if (m_analyzeAll.m_numPics)
     {
@@ -1353,6 +1412,7 @@
         frameStats->qp = curEncData.m_avgQpAq;
         frameStats->bits = bits;
         frameStats->bScenecut = curFrame->m_lowres.bScenecut;
+        frameStats->bufferFill = m_rateControl->m_bufferFillActual;
         frameStats->frameLatency = inPoc - poc;
         if (m_param->rc.rateControlMode == X265_RC_CRF)
             frameStats->rateFactor = curEncData.m_rateFactor;
@@ -1413,6 +1473,66 @@
 #pragma warning(disable: 4127) // conditional expression is constant
 #endif
 
+void Encoder::initRefIdx() // Zero every bucket of the numRefIdxl0 / numRefIdxl1 counters held in m_refIdxLastGOP.
+{
+    int j = 0;
+
+    for (j = 0; j < MAX_NUM_REF_IDX; j++) // walk all MAX_NUM_REF_IDX buckets of both arrays
+    {
+        m_refIdxLastGOP.numRefIdxl0[j] = 0;
+        m_refIdxLastGOP.numRefIdxl1[j] = 0;
+    }
+
+    return; // numRefIdxDefault[] is not modified by this function
+}
+
+void Encoder::analyseRefIdx(int *numRefIdx) // Record one observation: numRefIdx[0] and numRefIdx[1] each increment their matching counter bucket.
+{
+    int i_l0 = 0;
+    int i_l1 = 0;
+
+    i_l0 = numRefIdx[0]; // value reported for the first list (the caller in this diff passes slice->m_numRefIdx)
+    i_l1 = numRefIdx[1]; // value reported for the second list
+
+    if ((0 < i_l0) && (MAX_NUM_REF_IDX > i_l0)) // only 1 .. MAX_NUM_REF_IDX-1 is counted, which also keeps the array index in bounds
+        m_refIdxLastGOP.numRefIdxl0[i_l0]++;
+    if ((0 < i_l1) && (MAX_NUM_REF_IDX > i_l1)) // same range guard for the second list
+        m_refIdxLastGOP.numRefIdxl1[i_l1]++;
+
+    return; // no locking here; the caller shown in this diff takes m_sliceRefIdxLock first
+}
+
+void Encoder::updateRefIdx() // Choose, per list, the value with the highest tally as the new default, copy it into m_pps, then clear the tallies.
+{
+    int i_max_l0 = 0; // highest tally seen so far for list 0
+    int i_max_l1 = 0; // highest tally seen so far for list 1
+    int j = 0;
+
+    i_max_l0 = 0;
+    i_max_l1 = 0;
+    m_refIdxLastGOP.numRefIdxDefault[0] = 1; // stays 1 when no bucket holds a non-zero tally
+    m_refIdxLastGOP.numRefIdxDefault[1] = 1;
+    for (j = 0; j < MAX_NUM_REF_IDX; j++)
+    {
+        if (i_max_l0 < m_refIdxLastGOP.numRefIdxl0[j]) // strict '<' means the lowest j wins a tie
+        {
+            i_max_l0 = m_refIdxLastGOP.numRefIdxl0[j];
+            m_refIdxLastGOP.numRefIdxDefault[0] = j;
+        }
+        if (i_max_l1 < m_refIdxLastGOP.numRefIdxl1[j]) // same selection rule for list 1
+        {
+            i_max_l1 = m_refIdxLastGOP.numRefIdxl1[j];
+            m_refIdxLastGOP.numRefIdxDefault[1] = j;
+        }
+    }
+
+    m_pps.numRefIdxDefault[0] = m_refIdxLastGOP.numRefIdxDefault[0]; // codePPS() writes numRefIdxDefault[] - 1 as num_ref_idx_l*_default_active_minus1
+    m_pps.numRefIdxDefault[1] = m_refIdxLastGOP.numRefIdxDefault[1];
+    initRefIdx(); // clear the tallies so later analyseRefIdx() calls start from zero
+
+    return;
+}
+
 void Encoder::getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs)
 {
     sbacCoder.setBitstream(&bs);
@@ -1429,7 +1549,7 @@
     list.serialize(NAL_UNIT_SPS, bs);
 
     bs.resetBits();
-    sbacCoder.codePPS(m_pps, (m_param->maxSlices <= 1));
+    sbacCoder.codePPS( m_pps, (m_param->maxSlices <= 1), m_iPPSQpMinus26);
     bs.writeByteAlignment();
     list.serialize(NAL_UNIT_PPS, bs);
 
@@ -1458,9 +1578,9 @@
         list.serialize(NAL_UNIT_PREFIX_SEI, bs);
     }
 
-    if (!m_param->bDiscardSEI && m_param->bEmitInfoSEI)
+    if (m_param->bEmitInfoSEI)
     {
-        char *opts = x265_param2string(m_param);
+        char *opts = x265_param2string(m_param, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset);
         if (opts)
​

x265_2.1.tar.gz/source/encoder/encoder.h -> x265_2.2.tar.gz/source/encoder/encoder.h Changed

@@ -26,6 +26,7 @@
 
 #include "common.h"
 #include "slice.h"
+#include "threading.h"
 #include "scalinglist.h"
 #include "x265.h"
 #include "nal.h"
@@ -69,6 +70,24 @@
     void addSsim(double ssim);
 };
 
+#define MAX_NUM_REF_IDX 64
+
+struct RefIdxLastGOP // Tallies Encoder keeps in m_refIdxLastGOP to choose default reference counts for the PPS.
+{
+    int numRefIdxDefault[2]; // chosen default for list 0 / list 1, written by Encoder::updateRefIdx()
+    int numRefIdxl0[MAX_NUM_REF_IDX]; // numRefIdxl0[n] = how many times n was reported for list 0 (see Encoder::analyseRefIdx)
+    int numRefIdxl1[MAX_NUM_REF_IDX]; // same tally for list 1
+};
+
+struct RPSListNode // NOTE(review): next/prior suggest a doubly linked list node; its users are not visible in this diff excerpt - confirm
+{
+    int idx;
+    int count;
+    RPS* rps;
+    RPSListNode* next;
+    RPSListNode* prior;
+};
+
 class FrameEncoder;
 class DPB;
 class Lookahead;
@@ -136,6 +155,19 @@
      * one is done. Requires bIntraRefresh to be set.*/
     int                m_bQueuedIntraRefresh;
 
+    /* For optimising slice QP */
+    Lock               m_sliceQpLock;
+    int                m_iFrameNum;   
+    int                m_iPPSQpMinus26;
+    int                m_iLastSliceQp;
+    int64_t            m_iBitsCostSum[QP_MAX_MAX + 1];
+
+    Lock               m_sliceRefIdxLock;
+    RefIdxLastGOP      m_refIdxLastGOP;
+
+    Lock               m_rpsInSpsLock;
+    int                m_rpsInSpsCount;
+
     Encoder();
     ~Encoder() {}
 
@@ -173,6 +205,11 @@
 
     void calcRefreshInterval(Frame* frameEnc);
 
+    void initRefIdx();
+    void analyseRefIdx(int *numRefIdx);
+    void updateRefIdx();
+    bool computeSPSRPSIndex();
+
 protected:
 
     void initVPS(VPS *vps);

 
@@ -26,6 +26,7 @@
 
 #include "common.h"
 #include "slice.h"
+#include "threading.h"
 #include "scalinglist.h"
 #include "x265.h"
 #include "nal.h"
@@ -69,6 +70,24 @@
     void addSsim(double ssim);
 };
 
+#define MAX_NUM_REF_IDX 64
+
+struct RefIdxLastGOP // Tallies Encoder keeps in m_refIdxLastGOP to choose default reference counts for the PPS.
+{
+    int numRefIdxDefault[2]; // chosen default for list 0 / list 1, written by Encoder::updateRefIdx()
+    int numRefIdxl0[MAX_NUM_REF_IDX]; // numRefIdxl0[n] = how many times n was reported for list 0 (see Encoder::analyseRefIdx)
+    int numRefIdxl1[MAX_NUM_REF_IDX]; // same tally for list 1
+};
+
+struct RPSListNode // NOTE(review): next/prior suggest a doubly linked list node; its users are not visible in this diff excerpt - confirm
+{
+    int idx;
+    int count;
+    RPS* rps;
+    RPSListNode* next;
+    RPSListNode* prior;
+};
+
 class FrameEncoder;
 class DPB;
 class Lookahead;
@@ -136,6 +155,19 @@
      * one is done. Requires bIntraRefresh to be set.*/
     int                m_bQueuedIntraRefresh;
 
+    /* For optimising slice QP */
+    Lock               m_sliceQpLock;
+    int                m_iFrameNum;   
+    int                m_iPPSQpMinus26;
+    int                m_iLastSliceQp;
+    int64_t            m_iBitsCostSum[QP_MAX_MAX + 1];
+
+    Lock               m_sliceRefIdxLock;
+    RefIdxLastGOP      m_refIdxLastGOP;
+
+    Lock               m_rpsInSpsLock;
+    int                m_rpsInSpsCount;
+
     Encoder();
     ~Encoder() {}
 
@@ -173,6 +205,11 @@
 
     void calcRefreshInterval(Frame* frameEnc);
 
+    void initRefIdx();
+    void analyseRefIdx(int *numRefIdx);
+    void updateRefIdx();
+    bool computeSPSRPSIndex();
+
 protected:
 
     void initVPS(VPS *vps);
​

x265_2.1.tar.gz/source/encoder/entropy.cpp -> x265_2.2.tar.gz/source/encoder/entropy.cpp Changed

@@ -312,19 +312,21 @@
     WRITE_FLAG(sps.bUseSAO, "sample_adaptive_offset_enabled_flag");
 
     WRITE_FLAG(0, "pcm_enabled_flag");
-    WRITE_UVLC(0, "num_short_term_ref_pic_sets");
+    WRITE_UVLC(sps.spsrpsNum, "num_short_term_ref_pic_sets");
+    for (int i = 0; i < sps.spsrpsNum; i++)
+        codeShortTermRefPicSet(sps.spsrps[i], i);
     WRITE_FLAG(0, "long_term_ref_pics_present_flag");
 
     WRITE_FLAG(sps.bTemporalMVPEnabled, "sps_temporal_mvp_enable_flag");
     WRITE_FLAG(sps.bUseStrongIntraSmoothing, "sps_strong_intra_smoothing_enable_flag");
 
     WRITE_FLAG(1, "vui_parameters_present_flag");
-    codeVUI(sps.vuiParameters, sps.maxTempSubLayers, sps.bDiscardOptionalVUI);
+    codeVUI(sps.vuiParameters, sps.maxTempSubLayers, sps.bEmitVUITimingInfo, sps.bEmitVUIHRDInfo);
 
     WRITE_FLAG(0, "sps_extension_flag");
 }
 
-void Entropy::codePPS(const PPS& pps, bool filerAcross)
+void Entropy::codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26 )
 {
     WRITE_UVLC(0,                          "pps_pic_parameter_set_id");
     WRITE_UVLC(0,                          "pps_seq_parameter_set_id");
@@ -333,10 +335,10 @@
     WRITE_CODE(0, 3,                       "num_extra_slice_header_bits");
     WRITE_FLAG(pps.bSignHideEnabled,       "sign_data_hiding_flag");
     WRITE_FLAG(0,                          "cabac_init_present_flag");
-    WRITE_UVLC(0,                          "num_ref_idx_l0_default_active_minus1");
-    WRITE_UVLC(0,                          "num_ref_idx_l1_default_active_minus1");
+    WRITE_UVLC(pps.numRefIdxDefault[0] - 1, "num_ref_idx_l0_default_active_minus1");
+    WRITE_UVLC(pps.numRefIdxDefault[1] - 1, "num_ref_idx_l1_default_active_minus1");
 
-    WRITE_SVLC(0, "init_qp_minus26");
+    WRITE_SVLC(iPPSInitQpMinus26,         "init_qp_minus26");
     WRITE_FLAG(pps.bConstrainedIntraPred, "constrained_intra_pred_flag");
     WRITE_FLAG(pps.bTransformSkipEnabled, "transform_skip_enabled_flag");
 
@@ -422,7 +424,7 @@
     }
 }
 
-void Entropy::codeVUI(const VUI& vui, int maxSubTLayers, bool bDiscardOptionalVUI)
+void Entropy::codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo)
 {
     WRITE_FLAG(vui.aspectRatioInfoPresentFlag, "aspect_ratio_info_present_flag");
     if (vui.aspectRatioInfoPresentFlag)
@@ -473,7 +475,7 @@
         WRITE_UVLC(vui.defaultDisplayWindow.bottomOffset, "def_disp_win_bottom_offset");
     }
 
-    if (bDiscardOptionalVUI)
+    if (!bEmitVUITimingInfo)
         WRITE_FLAG(0, "vui_timing_info_present_flag");
     else
     {
@@ -483,7 +485,7 @@
         WRITE_FLAG(0, "vui_poc_proportional_to_timing_flag");
     }
 
-    if (bDiscardOptionalVUI)
+    if (!bEmitVUIHRDInfo)
         WRITE_FLAG(0, "vui_hrd_parameters_present_flag");
     else
     {
@@ -614,8 +616,21 @@
             }
 #endif
 
-        WRITE_FLAG(0, "short_term_ref_pic_set_sps_flag");
-        codeShortTermRefPicSet(slice.m_rps);
+        if (slice.m_rpsIdx < 0)
+        {
+            WRITE_FLAG(0, "short_term_ref_pic_set_sps_flag");
+            codeShortTermRefPicSet(slice.m_rps, slice.m_sps->spsrpsNum);
+        }
+        else
+        {
+            WRITE_FLAG(1, "short_term_ref_pic_set_sps_flag");
+            int numBits = 0;
+            while ((1 << numBits) < slice.m_iNumRPSInSPS)
+                numBits++;
+
+            if (numBits > 0)
+                WRITE_CODE(slice.m_rpsIdx, numBits, "short_term_ref_pic_set_idx");
+        }
 
         if (slice.m_sps->bTemporalMVPEnabled)
             WRITE_FLAG(1, "slice_temporal_mvp_enable_flag");
@@ -633,7 +648,7 @@
 
     if (!slice.isIntra())
     {
-        bool overrideFlag = (slice.m_numRefIdx[0] != 1 || (slice.isInterB() && slice.m_numRefIdx[1] != 1));
+        bool overrideFlag = (slice.m_numRefIdx[0] != slice.numRefIdxDefault[0] || (slice.isInterB() && slice.m_numRefIdx[1] != slice.numRefIdxDefault[1]));
         WRITE_FLAG(overrideFlag, "num_ref_idx_active_override_flag");
         if (overrideFlag)
         {
@@ -673,7 +688,7 @@
     if (!slice.isIntra())
         WRITE_UVLC(MRG_MAX_NUM_CANDS - slice.m_maxNumMergeCand, "five_minus_max_num_merge_cand");
 
-    int code = sliceQp - 26;
+    int code = sliceQp - (slice.m_iPPSQpMinus26 + 26);
     WRITE_SVLC(code, "slice_qp_delta");
 
     // TODO: Enable when pps_loop_filter_across_slices_enabled_flag==1
@@ -707,8 +722,11 @@
         WRITE_CODE(substreamSizes[i] - 1, offsetLen, "entry_point_offset_minus1");
 }
 
-void Entropy::codeShortTermRefPicSet(const RPS& rps)
+void Entropy::codeShortTermRefPicSet(const RPS& rps, int idx)
 {
+    if (idx > 0)
+        WRITE_FLAG(0, "inter_ref_pic_set_prediction_flag");
+
     WRITE_UVLC(rps.numberOfNegativePictures, "num_negative_pics");
     WRITE_UVLC(rps.numberOfPositivePictures, "num_positive_pics");
     int prev = 0;

 
@@ -312,19 +312,21 @@
     WRITE_FLAG(sps.bUseSAO, "sample_adaptive_offset_enabled_flag");
 
     WRITE_FLAG(0, "pcm_enabled_flag");
-    WRITE_UVLC(0, "num_short_term_ref_pic_sets");
+    WRITE_UVLC(sps.spsrpsNum, "num_short_term_ref_pic_sets");
+    for (int i = 0; i < sps.spsrpsNum; i++)
+        codeShortTermRefPicSet(sps.spsrps[i], i);
     WRITE_FLAG(0, "long_term_ref_pics_present_flag");
 
     WRITE_FLAG(sps.bTemporalMVPEnabled, "sps_temporal_mvp_enable_flag");
     WRITE_FLAG(sps.bUseStrongIntraSmoothing, "sps_strong_intra_smoothing_enable_flag");
 
     WRITE_FLAG(1, "vui_parameters_present_flag");
-    codeVUI(sps.vuiParameters, sps.maxTempSubLayers, sps.bDiscardOptionalVUI);
+    codeVUI(sps.vuiParameters, sps.maxTempSubLayers, sps.bEmitVUITimingInfo, sps.bEmitVUIHRDInfo);
 
     WRITE_FLAG(0, "sps_extension_flag");
 }
 
-void Entropy::codePPS(const PPS& pps, bool filerAcross)
+void Entropy::codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26 )
 {
     WRITE_UVLC(0,                          "pps_pic_parameter_set_id");
     WRITE_UVLC(0,                          "pps_seq_parameter_set_id");
@@ -333,10 +335,10 @@
     WRITE_CODE(0, 3,                       "num_extra_slice_header_bits");
     WRITE_FLAG(pps.bSignHideEnabled,       "sign_data_hiding_flag");
     WRITE_FLAG(0,                          "cabac_init_present_flag");
-    WRITE_UVLC(0,                          "num_ref_idx_l0_default_active_minus1");
-    WRITE_UVLC(0,                          "num_ref_idx_l1_default_active_minus1");
+    WRITE_UVLC(pps.numRefIdxDefault[0] - 1, "num_ref_idx_l0_default_active_minus1");
+    WRITE_UVLC(pps.numRefIdxDefault[1] - 1, "num_ref_idx_l1_default_active_minus1");
 
-    WRITE_SVLC(0, "init_qp_minus26");
+    WRITE_SVLC(iPPSInitQpMinus26,         "init_qp_minus26");
     WRITE_FLAG(pps.bConstrainedIntraPred, "constrained_intra_pred_flag");
     WRITE_FLAG(pps.bTransformSkipEnabled, "transform_skip_enabled_flag");
 
@@ -422,7 +424,7 @@
     }
 }
 
-void Entropy::codeVUI(const VUI& vui, int maxSubTLayers, bool bDiscardOptionalVUI)
+void Entropy::codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo)
 {
     WRITE_FLAG(vui.aspectRatioInfoPresentFlag, "aspect_ratio_info_present_flag");
     if (vui.aspectRatioInfoPresentFlag)
@@ -473,7 +475,7 @@
         WRITE_UVLC(vui.defaultDisplayWindow.bottomOffset, "def_disp_win_bottom_offset");
     }
 
-    if (bDiscardOptionalVUI)
+    if (!bEmitVUITimingInfo)
         WRITE_FLAG(0, "vui_timing_info_present_flag");
     else
     {
@@ -483,7 +485,7 @@
         WRITE_FLAG(0, "vui_poc_proportional_to_timing_flag");
     }
 
-    if (bDiscardOptionalVUI)
+    if (!bEmitVUIHRDInfo)
         WRITE_FLAG(0, "vui_hrd_parameters_present_flag");
     else
     {
@@ -614,8 +616,21 @@
             }
 #endif
 
-        WRITE_FLAG(0, "short_term_ref_pic_set_sps_flag");
-        codeShortTermRefPicSet(slice.m_rps);
+        if (slice.m_rpsIdx < 0)
+        {
+            WRITE_FLAG(0, "short_term_ref_pic_set_sps_flag");
+            codeShortTermRefPicSet(slice.m_rps, slice.m_sps->spsrpsNum);
+        }
+        else
+        {
+            WRITE_FLAG(1, "short_term_ref_pic_set_sps_flag");
+            int numBits = 0;
+            while ((1 << numBits) < slice.m_iNumRPSInSPS)
+                numBits++;
+
+            if (numBits > 0)
+                WRITE_CODE(slice.m_rpsIdx, numBits, "short_term_ref_pic_set_idx");
+        }
 
         if (slice.m_sps->bTemporalMVPEnabled)
             WRITE_FLAG(1, "slice_temporal_mvp_enable_flag");
@@ -633,7 +648,7 @@
 
     if (!slice.isIntra())
     {
-        bool overrideFlag = (slice.m_numRefIdx[0] != 1 || (slice.isInterB() && slice.m_numRefIdx[1] != 1));
+        bool overrideFlag = (slice.m_numRefIdx[0] != slice.numRefIdxDefault[0] || (slice.isInterB() && slice.m_numRefIdx[1] != slice.numRefIdxDefault[1]));
         WRITE_FLAG(overrideFlag, "num_ref_idx_active_override_flag");
         if (overrideFlag)
         {
@@ -673,7 +688,7 @@
     if (!slice.isIntra())
         WRITE_UVLC(MRG_MAX_NUM_CANDS - slice.m_maxNumMergeCand, "five_minus_max_num_merge_cand");
 
-    int code = sliceQp - 26;
+    int code = sliceQp - (slice.m_iPPSQpMinus26 + 26);
     WRITE_SVLC(code, "slice_qp_delta");
 
     // TODO: Enable when pps_loop_filter_across_slices_enabled_flag==1
@@ -707,8 +722,11 @@
         WRITE_CODE(substreamSizes[i] - 1, offsetLen, "entry_point_offset_minus1");
 }
 
-void Entropy::codeShortTermRefPicSet(const RPS& rps)
+void Entropy::codeShortTermRefPicSet(const RPS& rps, int idx)
 {
+    if (idx > 0)
+        WRITE_FLAG(0, "inter_ref_pic_set_prediction_flag");
+
     WRITE_UVLC(rps.numberOfNegativePictures, "num_negative_pics");
     WRITE_UVLC(rps.numberOfPositivePictures, "num_positive_pics");
     int prev = 0;
​

x265_2.1.tar.gz/source/encoder/entropy.h -> x265_2.2.tar.gz/source/encoder/entropy.h Changed

@@ -142,14 +142,14 @@
 
     void codeVPS(const VPS& vps);
     void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl);
-    void codePPS(const PPS& pps, bool filerAcross);
-    void codeVUI(const VUI& vui, int maxSubTLayers, bool discardOptionalVUI);
+    void codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26 );
+    void codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo);
     void codeAUD(const Slice& slice);
     void codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers);
 
     void codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp);
     void codeSliceHeaderWPPEntryPoints(const uint32_t *substreamSizes, uint32_t numSubStreams, uint32_t maxOffset);
-    void codeShortTermRefPicSet(const RPS& rps);
+    void codeShortTermRefPicSet(const RPS& rps, int idx);
     void finishSlice()                 { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }
 
     void encodeCTU(const CUData& cu, const CUGeom& cuGeom);

 
@@ -142,14 +142,14 @@
 
     void codeVPS(const VPS& vps);
     void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl);
-    void codePPS(const PPS& pps, bool filerAcross);
-    void codeVUI(const VUI& vui, int maxSubTLayers, bool discardOptionalVUI);
+    void codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26 );
+    void codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo);
     void codeAUD(const Slice& slice);
     void codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers);
 
     void codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp);
     void codeSliceHeaderWPPEntryPoints(const uint32_t *substreamSizes, uint32_t numSubStreams, uint32_t maxOffset);
-    void codeShortTermRefPicSet(const RPS& rps);
+    void codeShortTermRefPicSet(const RPS& rps, int idx);
     void finishSlice()                 { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }
 
     void encodeCTU(const CUData& cu, const CUGeom& cuGeom);
​

x265_2.1.tar.gz/source/encoder/frameencoder.cpp -> x265_2.2.tar.gz/source/encoder/frameencoder.cpp Changed

@@ -50,6 +50,7 @@
     m_bAllRowsStop = false;
     m_vbvResetTriggerRow = -1;
     m_outStreams = NULL;
+    m_backupStreams = NULL;
     m_substreamSizes = NULL;
     m_nr = NULL;
     m_tld = NULL;
@@ -85,6 +86,7 @@
 
     delete[] m_rows;
     delete[] m_outStreams;
+    delete[] m_backupStreams;
     X265_FREE(m_sliceBaseRow);
     X265_FREE(m_cuGeoms);
     X265_FREE(m_ctuGeomMap);
@@ -121,7 +123,7 @@
     int range  = m_param->searchRange;       /* fpel search */
     range += !!(m_param->searchMethod < 2);  /* diamond/hex range check lag */
     range += NTAPS_LUMA / 2;                 /* subpel filter half-length */
-    range += 2 + MotionEstimate::hpelIterationCount(m_param->subpelRefine) / 2; /* subpel refine steps */
+    range += 2 + (MotionEstimate::hpelIterationCount(m_param->subpelRefine) + 1) / 2; /* subpel refine steps */
     m_refLagRows = /*(m_param->maxSlices > 1 ? 1 : 0) +*/ 1 + ((range + g_maxCUSize - 1) / g_maxCUSize);
 
     // NOTE: 2 times of numRows because both Encoder and Filter in same queue
@@ -152,7 +154,7 @@
     // 7.4.7.1 - Ceil( Log2( PicSizeInCtbsY ) ) bits
     {
         unsigned long tmp;
-        CLZ(tmp, (numRows * numCols));
+        CLZ(tmp, (numRows * numCols - 1));
         m_sliceAddrBits = (uint16_t)(tmp + 1);
     }
 
@@ -305,6 +307,19 @@
     weightAnalyse(*frame->m_encData->m_slice, *frame, *master.m_param);
 }
 
+
+uint32_t getBsLength( int32_t code ) // Bit-length estimate for the signed value `code`; this diff uses it to accumulate m_iBitsCostSum[] per candidate QP.
+{
+    uint32_t ucode = (code <= 0) ? -code << 1 : (code << 1) - 1; // fold the sign: code <= 0 -> -2*code, code > 0 -> 2*code - 1
+
+    ++ucode; // a code of 0 therefore hands 1, not 0, to CLZ
+    unsigned long idx;
+    CLZ( idx, ucode ); // NOTE(review): presumably idx = index of the highest set bit of ucode - confirm against the CLZ macro
+    uint32_t length = (uint32_t)idx * 2 + 1; // under that assumption: idx prefix bits + 1 marker bit + idx suffix bits (Exp-Golomb length)
+
+    return length;
+}
+
 void FrameEncoder::compressFrame()
 {
     ProfileScopeEvent(frameThread);
@@ -340,7 +355,28 @@
         m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs);
     }
     if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders)
-        m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
+    {
+        if (m_param->bOptRefListLengthPPS)
+        {
+            ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
+            m_top->updateRefIdx();
+        }
+        if (m_top->m_param->rc.bStatRead  && m_top->m_param->bMultiPassOptRPS)
+        {
+            ScopedLock refIdxLock(m_top->m_rpsInSpsLock);
+            if (!m_top->computeSPSRPSIndex())
+            {
+                x265_log(m_param, X265_LOG_ERROR, "compute commonly RPS failed!\n");
+                m_top->m_aborted = true;
+            }
+            m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
+        }
+        else
+            m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
+    }
+
+    if (m_top->m_param->rc.bStatRead && m_top->m_param->bMultiPassOptRPS)
+        m_frame->m_encData->m_slice->m_rpsIdx = (m_top->m_rateControl->m_rce2Pass + m_frame->m_encodeOrder)->rpsIdx;
 
     // Weighted Prediction parameters estimation.
     bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred;
@@ -448,6 +484,19 @@
     /* Clip slice QP to 0-51 spec range before encoding */
     slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
 
+    if (m_param->bOptQpPPS && m_param->bRepeatHeaders)
+    {
+        ScopedLock qpLock(m_top->m_sliceQpLock);
+        for (int i = 0; i < (QP_MAX_MAX + 1); i++)
+        {
+            int delta = slice->m_sliceQp - (i + 1);
+            int codeLength = getBsLength( delta );
+            m_top->m_iBitsCostSum[i] += codeLength;
+        }
+        m_top->m_iFrameNum++;
+        m_top->m_iLastSliceQp = slice->m_sliceQp;
+    }
+
     m_initSliceContext.resetEntropy(*slice);
 
     m_frameFilter.start(m_frame, m_initSliceContext);
@@ -485,6 +534,8 @@
     if (!m_outStreams)
     {
         m_outStreams = new Bitstream[numSubstreams];
+        if (!m_param->bEnableWavefront)
+            m_backupStreams = new Bitstream[numSubstreams];
         m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams);
         if (!m_param->bEnableSAO)
             for (uint32_t i = 0; i < numSubstreams; i++)
@@ -498,7 +549,7 @@
 
     if (m_frame->m_lowres.bKeyframe)
     {
-        if (!m_param->bDiscardSEI && m_param->bEmitHRDSEI)
+        if (m_param->bEmitHRDSEI)
         {
             SEIBufferingPeriod* bpSei = &m_top->m_rateControl->m_bufPeriodSEI;
 
@@ -520,7 +571,7 @@
         }
     }
 
-    if (!m_param->bDiscardSEI && (m_param->bEmitHRDSEI || !!m_param->interlaceMode))
+    if ((m_param->bEmitHRDSEI || !!m_param->interlaceMode))
     {
         SEIPictureTiming *sei = m_rce.picTimingSEI;
         const VUI *vui = &slice->m_sps->vuiParameters;
@@ -556,22 +607,19 @@
     }
 
     /* Write user SEI */
-    if (!m_param->bDiscardSEI)
+    for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
     {
-        for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
-        {
-            x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
-            SEIuserDataUnregistered sei;
+        x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
+        SEIuserDataUnregistered sei;
 
-            sei.m_payloadType = payload->payloadType;
-            sei.m_userDataLength = payload->payloadSize;
-            sei.m_userData = payload->payload;
+        sei.m_payloadType = payload->payloadType;
+        sei.m_userDataLength = payload->payloadSize;
+        sei.m_userData = payload->payload;
 
-            m_bs.resetBits();
-            sei.write(m_bs, *slice->m_sps);
-            m_bs.writeByteAlignment();
-            m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
-        }
+        m_bs.resetBits();
+        sei.write(m_bs, *slice->m_sps);
+        m_bs.writeByteAlignment();
+        m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
     }
 
     /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to 
@@ -606,8 +654,7 @@
                 const uint32_t sliceEndRow = m_sliceBaseRow[sliceId + 1] - 1;
                 const uint32_t row = sliceStartRow + rowInSlice;
 
-                if (row >= m_numRows)
-                    break;
+                X265_CHECK(row < m_numRows, "slices row fault was detected");
 
                 if (row > sliceEndRow)
                     continue;
@@ -626,7 +673,7 @@
                             refpic->m_reconRowFlag[rowIdx].waitForChange(0);
 
                         if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
-                            m_mref[l][ref].applyWeight(row + m_refLagRows, m_numRows, sliceEndRow + 1, sliceId);
+                            m_mref[l][ref].applyWeight(rowIdx, m_numRows, sliceEndRow, sliceId);
                     }
                 }
 
@@ -666,7 +713,7 @@
                             refpic->m_reconRowFlag[rowIdx].waitForChange(0);
 
                         if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
-                            m_mref[list][ref].applyWeight(i + m_refLagRows, m_numRows, m_numRows, 0);
+                            m_mref[list][ref].applyWeight(rowIdx, m_numRows, m_numRows, 0);
                     }
                 }
 
@@ -830,6 +877,11 @@
             const uint32_t sliceAddr = nextSliceRow * m_numCols;
             //CUData* ctu = m_frame->m_encData->getPicCTU(sliceAddr);
             //const int sliceQp = ctu->m_qp[0];
+            if (m_param->bOptRefListLengthPPS)
+            {
+                ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
+                m_top->analyseRefIdx(slice->m_numRefIdx);

 
@@ -50,6 +50,7 @@
     m_bAllRowsStop = false;
     m_vbvResetTriggerRow = -1;
     m_outStreams = NULL;
+    m_backupStreams = NULL;
     m_substreamSizes = NULL;
     m_nr = NULL;
     m_tld = NULL;
@@ -85,6 +86,7 @@
 
     delete[] m_rows;
     delete[] m_outStreams;
+    delete[] m_backupStreams;
     X265_FREE(m_sliceBaseRow);
     X265_FREE(m_cuGeoms);
     X265_FREE(m_ctuGeomMap);
@@ -121,7 +123,7 @@
     int range  = m_param->searchRange;       /* fpel search */
     range += !!(m_param->searchMethod < 2);  /* diamond/hex range check lag */
     range += NTAPS_LUMA / 2;                 /* subpel filter half-length */
-    range += 2 + MotionEstimate::hpelIterationCount(m_param->subpelRefine) / 2; /* subpel refine steps */
+    range += 2 + (MotionEstimate::hpelIterationCount(m_param->subpelRefine) + 1) / 2; /* subpel refine steps */
     m_refLagRows = /*(m_param->maxSlices > 1 ? 1 : 0) +*/ 1 + ((range + g_maxCUSize - 1) / g_maxCUSize);
 
     // NOTE: 2 times of numRows because both Encoder and Filter in same queue
@@ -152,7 +154,7 @@
     // 7.4.7.1 - Ceil( Log2( PicSizeInCtbsY ) ) bits
     {
         unsigned long tmp;
-        CLZ(tmp, (numRows * numCols));
+        CLZ(tmp, (numRows * numCols - 1));
         m_sliceAddrBits = (uint16_t)(tmp + 1);
     }
 
@@ -305,6 +307,19 @@
     weightAnalyse(*frame->m_encData->m_slice, *frame, *master.m_param);
 }
 
+
+uint32_t getBsLength( int32_t code ) // Bit-length estimate for the signed value `code`; this diff uses it to accumulate m_iBitsCostSum[] per candidate QP.
+{
+    uint32_t ucode = (code <= 0) ? -code << 1 : (code << 1) - 1; // fold the sign: code <= 0 -> -2*code, code > 0 -> 2*code - 1
+
+    ++ucode; // a code of 0 therefore hands 1, not 0, to CLZ
+    unsigned long idx;
+    CLZ( idx, ucode ); // NOTE(review): presumably idx = index of the highest set bit of ucode - confirm against the CLZ macro
+    uint32_t length = (uint32_t)idx * 2 + 1; // under that assumption: idx prefix bits + 1 marker bit + idx suffix bits (Exp-Golomb length)
+
+    return length;
+}
+
 void FrameEncoder::compressFrame()
 {
     ProfileScopeEvent(frameThread);
@@ -340,7 +355,28 @@
         m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs);
     }
     if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders)
-        m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
+    {
+        if (m_param->bOptRefListLengthPPS)
+        {
+            ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
+            m_top->updateRefIdx();
+        }
+        if (m_top->m_param->rc.bStatRead  && m_top->m_param->bMultiPassOptRPS)
+        {
+            ScopedLock refIdxLock(m_top->m_rpsInSpsLock);
+            if (!m_top->computeSPSRPSIndex())
+            {
+                x265_log(m_param, X265_LOG_ERROR, "compute commonly RPS failed!\n");
+                m_top->m_aborted = true;
+            }
+            m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
+        }
+        else
+            m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
+    }
+
+    if (m_top->m_param->rc.bStatRead && m_top->m_param->bMultiPassOptRPS)
+        m_frame->m_encData->m_slice->m_rpsIdx = (m_top->m_rateControl->m_rce2Pass + m_frame->m_encodeOrder)->rpsIdx;
 
     // Weighted Prediction parameters estimation.
     bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred;
@@ -448,6 +484,19 @@
     /* Clip slice QP to 0-51 spec range before encoding */
     slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
 
+    if (m_param->bOptQpPPS && m_param->bRepeatHeaders)
+    {
+        ScopedLock qpLock(m_top->m_sliceQpLock);
+        for (int i = 0; i < (QP_MAX_MAX + 1); i++)
+        {
+            int delta = slice->m_sliceQp - (i + 1);
+            int codeLength = getBsLength( delta );
+            m_top->m_iBitsCostSum[i] += codeLength;
+        }
+        m_top->m_iFrameNum++;
+        m_top->m_iLastSliceQp = slice->m_sliceQp;
+    }
+
     m_initSliceContext.resetEntropy(*slice);
 
     m_frameFilter.start(m_frame, m_initSliceContext);
@@ -485,6 +534,8 @@
     if (!m_outStreams)
     {
         m_outStreams = new Bitstream[numSubstreams];
+        if (!m_param->bEnableWavefront)
+            m_backupStreams = new Bitstream[numSubstreams];
         m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams);
         if (!m_param->bEnableSAO)
             for (uint32_t i = 0; i < numSubstreams; i++)
@@ -498,7 +549,7 @@
 
     if (m_frame->m_lowres.bKeyframe)
     {
-        if (!m_param->bDiscardSEI && m_param->bEmitHRDSEI)
+        if (m_param->bEmitHRDSEI)
         {
             SEIBufferingPeriod* bpSei = &m_top->m_rateControl->m_bufPeriodSEI;
 
@@ -520,7 +571,7 @@
         }
     }
 
-    if (!m_param->bDiscardSEI && (m_param->bEmitHRDSEI || !!m_param->interlaceMode))
+    if ((m_param->bEmitHRDSEI || !!m_param->interlaceMode))
     {
         SEIPictureTiming *sei = m_rce.picTimingSEI;
         const VUI *vui = &slice->m_sps->vuiParameters;
@@ -556,22 +607,19 @@
     }
 
     /* Write user SEI */
-    if (!m_param->bDiscardSEI)
+    for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
     {
-        for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
-        {
-            x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
-            SEIuserDataUnregistered sei;
+        x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
+        SEIuserDataUnregistered sei;
 
-            sei.m_payloadType = payload->payloadType;
-            sei.m_userDataLength = payload->payloadSize;
-            sei.m_userData = payload->payload;
+        sei.m_payloadType = payload->payloadType;
+        sei.m_userDataLength = payload->payloadSize;
+        sei.m_userData = payload->payload;
 
-            m_bs.resetBits();
-            sei.write(m_bs, *slice->m_sps);
-            m_bs.writeByteAlignment();
-            m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
-        }
+        m_bs.resetBits();
+        sei.write(m_bs, *slice->m_sps);
+        m_bs.writeByteAlignment();
+        m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
     }
 
     /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to 
@@ -606,8 +654,7 @@
                 const uint32_t sliceEndRow = m_sliceBaseRow[sliceId + 1] - 1;
                 const uint32_t row = sliceStartRow + rowInSlice;
 
-                if (row >= m_numRows)
-                    break;
+                X265_CHECK(row < m_numRows, "slices row fault was detected");
 
                 if (row > sliceEndRow)
                     continue;
@@ -626,7 +673,7 @@
                             refpic->m_reconRowFlag[rowIdx].waitForChange(0);
 
                         if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
-                            m_mref[l][ref].applyWeight(row + m_refLagRows, m_numRows, sliceEndRow + 1, sliceId);
+                            m_mref[l][ref].applyWeight(rowIdx, m_numRows, sliceEndRow, sliceId);
                     }
                 }
 
@@ -666,7 +713,7 @@
                             refpic->m_reconRowFlag[rowIdx].waitForChange(0);
 
                         if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
-                            m_mref[list][ref].applyWeight(i + m_refLagRows, m_numRows, m_numRows, 0);
+                            m_mref[list][ref].applyWeight(rowIdx, m_numRows, m_numRows, 0);
                     }
                 }
 
@@ -830,6 +877,11 @@
             const uint32_t sliceAddr = nextSliceRow * m_numCols;
             //CUData* ctu = m_frame->m_encData->getPicCTU(sliceAddr);
             //const int sliceQp = ctu->m_qp[0];
+            if (m_param->bOptRefListLengthPPS)
+            {
+                ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
+                m_top->analyseRefIdx(slice->m_numRefIdx);
​

x265_2.1.tar.gz/source/encoder/frameencoder.h -> x265_2.2.tar.gz/source/encoder/frameencoder.h Changed

 
@@ -184,6 +184,7 @@
     NoiseReduction*          m_nr;
     ThreadLocalData*         m_tld; /* for --no-wpp */
     Bitstream*               m_outStreams;
+    Bitstream*               m_backupStreams;
     uint32_t*                m_substreamSizes;
 
     CUGeom*                  m_cuGeoms;
​

x265_2.1.tar.gz/source/encoder/framefilter.cpp -> x265_2.2.tar.gz/source/encoder/framefilter.cpp Changed

@@ -35,6 +35,109 @@
 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
 
+/* Horizontal pass of one integral-plane row with a 4-pixel window.
+ * For every x in [0, stride - 4):
+ *     sum[x] = pix[x] + ... + pix[x + 3] + sum[x - stride]
+ * Reads pix[0 .. stride - 1] and sum[x - stride] (entries located before `sum`);
+ * writes sum[0 .. stride - 5]. */
+static void integral_init4h(uint32_t *sum, pixel *pix, intptr_t stride)
+{
+    const int width = 4;
+
+    /* sum of the first window */
+    int32_t v = 0;
+    for (int i = 0; i < width; i++)
+        v += pix[i];
+
+    /* The counter has the same type as the bound: a 16-bit counter would wrap before
+     * reaching stride - width once that exceeds 32767, and the loop would never end. */
+    for (intptr_t x = 0; x < stride - width; x++)
+    {
+        sum[x] = v + sum[x - stride];
+        v += pix[x + width] - pix[x]; /* slide the window one pixel to the right */
+    }
+}
+
+/* Horizontal pass of one integral-plane row with an 8-pixel window.
+ * For every x in [0, stride - 8):
+ *     sum[x] = pix[x] + ... + pix[x + 7] + sum[x - stride]
+ * Reads pix[0 .. stride - 1] and sum[x - stride] (entries located before `sum`);
+ * writes sum[0 .. stride - 9]. */
+static void integral_init8h(uint32_t *sum, pixel *pix, intptr_t stride)
+{
+    const int width = 8;
+
+    /* sum of the first window */
+    int32_t v = 0;
+    for (int i = 0; i < width; i++)
+        v += pix[i];
+
+    /* The counter has the same type as the bound: a 16-bit counter would wrap before
+     * reaching stride - width once that exceeds 32767, and the loop would never end. */
+    for (intptr_t x = 0; x < stride - width; x++)
+    {
+        sum[x] = v + sum[x - stride];
+        v += pix[x + width] - pix[x]; /* slide the window one pixel to the right */
+    }
+}
+
+/* Horizontal pass of one integral-plane row with a 12-pixel window.
+ * For every x in [0, stride - 12):
+ *     sum[x] = pix[x] + ... + pix[x + 11] + sum[x - stride]
+ * Reads pix[0 .. stride - 1] and sum[x - stride] (entries located before `sum`);
+ * writes sum[0 .. stride - 13]. */
+static void integral_init12h(uint32_t *sum, pixel *pix, intptr_t stride)
+{
+    const int width = 12;
+
+    /* sum of the first window */
+    int32_t v = 0;
+    for (int i = 0; i < width; i++)
+        v += pix[i];
+
+    /* The counter has the same type as the bound: a 16-bit counter would wrap before
+     * reaching stride - width once that exceeds 32767, and the loop would never end. */
+    for (intptr_t x = 0; x < stride - width; x++)
+    {
+        sum[x] = v + sum[x - stride];
+        v += pix[x + width] - pix[x]; /* slide the window one pixel to the right */
+    }
+}
+
+/* Horizontal pass of one integral-plane row with a 16-pixel window.
+ * For every x in [0, stride - 16):
+ *     sum[x] = pix[x] + ... + pix[x + 15] + sum[x - stride]
+ * Reads pix[0 .. stride - 1] and sum[x - stride] (entries located before `sum`);
+ * writes sum[0 .. stride - 17]. */
+static void integral_init16h(uint32_t *sum, pixel *pix, intptr_t stride)
+{
+    const int width = 16;
+
+    /* sum of the first window */
+    int32_t v = 0;
+    for (int i = 0; i < width; i++)
+        v += pix[i];
+
+    /* The counter has the same type as the bound: a 16-bit counter would wrap before
+     * reaching stride - width once that exceeds 32767, and the loop would never end. */
+    for (intptr_t x = 0; x < stride - width; x++)
+    {
+        sum[x] = v + sum[x - stride];
+        v += pix[x + width] - pix[x]; /* slide the window one pixel to the right */
+    }
+}
+
+/* Horizontal pass of one integral-plane row with a 24-pixel window.
+ * For every x in [0, stride - 24):
+ *     sum[x] = pix[x] + ... + pix[x + 23] + sum[x - stride]
+ * Reads pix[0 .. stride - 1] and sum[x - stride] (entries located before `sum`);
+ * writes sum[0 .. stride - 25]. */
+static void integral_init24h(uint32_t *sum, pixel *pix, intptr_t stride)
+{
+    const int width = 24;
+
+    /* sum of the first window */
+    int32_t v = 0;
+    for (int i = 0; i < width; i++)
+        v += pix[i];
+
+    /* The counter has the same type as the bound: a 16-bit counter would wrap before
+     * reaching stride - width once that exceeds 32767, and the loop would never end. */
+    for (intptr_t x = 0; x < stride - width; x++)
+    {
+        sum[x] = v + sum[x - stride];
+        v += pix[x + width] - pix[x]; /* slide the window one pixel to the right */
+    }
+}
+
+/* Horizontal pass of one integral-plane row with a 32-pixel window.
+ * For every x in [0, stride - 32):
+ *     sum[x] = pix[x] + ... + pix[x + 31] + sum[x - stride]
+ * Reads pix[0 .. stride - 1] and sum[x - stride] (entries located before `sum`);
+ * writes sum[0 .. stride - 33]. */
+static void integral_init32h(uint32_t *sum, pixel *pix, intptr_t stride)
+{
+    const int width = 32;
+
+    /* sum of the first window */
+    int32_t v = 0;
+    for (int i = 0; i < width; i++)
+        v += pix[i];
+
+    /* The counter has the same type as the bound: a 16-bit counter would wrap before
+     * reaching stride - width once that exceeds 32767, and the loop would never end. */
+    for (intptr_t x = 0; x < stride - width; x++)
+    {
+        sum[x] = v + sum[x - stride];
+        v += pix[x + width] - pix[x]; /* slide the window one pixel to the right */
+    }
+}
+
+/* Vertical pass, in place: each of the first `stride` entries of sum4 is replaced by
+ * (the entry 4 * stride elements further on) minus (itself), in uint32_t arithmetic. */
+static void integral_init4v(uint32_t *sum4, intptr_t stride)
+{
+    const intptr_t ahead = 4 * stride; /* element offset of the entry 4 strides ahead */
+    int col = 0;
+    while (col < stride)
+    {
+        sum4[col] = sum4[col + ahead] - sum4[col];
+        col++;
+    }
+}
+
+/* Vertical pass, in place: each of the first `stride` entries of sum8 is replaced by
+ * (the entry 8 * stride elements further on) minus (itself), in uint32_t arithmetic. */
+static void integral_init8v(uint32_t *sum8, intptr_t stride)
+{
+    const intptr_t ahead = 8 * stride; /* element offset of the entry 8 strides ahead */
+    int col = 0;
+    while (col < stride)
+    {
+        sum8[col] = sum8[col + ahead] - sum8[col];
+        col++;
+    }
+}
+
+/* Vertical pass, in place: each of the first `stride` entries of sum12 is replaced by
+ * (the entry 12 * stride elements further on) minus (itself), in uint32_t arithmetic. */
+static void integral_init12v(uint32_t *sum12, intptr_t stride)
+{
+    const intptr_t ahead = 12 * stride; /* element offset of the entry 12 strides ahead */
+    int col = 0;
+    while (col < stride)
+    {
+        sum12[col] = sum12[col + ahead] - sum12[col];
+        col++;
+    }
+}
+
+/* Vertical pass, in place: each of the first `stride` entries of sum16 is replaced by
+ * (the entry 16 * stride elements further on) minus (itself), in uint32_t arithmetic. */
+static void integral_init16v(uint32_t *sum16, intptr_t stride)
+{
+    const intptr_t ahead = 16 * stride; /* element offset of the entry 16 strides ahead */
+    int col = 0;
+    while (col < stride)
+    {
+        sum16[col] = sum16[col + ahead] - sum16[col];
+        col++;
+    }
+}
+
+/* Vertical pass, in place: each of the first `stride` entries of sum24 is replaced by
+ * (the entry 24 * stride elements further on) minus (itself), in uint32_t arithmetic. */
+static void integral_init24v(uint32_t *sum24, intptr_t stride)
+{
+    const intptr_t ahead = 24 * stride; /* element offset of the entry 24 strides ahead */
+    int col = 0;
+    while (col < stride)
+    {
+        sum24[col] = sum24[col + ahead] - sum24[col];
+        col++;
+    }
+}
+
+/* Vertical pass, in place: each of the first `stride` entries of sum32 is replaced by
+ * (the entry 32 * stride elements further on) minus (itself), in uint32_t arithmetic. */
+static void integral_init32v(uint32_t *sum32, intptr_t stride)
+{
+    const intptr_t ahead = 32 * stride; /* element offset of the entry 32 strides ahead */
+    int col = 0;
+    while (col < stride)
+    {
+        sum32[col] = sum32[col + ahead] - sum32[col];
+        col++;
+    }
+}
+
 void FrameFilter::destroy()
 {
     X265_FREE(m_ssimBuf);
@@ -65,6 +168,7 @@
     m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
     m_lastHeight = (m_param->sourceHeight % g_maxCUSize) ? (m_param->sourceHeight % g_maxCUSize) : g_maxCUSize;
     m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize;
+    integralCompleted.set(0);
 
     if (m_param->bEnableSsim)
         m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
@@ -499,14 +603,19 @@
     if (!ctu->m_bFirstRowInSlice)
         processPostRow(row - 1);
 
-    if (ctu->m_bLastRowInSlice)
-        processPostRow(row);
-
     // NOTE: slices parallelism will be execute out-of-order
-    int numRowFinished;
-    for(numRowFinished = 0; numRowFinished < m_numRows; numRowFinished++)
-        if (!m_frame->m_reconRowFlag[numRowFinished].get())
-            break;
+    int numRowFinished = 0;
+    if (m_frame->m_reconRowFlag)
+    {
+        for (numRowFinished = 0; numRowFinished < m_numRows; numRowFinished++)
+        {
+            if (!m_frame->m_reconRowFlag[numRowFinished].get())
+                break;
+
+            if (numRowFinished == row)
+                continue;
+        }
+    }
 
     if (numRowFinished == m_numRows)
     {
@@ -522,6 +631,9 @@
             m_parallelFilter[0].m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
         }
     }
+
+    if (ctu->m_bLastRowInSlice)
+        processPostRow(row);
 }
 
 void FrameFilter::processPostRow(int row)
@@ -656,6 +768,107 @@
         }
     } // end of (m_param->maxSlices == 1)
 
+    int lastRow = row == (int)m_frame->m_encData->m_slice->m_sps->numCuInHeight - 1;
+
+    /* generate integral planes for SEA motion search */
+    if (m_param->searchMethod == X265_SEA && m_frame->m_encData->m_meIntegral && m_frame->m_lowres.sliceType != X265_TYPE_B)
+    {
+        /* If WPP, other than first row, integral calculation for current row needs to wait till the
+        * integral for the previous row is computed */
+        if (m_param->bEnableWavefront && row)
+        {
+            while (m_parallelFilter[row - 1].m_frameFilter->integralCompleted.get() == 0)
+            {
+                m_parallelFilter[row - 1].m_frameFilter->integralCompleted.waitForChange(0);
+            }
+        }
+
+        int stride = (int)m_frame->m_reconPic->m_stride;
+        int padX = g_maxCUSize + 32;
+        int padY = g_maxCUSize + 16;
+        int numCuInHeight = m_frame->m_encData->m_slice->m_sps->numCuInHeight;
+        int maxHeight = numCuInHeight * g_maxCUSize;
+        int startRow = 0;
+
+        if (m_param->interlaceMode)
+            startRow = (row * g_maxCUSize >> 1);
+        else
+            startRow = row * g_maxCUSize;
+
+        int height = lastRow ? (maxHeight + g_maxCUSize * m_param->interlaceMode) : (((row + m_param->interlaceMode) * g_maxCUSize) + g_maxCUSize);
+
+        if (!row)
+        {
+            for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
+                memset(m_frame->m_encData->m_meIntegral[i] - padY * stride - padX, 0, stride * sizeof(uint32_t));
+            startRow = -padY;
+        }
+
+        if (lastRow)
+            height += padY - 1;
+
+        for (int y = startRow; y < height; y++)
+        {

 
@@ -35,6 +35,109 @@
 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
 
+/* Horizontal pass of one integral-plane row with a 4-pixel window.
+ * For every x in [0, stride - 4):
+ *     sum[x] = pix[x] + ... + pix[x + 3] + sum[x - stride]
+ * Reads pix[0 .. stride - 1] and sum[x - stride] (entries located before `sum`);
+ * writes sum[0 .. stride - 5]. */
+static void integral_init4h(uint32_t *sum, pixel *pix, intptr_t stride)
+{
+    const int width = 4;
+
+    /* sum of the first window */
+    int32_t v = 0;
+    for (int i = 0; i < width; i++)
+        v += pix[i];
+
+    /* The counter has the same type as the bound: a 16-bit counter would wrap before
+     * reaching stride - width once that exceeds 32767, and the loop would never end. */
+    for (intptr_t x = 0; x < stride - width; x++)
+    {
+        sum[x] = v + sum[x - stride];
+        v += pix[x + width] - pix[x]; /* slide the window one pixel to the right */
+    }
+}
+
+/* Horizontal pass of one integral-plane row with an 8-pixel window.
+ * For every x in [0, stride - 8):
+ *     sum[x] = pix[x] + ... + pix[x + 7] + sum[x - stride]
+ * Reads pix[0 .. stride - 1] and sum[x - stride] (entries located before `sum`);
+ * writes sum[0 .. stride - 9]. */
+static void integral_init8h(uint32_t *sum, pixel *pix, intptr_t stride)
+{
+    const int width = 8;
+
+    /* sum of the first window */
+    int32_t v = 0;
+    for (int i = 0; i < width; i++)
+        v += pix[i];
+
+    /* The counter has the same type as the bound: a 16-bit counter would wrap before
+     * reaching stride - width once that exceeds 32767, and the loop would never end. */
+    for (intptr_t x = 0; x < stride - width; x++)
+    {
+        sum[x] = v + sum[x - stride];
+        v += pix[x + width] - pix[x]; /* slide the window one pixel to the right */
+    }
+}
+
+/* Horizontal pass of one integral-plane row with a 12-pixel window.
+ * For every x in [0, stride - 12):
+ *     sum[x] = pix[x] + ... + pix[x + 11] + sum[x - stride]
+ * Reads pix[0 .. stride - 1] and sum[x - stride] (entries located before `sum`);
+ * writes sum[0 .. stride - 13]. */
+static void integral_init12h(uint32_t *sum, pixel *pix, intptr_t stride)
+{
+    const int width = 12;
+
+    /* sum of the first window */
+    int32_t v = 0;
+    for (int i = 0; i < width; i++)
+        v += pix[i];
+
+    /* The counter has the same type as the bound: a 16-bit counter would wrap before
+     * reaching stride - width once that exceeds 32767, and the loop would never end. */
+    for (intptr_t x = 0; x < stride - width; x++)
+    {
+        sum[x] = v + sum[x - stride];
+        v += pix[x + width] - pix[x]; /* slide the window one pixel to the right */
+    }
+}
+
+/* Horizontal pass of one integral-plane row with a 16-pixel window.
+ * For every x in [0, stride - 16):
+ *     sum[x] = pix[x] + ... + pix[x + 15] + sum[x - stride]
+ * Reads pix[0 .. stride - 1] and sum[x - stride] (entries located before `sum`);
+ * writes sum[0 .. stride - 17]. */
+static void integral_init16h(uint32_t *sum, pixel *pix, intptr_t stride)
+{
+    const int width = 16;
+
+    /* sum of the first window */
+    int32_t v = 0;
+    for (int i = 0; i < width; i++)
+        v += pix[i];
+
+    /* The counter has the same type as the bound: a 16-bit counter would wrap before
+     * reaching stride - width once that exceeds 32767, and the loop would never end. */
+    for (intptr_t x = 0; x < stride - width; x++)
+    {
+        sum[x] = v + sum[x - stride];
+        v += pix[x + width] - pix[x]; /* slide the window one pixel to the right */
+    }
+}
+
+/* Horizontal pass of one integral-plane row with a 24-pixel window.
+ * For every x in [0, stride - 24):
+ *     sum[x] = pix[x] + ... + pix[x + 23] + sum[x - stride]
+ * Reads pix[0 .. stride - 1] and sum[x - stride] (entries located before `sum`);
+ * writes sum[0 .. stride - 25]. */
+static void integral_init24h(uint32_t *sum, pixel *pix, intptr_t stride)
+{
+    const int width = 24;
+
+    /* sum of the first window */
+    int32_t v = 0;
+    for (int i = 0; i < width; i++)
+        v += pix[i];
+
+    /* The counter has the same type as the bound: a 16-bit counter would wrap before
+     * reaching stride - width once that exceeds 32767, and the loop would never end. */
+    for (intptr_t x = 0; x < stride - width; x++)
+    {
+        sum[x] = v + sum[x - stride];
+        v += pix[x + width] - pix[x]; /* slide the window one pixel to the right */
+    }
+}
+
+/* Horizontal pass of one integral-plane row with a 32-pixel window.
+ * For every x in [0, stride - 32):
+ *     sum[x] = pix[x] + ... + pix[x + 31] + sum[x - stride]
+ * Reads pix[0 .. stride - 1] and sum[x - stride] (entries located before `sum`);
+ * writes sum[0 .. stride - 33]. */
+static void integral_init32h(uint32_t *sum, pixel *pix, intptr_t stride)
+{
+    const int width = 32;
+
+    /* sum of the first window */
+    int32_t v = 0;
+    for (int i = 0; i < width; i++)
+        v += pix[i];
+
+    /* The counter has the same type as the bound: a 16-bit counter would wrap before
+     * reaching stride - width once that exceeds 32767, and the loop would never end. */
+    for (intptr_t x = 0; x < stride - width; x++)
+    {
+        sum[x] = v + sum[x - stride];
+        v += pix[x + width] - pix[x]; /* slide the window one pixel to the right */
+    }
+}
+
+/* Vertical pass, in place: each of the first `stride` entries of sum4 is replaced by
+ * (the entry 4 * stride elements further on) minus (itself), in uint32_t arithmetic. */
+static void integral_init4v(uint32_t *sum4, intptr_t stride)
+{
+    const intptr_t ahead = 4 * stride; /* element offset of the entry 4 strides ahead */
+    int col = 0;
+    while (col < stride)
+    {
+        sum4[col] = sum4[col + ahead] - sum4[col];
+        col++;
+    }
+}
+
+/* Vertical pass, in place: each of the first `stride` entries of sum8 is replaced by
+ * (the entry 8 * stride elements further on) minus (itself), in uint32_t arithmetic. */
+static void integral_init8v(uint32_t *sum8, intptr_t stride)
+{
+    const intptr_t ahead = 8 * stride; /* element offset of the entry 8 strides ahead */
+    int col = 0;
+    while (col < stride)
+    {
+        sum8[col] = sum8[col + ahead] - sum8[col];
+        col++;
+    }
+}
+
+/* Vertical pass, in place: each of the first `stride` entries of sum12 is replaced by
+ * (the entry 12 * stride elements further on) minus (itself), in uint32_t arithmetic. */
+static void integral_init12v(uint32_t *sum12, intptr_t stride)
+{
+    const intptr_t ahead = 12 * stride; /* element offset of the entry 12 strides ahead */
+    int col = 0;
+    while (col < stride)
+    {
+        sum12[col] = sum12[col + ahead] - sum12[col];
+        col++;
+    }
+}
+
+/* Vertical pass, in place: each of the first `stride` entries of sum16 is replaced by
+ * (the entry 16 * stride elements further on) minus (itself), in uint32_t arithmetic. */
+static void integral_init16v(uint32_t *sum16, intptr_t stride)
+{
+    const intptr_t ahead = 16 * stride; /* element offset of the entry 16 strides ahead */
+    int col = 0;
+    while (col < stride)
+    {
+        sum16[col] = sum16[col + ahead] - sum16[col];
+        col++;
+    }
+}
+
+/* Vertical pass, in place: each of the first `stride` entries of sum24 is replaced by
+ * (the entry 24 * stride elements further on) minus (itself), in uint32_t arithmetic. */
+static void integral_init24v(uint32_t *sum24, intptr_t stride)
+{
+    const intptr_t ahead = 24 * stride; /* element offset of the entry 24 strides ahead */
+    int col = 0;
+    while (col < stride)
+    {
+        sum24[col] = sum24[col + ahead] - sum24[col];
+        col++;
+    }
+}
+
+/* Vertical pass, in place: each of the first `stride` entries of sum32 is replaced by
+ * (the entry 32 * stride elements further on) minus (itself), in uint32_t arithmetic. */
+static void integral_init32v(uint32_t *sum32, intptr_t stride)
+{
+    const intptr_t ahead = 32 * stride; /* element offset of the entry 32 strides ahead */
+    int col = 0;
+    while (col < stride)
+    {
+        sum32[col] = sum32[col + ahead] - sum32[col];
+        col++;
+    }
+}
+
 void FrameFilter::destroy()
 {
     X265_FREE(m_ssimBuf);
@@ -65,6 +168,7 @@
     m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
     m_lastHeight = (m_param->sourceHeight % g_maxCUSize) ? (m_param->sourceHeight % g_maxCUSize) : g_maxCUSize;
     m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize;
+    integralCompleted.set(0);
 
     if (m_param->bEnableSsim)
         m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
@@ -499,14 +603,19 @@
     if (!ctu->m_bFirstRowInSlice)
         processPostRow(row - 1);
 
-    if (ctu->m_bLastRowInSlice)
-        processPostRow(row);
-
     // NOTE: slices parallelism will be execute out-of-order
-    int numRowFinished;
-    for(numRowFinished = 0; numRowFinished < m_numRows; numRowFinished++)
-        if (!m_frame->m_reconRowFlag[numRowFinished].get())
-            break;
+    int numRowFinished = 0;
+    if (m_frame->m_reconRowFlag)
+    {
+        for (numRowFinished = 0; numRowFinished < m_numRows; numRowFinished++)
+        {
+            if (!m_frame->m_reconRowFlag[numRowFinished].get())
+                break;
+
+            if (numRowFinished == row)
+                continue;
+        }
+    }
 
     if (numRowFinished == m_numRows)
     {
@@ -522,6 +631,9 @@
             m_parallelFilter[0].m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
         }
     }
+
+    if (ctu->m_bLastRowInSlice)
+        processPostRow(row);
 }
 
 void FrameFilter::processPostRow(int row)
@@ -656,6 +768,107 @@
         }
     } // end of (m_param->maxSlices == 1)
 
+    int lastRow = row == (int)m_frame->m_encData->m_slice->m_sps->numCuInHeight - 1;
+
+    /* generate integral planes for SEA motion search */
+    if (m_param->searchMethod == X265_SEA && m_frame->m_encData->m_meIntegral && m_frame->m_lowres.sliceType != X265_TYPE_B)
+    {
+        /* If WPP, other than first row, integral calculation for current row needs to wait till the
+        * integral for the previous row is computed */
+        if (m_param->bEnableWavefront && row)
+        {
+            while (m_parallelFilter[row - 1].m_frameFilter->integralCompleted.get() == 0)
+            {
+                m_parallelFilter[row - 1].m_frameFilter->integralCompleted.waitForChange(0);
+            }
+        }
+
+        int stride = (int)m_frame->m_reconPic->m_stride;
+        int padX = g_maxCUSize + 32;
+        int padY = g_maxCUSize + 16;
+        int numCuInHeight = m_frame->m_encData->m_slice->m_sps->numCuInHeight;
+        int maxHeight = numCuInHeight * g_maxCUSize;
+        int startRow = 0;
+
+        if (m_param->interlaceMode)
+            startRow = (row * g_maxCUSize >> 1);
+        else
+            startRow = row * g_maxCUSize;
+
+        int height = lastRow ? (maxHeight + g_maxCUSize * m_param->interlaceMode) : (((row + m_param->interlaceMode) * g_maxCUSize) + g_maxCUSize);
+
+        if (!row)
+        {
+            for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
+                memset(m_frame->m_encData->m_meIntegral[i] - padY * stride - padX, 0, stride * sizeof(uint32_t));
+            startRow = -padY;
+        }
+
+        if (lastRow)
+            height += padY - 1;
+
+        for (int y = startRow; y < height; y++)
+        {
​

x265_2.1.tar.gz/source/encoder/framefilter.h -> x265_2.2.tar.gz/source/encoder/framefilter.h Changed

 
@@ -57,6 +57,8 @@
     int           m_lastHeight;
     int           m_lastWidth;
     
+    ThreadSafeInteger integralCompleted;     /* check if integral calculation is completed in this row */
+
     void*         m_ssimBuf;        /* Temp storage for ssim computation */
 
 #define MAX_PFILTER_CUS     (4) /* maximum CUs for every thread */
​

x265_2.1.tar.gz/source/encoder/motion.cpp -> x265_2.2.tar.gz/source/encoder/motion.cpp Changed

@@ -109,6 +109,8 @@
     blockOffset = 0;
     bChromaSATD = false;
     chromaSatd = NULL;
+    for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
+        integral[i] = NULL;
 }
 
 void MotionEstimate::init(int csp)
@@ -165,10 +167,12 @@
     partEnum = partitionFromSizes(pwidth, pheight);
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
     sad = primitives.pu[partEnum].sad;
+    ads = primitives.pu[partEnum].ads;
     satd = primitives.pu[partEnum].satd;
     sad_x3 = primitives.pu[partEnum].sad_x3;
     sad_x4 = primitives.pu[partEnum].sad_x4;
 
+
     blockwidth = pwidth;
     blockOffset = offset;
     absPartIdx = ctuAddr = -1;
@@ -188,6 +192,7 @@
     partEnum = partitionFromSizes(pwidth, pheight);
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
     sad = primitives.pu[partEnum].sad;
+    ads = primitives.pu[partEnum].ads;
     satd = primitives.pu[partEnum].satd;
     sad_x3 = primitives.pu[partEnum].sad_x3;
     sad_x4 = primitives.pu[partEnum].sad_x4;
@@ -278,12 +283,31 @@
         costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
         costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
         costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
-        COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
-        COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
-        COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
-        COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
+        if ((omv.y + m0y >= mvmin.y) & (omv.y + m0y <= mvmax.y)) \
+            COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
+        if ((omv.y + m1y >= mvmin.y) & (omv.y + m1y <= mvmax.y)) \
+            COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
+        if ((omv.y + m2y >= mvmin.y) & (omv.y + m2y <= mvmax.y)) \
+            COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
+        if ((omv.y + m3y >= mvmin.y) & (omv.y + m3y <= mvmax.y)) \
+            COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
     }
 
+/* Cost three candidate positions given as absolute offsets from fref.
+ *
+ * A single sad_x3 call fills costs[0..2] for (m0x, m0y), (m1x, m1y) and (m2x, m2y).
+ * Only a horizontal MV cost is added to each, read from p_cost_mvx at the x offset
+ * shifted left by 2. COPY3_IF_LT is then invoked per candidate with bcost, the
+ * candidate cost and bmv.x / bmv.y paired with the candidate coordinates; presumably
+ * it updates all three when the candidate is cheaper (it is defined elsewhere --
+ * TODO confirm).
+ *
+ * Relies on fenc, fref, stride, costs, p_cost_mvx, bcost and bmv being in scope where
+ * the macro is expanded. Each argument is expanded more than once, so pass expressions
+ * without side effects. */
+#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
+{\
+    sad_x3(fenc, \
+    fref + (m0x) + (m0y) * stride, \
+    fref + (m1x) + (m1y) * stride, \
+    fref + (m2x) + (m2y) * stride, \
+    stride, costs); \
+    costs[0] += p_cost_mvx[(m0x) << 2]; /* no cost_mvy */\
+    costs[1] += p_cost_mvx[(m1x) << 2]; \
+    costs[2] += p_cost_mvx[(m2x) << 2]; \
+    COPY3_IF_LT(bcost, costs[0], bmv.x, m0x, bmv.y, m0y); \
+    COPY3_IF_LT(bcost, costs[1], bmv.x, m1x, bmv.y, m1y); \
+    COPY3_IF_LT(bcost, costs[2], bmv.x, m2x, bmv.y, m2y); \
+}
+
 #define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
     { \
         pixel *pix_base = fref + bmv.x + bmv.y * stride; \
@@ -627,6 +651,7 @@
         {
             bcost = cost;
             bmv = 0;
+            bmv.y = X265_MAX(X265_MIN(0, mvmax.y), mvmin.y);
         }
     }
 
@@ -659,8 +684,10 @@
         do
         {
             COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
-            COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
-            COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
+            if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
+                COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
+            if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
+                COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
             COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
             COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
             if (!(bcost & 15))
@@ -698,36 +725,57 @@
       /* equivalent to the above, but eliminates duplicate candidates */
         COST_MV_X3_DIR(-2, 0, -1, 2,  1, 2, costs);
         bcost <<= 3;
-        COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
-        COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
-        COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
+        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
+            COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
+        if ((bmv.y + 2 >= mvmin.y) & (bmv.y + 2 <= mvmax.y))
+        {
+            COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
+            COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
+        }
+
         COST_MV_X3_DIR(2, 0,  1, -2, -1, -2, costs);
-        COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
-        COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
-        COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
+        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
+            COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
+        if ((bmv.y - 2 >= mvmin.y) & (bmv.y - 2 <= mvmax.y))
+        {
+            COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
+            COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
+        }
 
         if (bcost & 7)
         {
             int dir = (bcost & 7) - 2;
-            bmv += hex2[dir + 1];
 
-            /* half hexagon, not overlapping the previous iteration */
-            for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
+            if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
             {
-                COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
-                               hex2[dir + 1].x, hex2[dir + 1].y,
-                               hex2[dir + 2].x, hex2[dir + 2].y,
-                               costs);
-                bcost &= ~7;
-                COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
-                COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
-                COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
-                if (!(bcost & 7))
-                    break;
-                dir += (bcost & 7) - 2;
-                dir = mod6m1[dir + 1];
                 bmv += hex2[dir + 1];
-            }
+
+                /* half hexagon, not overlapping the previous iteration */
+                for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
+                {
+                    COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
+                        hex2[dir + 1].x, hex2[dir + 1].y,
+                        hex2[dir + 2].x, hex2[dir + 2].y,
+                        costs);
+                    bcost &= ~7;
+
+                    if ((bmv.y + hex2[dir + 0].y >= mvmin.y) & (bmv.y + hex2[dir + 0].y <= mvmax.y))
+                        COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
+
+                    if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
+                        COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
+
+                    if ((bmv.y + hex2[dir + 2].y >= mvmin.y) & (bmv.y + hex2[dir + 2].y <= mvmax.y))
+                        COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
+
+                    if (!(bcost & 7))
+                        break;
+
+                    dir += (bcost & 7) - 2;
+                    dir = mod6m1[dir + 1];
+                    bmv += hex2[dir + 1];
+                }
+            } // if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
         }
         bcost >>= 3;
 #endif // if 0
@@ -735,15 +783,21 @@
         /* square refine */
         int dir = 0;
         COST_MV_X4_DIR(0, -1,  0, 1, -1, 0, 1, 0, costs);
-        COPY2_IF_LT(bcost, costs[0], dir, 1);
-        COPY2_IF_LT(bcost, costs[1], dir, 2);
+        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
+            COPY2_IF_LT(bcost, costs[0], dir, 1);
+        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
+            COPY2_IF_LT(bcost, costs[1], dir, 2);
         COPY2_IF_LT(bcost, costs[2], dir, 3);
         COPY2_IF_LT(bcost, costs[3], dir, 4);
         COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
-        COPY2_IF_LT(bcost, costs[0], dir, 5);
-        COPY2_IF_LT(bcost, costs[1], dir, 6);
-        COPY2_IF_LT(bcost, costs[2], dir, 7);
-        COPY2_IF_LT(bcost, costs[3], dir, 8);
+        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
+            COPY2_IF_LT(bcost, costs[0], dir, 5);
+        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
+            COPY2_IF_LT(bcost, costs[1], dir, 6);
+        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
+            COPY2_IF_LT(bcost, costs[2], dir, 7);
+        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
+            COPY2_IF_LT(bcost, costs[3], dir, 8);
         bmv += square1[dir];
         break;
     }
@@ -756,6 +810,7 @@
         /* refine predictors */
         omv = bmv;
         ucost1 = bcost;
+        X265_CHECK(((pmv.y >= mvmin.y) & (pmv.y <= mvmax.y)), "pmv outside of search range!");

 
@@ -109,6 +109,8 @@
     blockOffset = 0;
     bChromaSATD = false;
     chromaSatd = NULL;
+    for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
+        integral[i] = NULL;
 }
 
 void MotionEstimate::init(int csp)
@@ -165,10 +167,12 @@
     partEnum = partitionFromSizes(pwidth, pheight);
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
     sad = primitives.pu[partEnum].sad;
+    ads = primitives.pu[partEnum].ads;
     satd = primitives.pu[partEnum].satd;
     sad_x3 = primitives.pu[partEnum].sad_x3;
     sad_x4 = primitives.pu[partEnum].sad_x4;
 
+
     blockwidth = pwidth;
     blockOffset = offset;
     absPartIdx = ctuAddr = -1;
@@ -188,6 +192,7 @@
     partEnum = partitionFromSizes(pwidth, pheight);
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
     sad = primitives.pu[partEnum].sad;
+    ads = primitives.pu[partEnum].ads;
     satd = primitives.pu[partEnum].satd;
     sad_x3 = primitives.pu[partEnum].sad_x3;
     sad_x4 = primitives.pu[partEnum].sad_x4;
@@ -278,12 +283,31 @@
         costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
         costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
         costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
-        COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
-        COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
-        COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
-        COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
+        if ((omv.y + m0y >= mvmin.y) & (omv.y + m0y <= mvmax.y)) \
+            COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
+        if ((omv.y + m1y >= mvmin.y) & (omv.y + m1y <= mvmax.y)) \
+            COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
+        if ((omv.y + m2y >= mvmin.y) & (omv.y + m2y <= mvmax.y)) \
+            COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
+        if ((omv.y + m3y >= mvmin.y) & (omv.y + m3y <= mvmax.y)) \
+            COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
     }
 
+/* Cost three candidate positions given as absolute offsets from fref.
+ *
+ * A single sad_x3 call fills costs[0..2] for (m0x, m0y), (m1x, m1y) and (m2x, m2y).
+ * Only a horizontal MV cost is added to each, read from p_cost_mvx at the x offset
+ * shifted left by 2. COPY3_IF_LT is then invoked per candidate with bcost, the
+ * candidate cost and bmv.x / bmv.y paired with the candidate coordinates; presumably
+ * it updates all three when the candidate is cheaper (it is defined elsewhere --
+ * TODO confirm).
+ *
+ * Relies on fenc, fref, stride, costs, p_cost_mvx, bcost and bmv being in scope where
+ * the macro is expanded. Each argument is expanded more than once, so pass expressions
+ * without side effects. */
+#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
+{\
+    sad_x3(fenc, \
+    fref + (m0x) + (m0y) * stride, \
+    fref + (m1x) + (m1y) * stride, \
+    fref + (m2x) + (m2y) * stride, \
+    stride, costs); \
+    costs[0] += p_cost_mvx[(m0x) << 2]; /* no cost_mvy */\
+    costs[1] += p_cost_mvx[(m1x) << 2]; \
+    costs[2] += p_cost_mvx[(m2x) << 2]; \
+    COPY3_IF_LT(bcost, costs[0], bmv.x, m0x, bmv.y, m0y); \
+    COPY3_IF_LT(bcost, costs[1], bmv.x, m1x, bmv.y, m1y); \
+    COPY3_IF_LT(bcost, costs[2], bmv.x, m2x, bmv.y, m2y); \
+}
+
 #define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
     { \
         pixel *pix_base = fref + bmv.x + bmv.y * stride; \
@@ -627,6 +651,7 @@
         {
             bcost = cost;
             bmv = 0;
+            bmv.y = X265_MAX(X265_MIN(0, mvmax.y), mvmin.y);
         }
     }
 
@@ -659,8 +684,10 @@
         do
         {
             COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
-            COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
-            COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
+            if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
+                COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
+            if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
+                COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
             COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
             COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
             if (!(bcost & 15))
@@ -698,36 +725,57 @@
       /* equivalent to the above, but eliminates duplicate candidates */
         COST_MV_X3_DIR(-2, 0, -1, 2,  1, 2, costs);
         bcost <<= 3;
-        COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
-        COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
-        COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
+        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
+            COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
+        if ((bmv.y + 2 >= mvmin.y) & (bmv.y + 2 <= mvmax.y))
+        {
+            COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
+            COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
+        }
+
         COST_MV_X3_DIR(2, 0,  1, -2, -1, -2, costs);
-        COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
-        COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
-        COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
+        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
+            COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
+        if ((bmv.y - 2 >= mvmin.y) & (bmv.y - 2 <= mvmax.y))
+        {
+            COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
+            COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
+        }
 
         if (bcost & 7)
         {
             int dir = (bcost & 7) - 2;
-            bmv += hex2[dir + 1];
 
-            /* half hexagon, not overlapping the previous iteration */
-            for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
+            if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
             {
-                COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
-                               hex2[dir + 1].x, hex2[dir + 1].y,
-                               hex2[dir + 2].x, hex2[dir + 2].y,
-                               costs);
-                bcost &= ~7;
-                COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
-                COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
-                COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
-                if (!(bcost & 7))
-                    break;
-                dir += (bcost & 7) - 2;
-                dir = mod6m1[dir + 1];
                 bmv += hex2[dir + 1];
-            }
+
+                /* half hexagon, not overlapping the previous iteration */
+                for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
+                {
+                    COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
+                        hex2[dir + 1].x, hex2[dir + 1].y,
+                        hex2[dir + 2].x, hex2[dir + 2].y,
+                        costs);
+                    bcost &= ~7;
+
+                    if ((bmv.y + hex2[dir + 0].y >= mvmin.y) & (bmv.y + hex2[dir + 0].y <= mvmax.y))
+                        COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
+
+                    if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
+                        COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
+
+                    if ((bmv.y + hex2[dir + 2].y >= mvmin.y) & (bmv.y + hex2[dir + 2].y <= mvmax.y))
+                        COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
+
+                    if (!(bcost & 7))
+                        break;
+
+                    dir += (bcost & 7) - 2;
+                    dir = mod6m1[dir + 1];
+                    bmv += hex2[dir + 1];
+                }
+            } // if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
         }
         bcost >>= 3;
 #endif // if 0
@@ -735,15 +783,21 @@
         /* square refine */
         int dir = 0;
         COST_MV_X4_DIR(0, -1,  0, 1, -1, 0, 1, 0, costs);
-        COPY2_IF_LT(bcost, costs[0], dir, 1);
-        COPY2_IF_LT(bcost, costs[1], dir, 2);
+        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
+            COPY2_IF_LT(bcost, costs[0], dir, 1);
+        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
+            COPY2_IF_LT(bcost, costs[1], dir, 2);
         COPY2_IF_LT(bcost, costs[2], dir, 3);
         COPY2_IF_LT(bcost, costs[3], dir, 4);
         COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
-        COPY2_IF_LT(bcost, costs[0], dir, 5);
-        COPY2_IF_LT(bcost, costs[1], dir, 6);
-        COPY2_IF_LT(bcost, costs[2], dir, 7);
-        COPY2_IF_LT(bcost, costs[3], dir, 8);
+        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
+            COPY2_IF_LT(bcost, costs[0], dir, 5);
+        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
+            COPY2_IF_LT(bcost, costs[1], dir, 6);
+        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
+            COPY2_IF_LT(bcost, costs[2], dir, 7);
+        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
+            COPY2_IF_LT(bcost, costs[3], dir, 8);
         bmv += square1[dir];
         break;
     }
@@ -756,6 +810,7 @@
         /* refine predictors */
         omv = bmv;
         ucost1 = bcost;
+        X265_CHECK(((pmv.y >= mvmin.y) & (pmv.y <= mvmax.y)), "pmv outside of search range!");
​

x265_2.1.tar.gz/source/encoder/motion.h -> x265_2.2.tar.gz/source/encoder/motion.h Changed

 
@@ -52,6 +52,7 @@
     pixelcmp_t sad;
     pixelcmp_x3_t sad_x3;
     pixelcmp_x4_t sad_x4;
+    pixelcmp_ads_t ads;
     pixelcmp_t satd;
     pixelcmp_t chromaSatd;
 
@@ -61,6 +62,7 @@
 
     static const int COST_MAX = 1 << 28;
 
+    uint32_t* integral[INTEGRAL_PLANE_NUM];
     Yuv fencPUYuv;
     int partEnum;
     bool bChromaSATD;
​

x265_2.1.tar.gz/source/encoder/nal.h -> x265_2.2.tar.gz/source/encoder/nal.h Changed

 
@@ -34,6 +34,7 @@
 
 class NALList
 {
+public:
     static const int MAX_NAL_UNITS = 16;
 
 public:
​

x265_2.1.tar.gz/source/encoder/ratecontrol.cpp -> x265_2.2.tar.gz/source/encoder/ratecontrol.cpp Changed

@@ -341,6 +341,8 @@
             m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
         m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
         m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
+        m_bufferFillActual = m_bufferFillFinal;
+        m_bufferExcess = 0;
     }
 
     m_totalBits = 0;
@@ -431,7 +433,7 @@
                 }
                 *statsIn = '\0';
                 statsIn++;
-                if (sscanf(opts, "#options: %dx%d", &i, &j) != 2)
+                if ((p = strstr(opts, " input-res=")) == 0 || sscanf(p, " input-res=%dx%d", &i, &j) != 2)
                 {
                     x265_log(m_param, X265_LOG_ERROR, "Resolution specified in stats file not valid\n");
                     return false;
@@ -457,9 +459,15 @@
                 CMP_OPT_FIRST_PASS("bframes", m_param->bframes);
                 CMP_OPT_FIRST_PASS("b-pyramid", m_param->bBPyramid);
                 CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
-                CMP_OPT_FIRST_PASS("keyint", m_param->keyframeMax);
+                CMP_OPT_FIRST_PASS(" keyint", m_param->keyframeMax);
                 CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
                 CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
+                if (m_param->bMultiPassOptRPS)
+                {
+                    CMP_OPT_FIRST_PASS("multi-pass-opt-rps", m_param->bMultiPassOptRPS);
+                    CMP_OPT_FIRST_PASS("repeat-headers", m_param->bRepeatHeaders);
+                    CMP_OPT_FIRST_PASS("min-keyint", m_param->keyframeMin);
+                }
 
                 if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
                 {
@@ -542,10 +550,27 @@
                 }
                 rce = &m_rce2Pass[encodeOrder];
                 m_encOrder[frameNumber] = encodeOrder;
-                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
-                       &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
-                       &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
-                       &rce->skipCuCount);
+                if (!m_param->bMultiPassOptRPS)
+                {
+                    e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
+                        &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
+                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
+                        &rce->skipCuCount);
+                }
+                else
+                {
+                    char deltaPOC[128];
+                    char bUsed[40];
+                    memset(deltaPOC, 0, sizeof(deltaPOC));
+                    memset(bUsed, 0, sizeof(bUsed));
+                    e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf nump:%d numnegp:%d numposp:%d deltapoc:%s bused:%s",
+                        &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
+                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
+                        &rce->skipCuCount, &rce->rpsData.numberOfPictures, &rce->rpsData.numberOfNegativePictures, &rce->rpsData.numberOfPositivePictures, deltaPOC, bUsed);
+                    splitdeltaPOC(deltaPOC, rce);
+                    splitbUsed(bUsed, rce);
+                    rce->rpsIdx = -1;
+                }
                 rce->keptAsRef = true;
                 rce->isIdr = false;
                 if (picType == 'b' || picType == 'p')
@@ -598,7 +623,7 @@
                 x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.temp\n", fileName);
                 return false;
             }
-            p = x265_param2string(m_param);
+            p = x265_param2string(m_param, sps.conformanceWindow.rightOffset, sps.conformanceWindow.bottomOffset);
             if (p)
                 fprintf(m_statFileOut, "#options: %s\n", p);
             X265_FREE(p);
@@ -1649,15 +1674,18 @@
                 if (m_pred[m_predType].count == 1)
                     qScale = x265_clip3(lmin, lmax, qScale);
                 m_lastQScaleFor[m_sliceType] = qScale;
-                rce->frameSizePlanned = predictSize(&m_pred[m_predType], qScale, (double)m_currentSatd);
             }
-            else
-                rce->frameSizePlanned = qScale2bits(rce, qScale);
+        }
 
-            /* Limit planned size by MinCR */
+        if (m_2pass)
+            rce->frameSizePlanned = qScale2bits(rce, qScale);
+        else
+            rce->frameSizePlanned = predictSize(&m_pred[m_predType], qScale, (double)m_currentSatd);
+
+        /* Limit planned size by MinCR */
+        if (m_isVbv)
             rce->frameSizePlanned = X265_MIN(rce->frameSizePlanned, rce->frameSizeMaximum);
-            rce->frameSizeEstimated = rce->frameSizePlanned;
-        }
+        rce->frameSizeEstimated = rce->frameSizePlanned;
 
         rce->newQScale = qScale;
         if(rce->bLastMiniGopBFrame)
@@ -1875,7 +1903,7 @@
         if ((m_curSlice->m_poc == 0 || m_lastQScaleFor[P_SLICE] < q) && !(m_2pass && !m_isVbv))
             m_lastQScaleFor[P_SLICE] = q * fabs(m_param->rc.ipFactor);
 
-        if (m_2pass && m_isVbv)
+        if (m_2pass)
             rce->frameSizePlanned = qScale2bits(rce, q);
         else
             rce->frameSizePlanned = predictSize(&m_pred[m_predType], q, (double)m_currentSatd);
@@ -2161,7 +2189,7 @@
     for (uint32_t row = 0; row < maxRows; row++)
     {
         encodedBitsSoFar += curEncData.m_rowStat[row].encodedBits;
-        rowSatdCostSoFar = curEncData.m_rowStat[row].diagSatd;
+        rowSatdCostSoFar = curEncData.m_rowStat[row].rowSatd;
         uint32_t satdCostForPendingCus = curEncData.m_rowStat[row].satdForVbv - rowSatdCostSoFar;
         satdCostForPendingCus >>= X265_DEPTH - 8;
         if (satdCostForPendingCus  > 0)
@@ -2190,7 +2218,7 @@
                 }
 
                 refRowSatdCost >>= X265_DEPTH - 8;
-                refQScale = refEncData.m_rowStat[row].diagQpScale;
+                refQScale = refEncData.m_rowStat[row].rowQpScale;
             }
 
             if (picType == I_SLICE || qScale >= refQScale)
@@ -2212,7 +2240,7 @@
             }
             else if (picType == P_SLICE)
             {
-                intraCostForPendingCus = curEncData.m_rowStat[row].intraSatdForVbv - curEncData.m_rowStat[row].diagIntraSatd;
+                intraCostForPendingCus = curEncData.m_rowStat[row].intraSatdForVbv - curEncData.m_rowStat[row].rowIntraSatd;
                 intraCostForPendingCus >>= X265_DEPTH - 8;
                 /* Our QP is lower than the reference! */
                 double pred_intra = predictSize(rce->rowPred[1], qScale, intraCostForPendingCus);
@@ -2227,16 +2255,16 @@
     return totalSatdBits + encodedBitsSoFar;
 }
 
-int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv)
+int RateControl::rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv)
 {
     FrameData& curEncData = *curFrame->m_encData;
     double qScaleVbv = x265_qp2qScale(qpVbv);
-    uint64_t rowSatdCost = curEncData.m_rowStat[row].diagSatd;
+    uint64_t rowSatdCost = curEncData.m_rowStat[row].rowSatd;
     double encodedBits = curEncData.m_rowStat[row].encodedBits;
 
-    if (row == 1)
+    if (m_param->bEnableWavefront && row == 1)
     {
-        rowSatdCost += curEncData.m_rowStat[0].diagSatd;
+        rowSatdCost += curEncData.m_rowStat[0].rowSatd;
         encodedBits += curEncData.m_rowStat[0].encodedBits;
     }
     rowSatdCost >>= X265_DEPTH - 8;
@@ -2244,11 +2272,11 @@
     if (curEncData.m_slice->m_sliceType != I_SLICE)
     {
         Frame* refFrame = curEncData.m_slice->m_refFrameList[0][0];
-        if (qpVbv < refFrame->m_encData->m_rowStat[row].diagQp)
+        if (qpVbv < refFrame->m_encData->m_rowStat[row].rowQp)
         {
-            uint64_t intraRowSatdCost = curEncData.m_rowStat[row].diagIntraSatd;
-            if (row == 1)
-                intraRowSatdCost += curEncData.m_rowStat[0].diagIntraSatd;
+            uint64_t intraRowSatdCost = curEncData.m_rowStat[row].rowIntraSatd;
+            if (m_param->bEnableWavefront && row == 1)
+                intraRowSatdCost += curEncData.m_rowStat[0].rowIntraSatd;
             intraRowSatdCost >>= X265_DEPTH - 8;
             updatePredictor(rce->rowPred[1], qScaleVbv, (double)intraRowSatdCost, encodedBits);
         }
@@ -2309,7 +2337,7 @@
         }
 
         while (qpVbv > qpMin
-               && (qpVbv > curEncData.m_rowStat[0].diagQp || m_singleFrameVbv)
+               && (qpVbv > curEncData.m_rowStat[0].rowQp || m_singleFrameVbv)
                && (((accFrameBits < rce->frameSizePlanned * 0.8f && qpVbv <= prevRowQp)
                    || accFrameBits < (rce->bufferFill - m_bufferSize + m_bufferRate) * 1.1)
                    && (!m_param->rc.bStrictCbr ? 1 : abrOvershoot < 0)))
@@ -2329,7 +2357,7 @@
                 accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
                 abrOvershoot = (accFrameBits + m_totalBits - m_wantedBitsWindow) / totalBitsNeeded;
             }
-            if (qpVbv > curEncData.m_rowStat[0].diagQp &&
+            if (qpVbv > curEncData.m_rowStat[0].rowQp &&
                 abrOvershoot < -0.1 && timeDone > 0.5 && accFrameBits < rce->frameSizePlanned - rcTol)
             {
                 qpVbv -= stepSize;
@@ -2446,6 +2474,10 @@
     m_bufferFillFinal = X265_MAX(m_bufferFillFinal, 0);
     m_bufferFillFinal += m_bufferRate;
     m_bufferFillFinal = X265_MIN(m_bufferFillFinal, m_bufferSize);
+    double bufferBits = X265_MIN(bits + m_bufferExcess, m_bufferRate);
+    m_bufferExcess = X265_MAX(m_bufferExcess - bufferBits + bits, 0);
+    m_bufferFillActual += bufferBits - bits;
+    m_bufferFillActual = X265_MIN(m_bufferFillActual, m_bufferSize);

 
@@ -341,6 +341,8 @@
             m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
         m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
         m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
+        m_bufferFillActual = m_bufferFillFinal;
+        m_bufferExcess = 0;
     }
 
     m_totalBits = 0;
@@ -431,7 +433,7 @@
                 }
                 *statsIn = '\0';
                 statsIn++;
-                if (sscanf(opts, "#options: %dx%d", &i, &j) != 2)
+                if ((p = strstr(opts, " input-res=")) == 0 || sscanf(p, " input-res=%dx%d", &i, &j) != 2)
                 {
                     x265_log(m_param, X265_LOG_ERROR, "Resolution specified in stats file not valid\n");
                     return false;
@@ -457,9 +459,15 @@
                 CMP_OPT_FIRST_PASS("bframes", m_param->bframes);
                 CMP_OPT_FIRST_PASS("b-pyramid", m_param->bBPyramid);
                 CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
-                CMP_OPT_FIRST_PASS("keyint", m_param->keyframeMax);
+                CMP_OPT_FIRST_PASS(" keyint", m_param->keyframeMax);
                 CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
                 CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
+                if (m_param->bMultiPassOptRPS)
+                {
+                    CMP_OPT_FIRST_PASS("multi-pass-opt-rps", m_param->bMultiPassOptRPS);
+                    CMP_OPT_FIRST_PASS("repeat-headers", m_param->bRepeatHeaders);
+                    CMP_OPT_FIRST_PASS("min-keyint", m_param->keyframeMin);
+                }
 
                 if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
                 {
@@ -542,10 +550,27 @@
                 }
                 rce = &m_rce2Pass[encodeOrder];
                 m_encOrder[frameNumber] = encodeOrder;
-                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
-                       &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
-                       &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
-                       &rce->skipCuCount);
+                if (!m_param->bMultiPassOptRPS)
+                {
+                    e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
+                        &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
+                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
+                        &rce->skipCuCount);
+                }
+                else
+                {
+                    char deltaPOC[128];
+                    char bUsed[40];
+                    memset(deltaPOC, 0, sizeof(deltaPOC));
+                    memset(bUsed, 0, sizeof(bUsed));
+                    e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf nump:%d numnegp:%d numposp:%d deltapoc:%s bused:%s",
+                        &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
+                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
+                        &rce->skipCuCount, &rce->rpsData.numberOfPictures, &rce->rpsData.numberOfNegativePictures, &rce->rpsData.numberOfPositivePictures, deltaPOC, bUsed);
+                    splitdeltaPOC(deltaPOC, rce);
+                    splitbUsed(bUsed, rce);
+                    rce->rpsIdx = -1;
+                }
                 rce->keptAsRef = true;
                 rce->isIdr = false;
                 if (picType == 'b' || picType == 'p')
@@ -598,7 +623,7 @@
                 x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.temp\n", fileName);
                 return false;
             }
-            p = x265_param2string(m_param);
+            p = x265_param2string(m_param, sps.conformanceWindow.rightOffset, sps.conformanceWindow.bottomOffset);
             if (p)
                 fprintf(m_statFileOut, "#options: %s\n", p);
             X265_FREE(p);
@@ -1649,15 +1674,18 @@
                 if (m_pred[m_predType].count == 1)
                     qScale = x265_clip3(lmin, lmax, qScale);
                 m_lastQScaleFor[m_sliceType] = qScale;
-                rce->frameSizePlanned = predictSize(&m_pred[m_predType], qScale, (double)m_currentSatd);
             }
-            else
-                rce->frameSizePlanned = qScale2bits(rce, qScale);
+        }
 
-            /* Limit planned size by MinCR */
+        if (m_2pass)
+            rce->frameSizePlanned = qScale2bits(rce, qScale);
+        else
+            rce->frameSizePlanned = predictSize(&m_pred[m_predType], qScale, (double)m_currentSatd);
+
+        /* Limit planned size by MinCR */
+        if (m_isVbv)
             rce->frameSizePlanned = X265_MIN(rce->frameSizePlanned, rce->frameSizeMaximum);
-            rce->frameSizeEstimated = rce->frameSizePlanned;
-        }
+        rce->frameSizeEstimated = rce->frameSizePlanned;
 
         rce->newQScale = qScale;
         if(rce->bLastMiniGopBFrame)
@@ -1875,7 +1903,7 @@
         if ((m_curSlice->m_poc == 0 || m_lastQScaleFor[P_SLICE] < q) && !(m_2pass && !m_isVbv))
             m_lastQScaleFor[P_SLICE] = q * fabs(m_param->rc.ipFactor);
 
-        if (m_2pass && m_isVbv)
+        if (m_2pass)
             rce->frameSizePlanned = qScale2bits(rce, q);
         else
             rce->frameSizePlanned = predictSize(&m_pred[m_predType], q, (double)m_currentSatd);
@@ -2161,7 +2189,7 @@
     for (uint32_t row = 0; row < maxRows; row++)
     {
         encodedBitsSoFar += curEncData.m_rowStat[row].encodedBits;
-        rowSatdCostSoFar = curEncData.m_rowStat[row].diagSatd;
+        rowSatdCostSoFar = curEncData.m_rowStat[row].rowSatd;
         uint32_t satdCostForPendingCus = curEncData.m_rowStat[row].satdForVbv - rowSatdCostSoFar;
         satdCostForPendingCus >>= X265_DEPTH - 8;
         if (satdCostForPendingCus  > 0)
@@ -2190,7 +2218,7 @@
                 }
 
                 refRowSatdCost >>= X265_DEPTH - 8;
-                refQScale = refEncData.m_rowStat[row].diagQpScale;
+                refQScale = refEncData.m_rowStat[row].rowQpScale;
             }
 
             if (picType == I_SLICE || qScale >= refQScale)
@@ -2212,7 +2240,7 @@
             }
             else if (picType == P_SLICE)
             {
-                intraCostForPendingCus = curEncData.m_rowStat[row].intraSatdForVbv - curEncData.m_rowStat[row].diagIntraSatd;
+                intraCostForPendingCus = curEncData.m_rowStat[row].intraSatdForVbv - curEncData.m_rowStat[row].rowIntraSatd;
                 intraCostForPendingCus >>= X265_DEPTH - 8;
                 /* Our QP is lower than the reference! */
                 double pred_intra = predictSize(rce->rowPred[1], qScale, intraCostForPendingCus);
@@ -2227,16 +2255,16 @@
     return totalSatdBits + encodedBitsSoFar;
 }
 
-int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv)
+int RateControl::rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv)
 {
     FrameData& curEncData = *curFrame->m_encData;
     double qScaleVbv = x265_qp2qScale(qpVbv);
-    uint64_t rowSatdCost = curEncData.m_rowStat[row].diagSatd;
+    uint64_t rowSatdCost = curEncData.m_rowStat[row].rowSatd;
     double encodedBits = curEncData.m_rowStat[row].encodedBits;
 
-    if (row == 1)
+    if (m_param->bEnableWavefront && row == 1)
     {
-        rowSatdCost += curEncData.m_rowStat[0].diagSatd;
+        rowSatdCost += curEncData.m_rowStat[0].rowSatd;
         encodedBits += curEncData.m_rowStat[0].encodedBits;
     }
     rowSatdCost >>= X265_DEPTH - 8;
@@ -2244,11 +2272,11 @@
     if (curEncData.m_slice->m_sliceType != I_SLICE)
     {
         Frame* refFrame = curEncData.m_slice->m_refFrameList[0][0];
-        if (qpVbv < refFrame->m_encData->m_rowStat[row].diagQp)
+        if (qpVbv < refFrame->m_encData->m_rowStat[row].rowQp)
         {
-            uint64_t intraRowSatdCost = curEncData.m_rowStat[row].diagIntraSatd;
-            if (row == 1)
-                intraRowSatdCost += curEncData.m_rowStat[0].diagIntraSatd;
+            uint64_t intraRowSatdCost = curEncData.m_rowStat[row].rowIntraSatd;
+            if (m_param->bEnableWavefront && row == 1)
+                intraRowSatdCost += curEncData.m_rowStat[0].rowIntraSatd;
             intraRowSatdCost >>= X265_DEPTH - 8;
             updatePredictor(rce->rowPred[1], qScaleVbv, (double)intraRowSatdCost, encodedBits);
         }
@@ -2309,7 +2337,7 @@
         }
 
         while (qpVbv > qpMin
-               && (qpVbv > curEncData.m_rowStat[0].diagQp || m_singleFrameVbv)
+               && (qpVbv > curEncData.m_rowStat[0].rowQp || m_singleFrameVbv)
                && (((accFrameBits < rce->frameSizePlanned * 0.8f && qpVbv <= prevRowQp)
                    || accFrameBits < (rce->bufferFill - m_bufferSize + m_bufferRate) * 1.1)
                    && (!m_param->rc.bStrictCbr ? 1 : abrOvershoot < 0)))
@@ -2329,7 +2357,7 @@
                 accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
                 abrOvershoot = (accFrameBits + m_totalBits - m_wantedBitsWindow) / totalBitsNeeded;
             }
-            if (qpVbv > curEncData.m_rowStat[0].diagQp &&
+            if (qpVbv > curEncData.m_rowStat[0].rowQp &&
                 abrOvershoot < -0.1 && timeDone > 0.5 && accFrameBits < rce->frameSizePlanned - rcTol)
             {
                 qpVbv -= stepSize;
@@ -2446,6 +2474,10 @@
     m_bufferFillFinal = X265_MAX(m_bufferFillFinal, 0);
     m_bufferFillFinal += m_bufferRate;
     m_bufferFillFinal = X265_MIN(m_bufferFillFinal, m_bufferSize);
+    double bufferBits = X265_MIN(bits + m_bufferExcess, m_bufferRate);
+    m_bufferExcess = X265_MAX(m_bufferExcess - bufferBits + bits, 0);
+    m_bufferFillActual += bufferBits - bits;
+    m_bufferFillActual = X265_MIN(m_bufferFillActual, m_bufferSize);
​

x265_2.1.tar.gz/source/encoder/ratecontrol.h -> x265_2.2.tar.gz/source/encoder/ratecontrol.h Changed

@@ -111,6 +111,8 @@
     bool     isIdr;
     SEIPictureTiming *picTimingSEI;
     HRDTiming        *hrdTiming;
+    int      rpsIdx;
+    RPS      rpsData;
 };
 
 class RateControl
@@ -144,6 +146,8 @@
     double m_rateFactorMaxIncrement; /* Don't allow RF above (CRF + this value). */
     double m_rateFactorMaxDecrement; /* don't allow RF below (this value). */
     double m_avgPFrameQp;
+    double m_bufferFillActual;
+    double m_bufferExcess;
     bool   m_isFirstMiniGop;
     Predictor m_pred[4];       /* Slice predictors to preidct bits for each Slice type - I,P,Bref and B */
     int64_t m_leadingNoBSatd;
@@ -239,7 +243,7 @@
     int  rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc);
     void rateControlUpdateStats(RateControlEntry* rce);
     int  rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce);
-    int  rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
+    int  rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
     int  rateControlSliceType(int frameNum);
     bool cuTreeReadFor2Pass(Frame* curFrame);
     void hrdFullness(SEIBufferingPeriod* sei);
@@ -280,6 +284,8 @@
     bool   findUnderflow(double *fills, int *t0, int *t1, int over, int framesCount);
     bool   fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax);
     double tuneQScaleForGrain(double rcOverflow);
+    void   splitdeltaPOC(char deltapoc[], RateControlEntry *rce);
+    void   splitbUsed(char deltapoc[], RateControlEntry *rce);
 };
 }
 #endif // ifndef X265_RATECONTROL_H

 
@@ -111,6 +111,8 @@
     bool     isIdr;
     SEIPictureTiming *picTimingSEI;
     HRDTiming        *hrdTiming;
+    int      rpsIdx;
+    RPS      rpsData;
 };
 
 class RateControl
@@ -144,6 +146,8 @@
     double m_rateFactorMaxIncrement; /* Don't allow RF above (CRF + this value). */
     double m_rateFactorMaxDecrement; /* don't allow RF below (this value). */
     double m_avgPFrameQp;
+    double m_bufferFillActual;
+    double m_bufferExcess;
     bool   m_isFirstMiniGop;
     Predictor m_pred[4];       /* Slice predictors to preidct bits for each Slice type - I,P,Bref and B */
     int64_t m_leadingNoBSatd;
@@ -239,7 +243,7 @@
     int  rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc);
     void rateControlUpdateStats(RateControlEntry* rce);
     int  rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce);
-    int  rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
+    int  rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
     int  rateControlSliceType(int frameNum);
     bool cuTreeReadFor2Pass(Frame* curFrame);
     void hrdFullness(SEIBufferingPeriod* sei);
@@ -280,6 +284,8 @@
     bool   findUnderflow(double *fills, int *t0, int *t1, int over, int framesCount);
     bool   fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax);
     double tuneQScaleForGrain(double rcOverflow);
+    void   splitdeltaPOC(char deltapoc[], RateControlEntry *rce);
+    void   splitbUsed(char deltapoc[], RateControlEntry *rce);
 };
 }
 #endif // ifndef X265_RATECONTROL_H
​

x265_2.1.tar.gz/source/encoder/reference.cpp -> x265_2.2.tar.gz/source/encoder/reference.cpp Changed

 
@@ -128,11 +128,12 @@
     intptr_t stride = reconPic->m_stride;
     int width   = reconPic->m_picWidth;
     int height  = (finishedRows - numWeightedRows) * g_maxCUSize;
-    if ((finishedRows == maxNumRows) && (reconPic->m_picHeight % g_maxCUSize))
+    /* the last row may be partial height */
+    if (finishedRows == maxNumRows - 1)
     {
-        /* the last row may be partial height */
-        height -= g_maxCUSize;
-        height += reconPic->m_picHeight % g_maxCUSize;
+        const int leftRows = (reconPic->m_picHeight & (g_maxCUSize - 1));
+
+        height += leftRows ? leftRows : g_maxCUSize;
     }
     int cuHeight = g_maxCUSize;
 
@@ -172,7 +173,7 @@
         }
 
         // Extending Bottom
-        if (finishedRows == maxNumRows)
+        if (finishedRows == maxNumRows - 1)
         {
             int picHeight = reconPic->m_picHeight;
             if (c) picHeight >>= reconPic->m_vChromaShift;
​

x265_2.1.tar.gz/source/encoder/sao.cpp -> x265_2.2.tar.gz/source/encoder/sao.cpp Changed

 
@@ -1208,10 +1208,15 @@
     if (!saoParam->bSaoFlag[0])
         m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
     else
+    {
+        X265_CHECK(m_numNoSao[0] <= numctus, "m_numNoSao check failure!");
         m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[0] / ((double)numctus);
+    }
 
     if (!saoParam->bSaoFlag[1])
+    {
         m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
+    }
     else
         m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[1] / ((double)numctus);
 }
​

x265_2.1.tar.gz/source/encoder/search.cpp -> x265_2.2.tar.gz/source/encoder/search.cpp Changed

@@ -67,6 +67,7 @@
     m_param = NULL;
     m_slice = NULL;
     m_frame = NULL;
+    m_maxTUDepth = -1;
 }
 
 bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
@@ -93,6 +94,19 @@
     uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
     uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
 
+    m_limitTU = 0;
+    if (m_param->limitTU)
+    {
+        if (m_param->limitTU == 1)
+            m_limitTU = X265_TU_LIMIT_BFS;
+        else if (m_param->limitTU == 2)
+            m_limitTU = X265_TU_LIMIT_DFS;
+        else if (m_param->limitTU == 3)
+            m_limitTU = X265_TU_LIMIT_NEIGH;
+        else if (m_param->limitTU == 4)
+            m_limitTU = X265_TU_LIMIT_DFS + X265_TU_LIMIT_NEIGH;
+    }
+
     /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
      * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
      * which are reconstructed at each depth are valid. At the end, the transform depth table
@@ -2131,6 +2145,13 @@
                 int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                 MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
 
+                if (m_param->searchMethod == X265_SEA)
+                {
+                    int puX = puIdx & 1;
+                    int puY = puIdx >> 1;
+                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
+                        m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
+                }
                 setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                 int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv,
                   m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
@@ -2229,7 +2250,13 @@
                         if (lmv.notZero())
                             mvc[numMvc++] = lmv;
                     }
-
+                    if (m_param->searchMethod == X265_SEA)
+                    {
+                        int puX = puIdx & 1;
+                        int puY = puIdx >> 1;
+                        for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
+                            m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
+                    }
                     setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                     int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, 
                       m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
@@ -2544,6 +2571,9 @@
     /* conditional clipping for frame parallelism */
     mvmin.y = X265_MIN(mvmin.y, (int16_t)m_refLagPixels);
     mvmax.y = X265_MIN(mvmax.y, (int16_t)m_refLagPixels);
+
+    /* conditional clipping for negative mv range */
+    mvmax.y = X265_MAX(mvmax.y, mvmin.y);
 }
 
 /* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
@@ -2617,8 +2647,29 @@
 
     m_entropyCoder.load(m_rqt[depth].cur);
 
+    if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH))
+        m_maxTUDepth = -1;
+    else if (m_limitTU & X265_TU_LIMIT_BFS)
+        memset(&m_cacheTU, 0, sizeof(TUInfoCache));
+
     Cost costs;
-    estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
+    if (m_limitTU & X265_TU_LIMIT_NEIGH)
+    {
+        /* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */
+        int32_t tempDepth = m_maxTUDepth;
+        if (m_maxTUDepth != -1)
+        {
+            uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N;
+            uint32_t minSize = tuDepthRange[0];
+            uint32_t maxSize = tuDepthRange[1];
+            maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag);
+            m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth);
+        }
+        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
+        m_maxTUDepth = tempDepth;
+    }
+    else
+        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
 
     uint32_t tqBypass = cu.m_tqBypass[0];
     if (!tqBypass)
@@ -2867,7 +2918,57 @@
         return m_rdCost.calcRdCost(dist, nullBits);
 }
 
-void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
+bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore)
+{
+    CUData& cu = mode.cu;
+    uint32_t depth = cuGeom.depth + tuDepth;
+    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
+
+    uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+    uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
+    for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
+    {
+        if ((m_limitTU & X265_TU_LIMIT_DFS) && tuDepth == 0 && qIdx == 1)
+        {
+            m_maxTUDepth = cu.m_tuDepth[0];
+            // Fetch maximum TU depth of first sub partition to limit recursion of others
+            for (uint32_t i = 1; i < cuGeom.numPartitions / 4; i++)
+                m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[i]);
+        }
+        estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange, splitMore);
+        ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
+        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
+        {
+            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
+        }
+    }
+    cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
+    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
+    {
+        cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
+        cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
+    }
+
+    // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits
+    // for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma.
+    // But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context
+    // at depth 0 (for example).
+    m_entropyCoder.load(m_rqt[depth].rqtRoot);
+    m_entropyCoder.resetBits();
+    codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
+    uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
+    splitCost.bits += splitCbfBits;
+
+    if (m_rdCost.m_psyRd)
+        splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
+    else
+        splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
+        
+    return ycbf || ucbf || vcbf;
+}
+
+void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2], int32_t splitMore)
 {
     CUData& cu = mode.cu;
     uint32_t depth = cuGeom.depth + tuDepth;
@@ -2876,6 +2977,37 @@
 
     bool bCheckSplit = log2TrSize > depthRange[0];
     bool bCheckFull = log2TrSize <= depthRange[1];
+    bool bSaveTUData = false, bLoadTUData = false;
+    uint32_t idx = 0;
+
+    if ((m_limitTU & X265_TU_LIMIT_BFS) && splitMore >= 0)
+    {
+        if (bCheckSplit && bCheckFull && tuDepth)
+        {
+            uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
+            uint32_t qIdx = (absPartIdx / qNumParts) % 4;
+            idx = (depth - 1) * 4 + qIdx;
+            if (splitMore)
+            {
+                bLoadTUData = true;
+                bCheckFull = false;
+            }
+            else
+            {
+                bSaveTUData = true;
+                bCheckSplit = false;
+            }
+        }
+    }
+    else if (m_limitTU & X265_TU_LIMIT_DFS || m_limitTU & X265_TU_LIMIT_NEIGH)
+    {
+        if (bCheckSplit && m_maxTUDepth >= 0)
+        {
+            uint32_t log2MaxTrSize = cuGeom.log2CUSize - m_maxTUDepth;
+            bCheckSplit = log2TrSize > log2MaxTrSize;
+        }
+    }
+
     bool bSplitPresentFlag = bCheckSplit && bCheckFull;
 
     if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
@@ -3194,6 +3326,8 @@
                 singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
                 cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
                 bestTransformMode[TEXT_LUMA][0] = 1;
+                if (m_param->limitTU)

 
@@ -67,6 +67,7 @@
     m_param = NULL;
     m_slice = NULL;
     m_frame = NULL;
+    m_maxTUDepth = -1;
 }
 
 bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
@@ -93,6 +94,19 @@
     uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
     uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
 
+    m_limitTU = 0;
+    if (m_param->limitTU)
+    {
+        if (m_param->limitTU == 1)
+            m_limitTU = X265_TU_LIMIT_BFS;
+        else if (m_param->limitTU == 2)
+            m_limitTU = X265_TU_LIMIT_DFS;
+        else if (m_param->limitTU == 3)
+            m_limitTU = X265_TU_LIMIT_NEIGH;
+        else if (m_param->limitTU == 4)
+            m_limitTU = X265_TU_LIMIT_DFS + X265_TU_LIMIT_NEIGH;
+    }
+
     /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
      * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
      * which are reconstructed at each depth are valid. At the end, the transform depth table
@@ -2131,6 +2145,13 @@
                 int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                 MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
 
+                if (m_param->searchMethod == X265_SEA)
+                {
+                    int puX = puIdx & 1;
+                    int puY = puIdx >> 1;
+                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
+                        m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
+                }
                 setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                 int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv,
                   m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
@@ -2229,7 +2250,13 @@
                         if (lmv.notZero())
                             mvc[numMvc++] = lmv;
                     }
-
+                    if (m_param->searchMethod == X265_SEA)
+                    {
+                        int puX = puIdx & 1;
+                        int puY = puIdx >> 1;
+                        for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
+                            m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
+                    }
                     setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                     int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, 
                       m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
@@ -2544,6 +2571,9 @@
     /* conditional clipping for frame parallelism */
     mvmin.y = X265_MIN(mvmin.y, (int16_t)m_refLagPixels);
     mvmax.y = X265_MIN(mvmax.y, (int16_t)m_refLagPixels);
+
+    /* conditional clipping for negative mv range */
+    mvmax.y = X265_MAX(mvmax.y, mvmin.y);
 }
 
 /* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
@@ -2617,8 +2647,29 @@
 
     m_entropyCoder.load(m_rqt[depth].cur);
 
+    if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH))
+        m_maxTUDepth = -1;
+    else if (m_limitTU & X265_TU_LIMIT_BFS)
+        memset(&m_cacheTU, 0, sizeof(TUInfoCache));
+
     Cost costs;
-    estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
+    if (m_limitTU & X265_TU_LIMIT_NEIGH)
+    {
+        /* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */
+        int32_t tempDepth = m_maxTUDepth;
+        if (m_maxTUDepth != -1)
+        {
+            uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N;
+            uint32_t minSize = tuDepthRange[0];
+            uint32_t maxSize = tuDepthRange[1];
+            maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag);
+            m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth);
+        }
+        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
+        m_maxTUDepth = tempDepth;
+    }
+    else
+        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
 
     uint32_t tqBypass = cu.m_tqBypass[0];
     if (!tqBypass)
@@ -2867,7 +2918,57 @@
         return m_rdCost.calcRdCost(dist, nullBits);
 }
 
-void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
+bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore)
+{
+    CUData& cu = mode.cu;
+    uint32_t depth = cuGeom.depth + tuDepth;
+    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
+
+    uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+    uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
+    for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
+    {
+        if ((m_limitTU & X265_TU_LIMIT_DFS) && tuDepth == 0 && qIdx == 1)
+        {
+            m_maxTUDepth = cu.m_tuDepth[0];
+            // Fetch maximum TU depth of first sub partition to limit recursion of others
+            for (uint32_t i = 1; i < cuGeom.numPartitions / 4; i++)
+                m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[i]);
+        }
+        estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange, splitMore);
+        ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
+        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
+        {
+            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
+        }
+    }
+    cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
+    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
+    {
+        cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
+        cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
+    }
+
+    // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits
+    // for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma.
+    // But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context
+    // at depth 0 (for example).
+    m_entropyCoder.load(m_rqt[depth].rqtRoot);
+    m_entropyCoder.resetBits();
+    codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
+    uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
+    splitCost.bits += splitCbfBits;
+
+    if (m_rdCost.m_psyRd)
+        splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
+    else
+        splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
+        
+    return ycbf || ucbf || vcbf;
+}
+
+void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2], int32_t splitMore)
 {
     CUData& cu = mode.cu;
     uint32_t depth = cuGeom.depth + tuDepth;
@@ -2876,6 +2977,37 @@
 
     bool bCheckSplit = log2TrSize > depthRange[0];
     bool bCheckFull = log2TrSize <= depthRange[1];
+    bool bSaveTUData = false, bLoadTUData = false;
+    uint32_t idx = 0;
+
+    if ((m_limitTU & X265_TU_LIMIT_BFS) && splitMore >= 0)
+    {
+        if (bCheckSplit && bCheckFull && tuDepth)
+        {
+            uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
+            uint32_t qIdx = (absPartIdx / qNumParts) % 4;
+            idx = (depth - 1) * 4 + qIdx;
+            if (splitMore)
+            {
+                bLoadTUData = true;
+                bCheckFull = false;
+            }
+            else
+            {
+                bSaveTUData = true;
+                bCheckSplit = false;
+            }
+        }
+    }
+    else if (m_limitTU & X265_TU_LIMIT_DFS || m_limitTU & X265_TU_LIMIT_NEIGH)
+    {
+        if (bCheckSplit && m_maxTUDepth >= 0)
+        {
+            uint32_t log2MaxTrSize = cuGeom.log2CUSize - m_maxTUDepth;
+            bCheckSplit = log2TrSize > log2MaxTrSize;
+        }
+    }
+
     bool bSplitPresentFlag = bCheckSplit && bCheckFull;
 
     if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
@@ -3194,6 +3326,8 @@
                 singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
                 cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
                 bestTransformMode[TEXT_LUMA][0] = 1;
+                if (m_param->limitTU)
​

x265_2.1.tar.gz/source/encoder/search.h -> x265_2.2.tar.gz/source/encoder/search.h Changed

@@ -49,6 +49,8 @@
 #define ProfileCounter(cu, count)
 #endif
 
+#define NUM_SUBPART MAX_TS_SIZE * 4 // 4 sub partitions * 4 depth
+
 namespace X265_NS {
 // private namespace
 
@@ -275,6 +277,9 @@
     uint32_t        m_numLayers;
     uint32_t        m_refLagPixels;
 
+    int32_t         m_maxTUDepth;
+    uint16_t        m_limitTU;
+
     int16_t         m_sliceMaxY;
     int16_t         m_sliceMinY;
 
@@ -377,8 +382,17 @@
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
     };
 
+    struct TUInfoCache
+    {
+        Cost cost[NUM_SUBPART];
+        uint32_t bestTransformMode[NUM_SUBPART][MAX_NUM_COMPONENT][2];
+        uint8_t cbfFlag[NUM_SUBPART][MAX_NUM_COMPONENT][2];
+        Entropy rqtStore[NUM_SUBPART];
+    } m_cacheTU;
+
     uint64_t estimateNullCbfCost(sse_t dist, uint32_t psyEnergy, uint32_t tuDepth, TextType compId);
-    void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
+    bool     splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore);
+    void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2], int32_t splitMore = -1);
 
     // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
     void     codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]);

 
@@ -49,6 +49,8 @@
 #define ProfileCounter(cu, count)
 #endif
 
+#define NUM_SUBPART MAX_TS_SIZE * 4 // 4 sub partitions * 4 depth
+
 namespace X265_NS {
 // private namespace
 
@@ -275,6 +277,9 @@
     uint32_t        m_numLayers;
     uint32_t        m_refLagPixels;
 
+    int32_t         m_maxTUDepth;
+    uint16_t        m_limitTU;
+
     int16_t         m_sliceMaxY;
     int16_t         m_sliceMinY;
 
@@ -377,8 +382,17 @@
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
     };
 
+    struct TUInfoCache
+    {
+        Cost cost[NUM_SUBPART];
+        uint32_t bestTransformMode[NUM_SUBPART][MAX_NUM_COMPONENT][2];
+        uint8_t cbfFlag[NUM_SUBPART][MAX_NUM_COMPONENT][2];
+        Entropy rqtStore[NUM_SUBPART];
+    } m_cacheTU;
+
     uint64_t estimateNullCbfCost(sse_t dist, uint32_t psyEnergy, uint32_t tuDepth, TextType compId);
-    void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
+    bool     splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore);
+    void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2], int32_t splitMore = -1);
 
     // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
     void     codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]);
​

x265_2.1.tar.gz/source/encoder/slicetype.cpp -> x265_2.2.tar.gz/source/encoder/slicetype.cpp Changed

 
@@ -1617,7 +1617,7 @@
 
     /* magic numbers pulled out of thin air */
     float threshMin = (float)(threshMax * 0.25);
-    double bias = 0.05;
+    double bias = m_param->scenecutBias;
     if (bRealScenecut)
     {
         if (m_param->keyframeMin == m_param->keyframeMax)
​

x265_2.1.tar.gz/source/input/y4m.cpp -> x265_2.2.tar.gz/source/input/y4m.cpp Changed

@@ -280,7 +280,7 @@
                 {
                     c = ifs->get();
 
-                    if (c <= '9' && c >= '0')
+                    if (c <= 'o' && c >= '0')
                         csp = csp * 10 + (c - '0');
                     else if (c == 'p')
                     {
@@ -300,9 +300,23 @@
                         break;
                 }
 
-                if (d >= 8 && d <= 16)
-                    depth = d;
-                colorSpace = (csp == 444) ? X265_CSP_I444 : (csp == 422) ? X265_CSP_I422 : X265_CSP_I420;
+                switch (csp)
+                {
+                case ('m'-'0')*100000 + ('o'-'0')*10000 + ('n'-'0')*1000 + ('o'-'0')*100 + 16:
+                    colorSpace = X265_CSP_I400;
+                    depth = 16;
+                    break;
+
+                case ('m'-'0')*1000 + ('o'-'0')*100 + ('n'-'0')*10 + ('o'-'0'):
+                    colorSpace = X265_CSP_I400;
+                    depth = 8;
+                    break;
+                   
+                default:
+                    if (d >= 8 && d <= 16)
+                        depth = d;
+                    colorSpace = (csp == 444) ? X265_CSP_I444 : (csp == 422) ? X265_CSP_I422 : X265_CSP_I420;
+                }
                 break;
 
             default:
@@ -324,7 +338,7 @@
     if (width < MIN_FRAME_WIDTH || width > MAX_FRAME_WIDTH ||
         height < MIN_FRAME_HEIGHT || height > MAX_FRAME_HEIGHT ||
         (rateNum / rateDenom) < 1 || (rateNum / rateDenom) > MAX_FRAME_RATE ||
-        colorSpace <= X265_CSP_I400 || colorSpace >= X265_CSP_COUNT)
+        colorSpace < X265_CSP_I400 || colorSpace >= X265_CSP_COUNT)
         return false;
 
     return true;

 
@@ -280,7 +280,7 @@
                 {
                     c = ifs->get();
 
-                    if (c <= '9' && c >= '0')
+                    if (c <= 'o' && c >= '0')
                         csp = csp * 10 + (c - '0');
                     else if (c == 'p')
                     {
@@ -300,9 +300,23 @@
                         break;
                 }
 
-                if (d >= 8 && d <= 16)
-                    depth = d;
-                colorSpace = (csp == 444) ? X265_CSP_I444 : (csp == 422) ? X265_CSP_I422 : X265_CSP_I420;
+                switch (csp)
+                {
+                case ('m'-'0')*100000 + ('o'-'0')*10000 + ('n'-'0')*1000 + ('o'-'0')*100 + 16:
+                    colorSpace = X265_CSP_I400;
+                    depth = 16;
+                    break;
+
+                case ('m'-'0')*1000 + ('o'-'0')*100 + ('n'-'0')*10 + ('o'-'0'):
+                    colorSpace = X265_CSP_I400;
+                    depth = 8;
+                    break;
+                   
+                default:
+                    if (d >= 8 && d <= 16)
+                        depth = d;
+                    colorSpace = (csp == 444) ? X265_CSP_I444 : (csp == 422) ? X265_CSP_I422 : X265_CSP_I420;
+                }
                 break;
 
             default:
@@ -324,7 +338,7 @@
     if (width < MIN_FRAME_WIDTH || width > MAX_FRAME_WIDTH ||
         height < MIN_FRAME_HEIGHT || height > MAX_FRAME_HEIGHT ||
         (rateNum / rateDenom) < 1 || (rateNum / rateDenom) > MAX_FRAME_RATE ||
-        colorSpace <= X265_CSP_I400 || colorSpace >= X265_CSP_COUNT)
+        colorSpace < X265_CSP_I400 || colorSpace >= X265_CSP_COUNT)
         return false;
 
     return true;
​

x265_2.1.tar.gz/source/test/rate-control-tests.txt -> x265_2.2.tar.gz/source/test/rate-control-tests.txt Changed

@@ -21,6 +21,9 @@
 big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --aud --hrd --tune fast-decode
 sita_1920x1080_30.yuv,--preset superfast --crf 25 --vbv-bufsize 3000 --vbv-maxrate 4000 --vbv-bufsize 5000 --hrd  --crf-max 30
 sita_1920x1080_30.yuv,--preset superfast --bitrate 3000 --vbv-bufsize 3000 --vbv-maxrate 3000 --aud --strict-cbr
+BasketballDrive_1920x1080_50.y4m,--preset ultrafast --bitrate 3000 --vbv-bufsize 3000 --vbv-maxrate 3000 --no-wpp
+big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --no-wpp --aud --hrd --tune fast-decode
+sita_1920x1080_30.yuv,--preset superfast --bitrate 3000 --vbv-bufsize 3000 --vbv-maxrate 3000 --aud --strict-cbr --no-wpp
 
 
 
@@ -38,4 +41,5 @@
 RaceHorses_416x240_30_10bit.yuv,--preset medium --crf 40 --pass 1, --preset faster --bitrate 200 --pass 2 -F4
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --bitrate 2500 --pass 1 -F4 --slow-firstpass,--preset superfast --bitrate 2500 --pass 2 -F4
 RaceHorses_416x240_30_10bit.yuv,--preset medium --crf 26 --vbv-maxrate 1000 --vbv-bufsize 1000 --pass 1,--preset fast --bitrate 1000  --vbv-maxrate 1000 --vbv-bufsize 700 --pass 3 -F4,--preset slow --bitrate 500 --vbv-maxrate 500  --vbv-bufsize 700 --pass 2 -F4
-
+sita_1920x1080_30.yuv, --preset ultrafast --crf 20 --no-cutree --keyint 50 --min-keyint 50 --no-open-gop --pass 1 --vbv-bufsize 7000 --vbv-maxrate 5000, --preset ultrafast --crf 20 --no-cutree --keyint 50 --min-keyint 50 --no-open-gop --pass 2 --vbv-bufsize 7000 --vbv-maxrate 5000 --repeat-headers
+sita_1920x1080_30.yuv, --preset medium --crf 20 --no-cutree --keyint 50 --min-keyint 50 --no-open-gop --pass 1 --vbv-bufsize 7000 --vbv-maxrate 5000 --repeat-headers --multi-pass-opt-rps, --preset medium --crf 20 --no-cutree --keyint 50 --min-keyint 50 --no-open-gop --pass 2 --vbv-bufsize 7000 --vbv-maxrate 5000 --repeat-headers --multi-pass-opt-rps

 
@@ -21,6 +21,9 @@
 big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --aud --hrd --tune fast-decode
 sita_1920x1080_30.yuv,--preset superfast --crf 25 --vbv-bufsize 3000 --vbv-maxrate 4000 --vbv-bufsize 5000 --hrd  --crf-max 30
 sita_1920x1080_30.yuv,--preset superfast --bitrate 3000 --vbv-bufsize 3000 --vbv-maxrate 3000 --aud --strict-cbr
+BasketballDrive_1920x1080_50.y4m,--preset ultrafast --bitrate 3000 --vbv-bufsize 3000 --vbv-maxrate 3000 --no-wpp
+big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --no-wpp --aud --hrd --tune fast-decode
+sita_1920x1080_30.yuv,--preset superfast --bitrate 3000 --vbv-bufsize 3000 --vbv-maxrate 3000 --aud --strict-cbr --no-wpp
 
 
 
@@ -38,4 +41,5 @@
 RaceHorses_416x240_30_10bit.yuv,--preset medium --crf 40 --pass 1, --preset faster --bitrate 200 --pass 2 -F4
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --bitrate 2500 --pass 1 -F4 --slow-firstpass,--preset superfast --bitrate 2500 --pass 2 -F4
 RaceHorses_416x240_30_10bit.yuv,--preset medium --crf 26 --vbv-maxrate 1000 --vbv-bufsize 1000 --pass 1,--preset fast --bitrate 1000  --vbv-maxrate 1000 --vbv-bufsize 700 --pass 3 -F4,--preset slow --bitrate 500 --vbv-maxrate 500  --vbv-bufsize 700 --pass 2 -F4
-
+sita_1920x1080_30.yuv, --preset ultrafast --crf 20 --no-cutree --keyint 50 --min-keyint 50 --no-open-gop --pass 1 --vbv-bufsize 7000 --vbv-maxrate 5000, --preset ultrafast --crf 20 --no-cutree --keyint 50 --min-keyint 50 --no-open-gop --pass 2 --vbv-bufsize 7000 --vbv-maxrate 5000 --repeat-headers
+sita_1920x1080_30.yuv, --preset medium --crf 20 --no-cutree --keyint 50 --min-keyint 50 --no-open-gop --pass 1 --vbv-bufsize 7000 --vbv-maxrate 5000 --repeat-headers --multi-pass-opt-rps, --preset medium --crf 20 --no-cutree --keyint 50 --min-keyint 50 --no-open-gop --pass 2 --vbv-bufsize 7000 --vbv-maxrate 5000 --repeat-headers --multi-pass-opt-rps
​

x265_2.1.tar.gz/source/test/regression-tests.txt -> x265_2.2.tar.gz/source/test/regression-tests.txt Changed

@@ -14,20 +14,21 @@
 BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
 BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp --limit-modes
 BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
-BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190
-BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless
+BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3
+BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
 BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 7000 --limit-modes,--preset medium --no-cutree --analysis-mode=load --bitrate 7000 --limit-modes
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
-BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0
+BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
 BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-mode=save --bitrate 7000,--preset slower --no-cutree --analysis-mode=load --bitrate 7000
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 7000 --tskip-fast,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 7000 --tskip-fast --limit-tu 4,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast --limit-tu 4
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
+Coastguard-4k.y4m,--preset superfast --tune grain --pme --aq-strength 2 --merange 190
 Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-mode=save --bitrate 15000,--preset veryfast --no-cutree --analysis-mode=load --bitrate 15000
-Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh
+Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh --slices 2
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
@@ -41,13 +42,14 @@
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut --limit-tu 1
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
-DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3 --tu-inter-depth 4 --limit-tu 3
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-mode=save --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1,--preset fast --no-cutree --analysis-mode=load --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
 FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8
 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
@@ -61,24 +63,27 @@
 KristenAndSara_1280x720_60.y4m,--preset ultrafast --strong-intra-smoothing
 KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
 KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
-KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes
+KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes --limit-tu 1
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000
 News-4k.y4m,--preset ultrafast --no-cutree --analysis-mode=save --bitrate 15000,--preset ultrafast --no-cutree --analysis-mode=load --bitrate 15000
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
+News-4k.y4m,--preset superfast --slices 4 --aq-mode 0 
 News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
 News-4k.y4m,--preset veryslow --no-rskip
+News-4k.y4m,--preset veryslow --pme --crf 40
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
 ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
 ParkScene_1920x1080_24.y4m,--preset medium --qp 40 --rdpenalty 2 --tu-intra-depth 3
+ParkScene_1920x1080_24.y4m,--preset medium --pme --tskip-fast --tskip --min-keyint 48 --weightb --limit-refs 3
 ParkScene_1920x1080_24.y4m,--preset slower --no-weightp
 RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
 RaceHorses_416x240_30.y4m,--preset medium --tskip-fast --tskip
-RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0
-RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3
+RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0 --limit-tu 2
+RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3 --limit-tu 3
 RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr --limit-refs 1
 RaceHorses_416x240_30_10bit.yuv,--preset veryfast --weightb
 RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
@@ -108,7 +113,7 @@
 ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
 mobile_calendar_422_ntsc.y4m,--preset superfast --weightp
 mobile_calendar_422_ntsc.y4m,--preset medium --bitrate 500 -F4
-mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast
+mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast --limit-tu 4
 mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip --limit-refs 2
 old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
 old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16 --limit-modes
@@ -118,6 +123,7 @@
 old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6
 old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid
 old_town_cross_444_720p50.y4m,--preset slower --crf 4 --cu-lossless
+old_town_cross_444_720p50.y4m,--preset veryslow --max-tu-size 4 --min-cu-size 32 --limit-tu 4
 parkrun_ter_720p50.y4m,--preset medium --no-open-gop --sao-non-deblock --crf 4 --cu-lossless
 parkrun_ter_720p50.y4m,--preset slower --fast-intra --no-rect --tune grain
 silent_cif_420.y4m,--preset superfast --weightp --rect
@@ -133,6 +139,11 @@
 vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
 washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
 washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3 --limit-modes
+washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3 --limit-modes --slices 2
+Kimono1_1920x1080_24_400.yuv,--preset ultrafast --slices 1 --weightp --tu-intra-depth 4
+Kimono1_1920x1080_24_400.yuv,--preset medium --rdoq-level 0 --limit-refs 3 --slices 2
+Kimono1_1920x1080_24_400.yuv,--preset veryslow --crf 4 --cu-lossless --slices 2 --limit-refs 3 --limit-modes
+Kimono1_1920x1080_24_400.yuv,--preset placebo --ctu 32 --max-tu-size 8 --limit-tu 2
 
 # Main12 intraCost overflow bug test
 720p50_parkrun_ter.y4m,--preset medium
@@ -141,4 +152,7 @@
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --interlace tff
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
 
+#SEA Implementation Test
+silent_cif_420.y4m,--preset veryslow --me 4
+big_buck_bunny_360p24.y4m,--preset superfast --me 4
 # vim: tw=200

 
@@ -14,20 +14,21 @@
 BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
 BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp --limit-modes
 BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
-BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190
-BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless
+BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3
+BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
 BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 7000 --limit-modes,--preset medium --no-cutree --analysis-mode=load --bitrate 7000 --limit-modes
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
-BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0
+BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
 BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-mode=save --bitrate 7000,--preset slower --no-cutree --analysis-mode=load --bitrate 7000
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 7000 --tskip-fast,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 7000 --tskip-fast --limit-tu 4,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast --limit-tu 4
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
+Coastguard-4k.y4m,--preset superfast --tune grain --pme --aq-strength 2 --merange 190
 Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-mode=save --bitrate 15000,--preset veryfast --no-cutree --analysis-mode=load --bitrate 15000
-Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh
+Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh --slices 2
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
@@ -41,13 +42,14 @@
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut --limit-tu 1
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
-DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3 --tu-inter-depth 4 --limit-tu 3
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-mode=save --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1,--preset fast --no-cutree --analysis-mode=load --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
 FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8
 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
@@ -61,24 +63,27 @@
 KristenAndSara_1280x720_60.y4m,--preset ultrafast --strong-intra-smoothing
 KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
 KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
-KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes
+KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes --limit-tu 1
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000
 News-4k.y4m,--preset ultrafast --no-cutree --analysis-mode=save --bitrate 15000,--preset ultrafast --no-cutree --analysis-mode=load --bitrate 15000
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
+News-4k.y4m,--preset superfast --slices 4 --aq-mode 0 
 News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
 News-4k.y4m,--preset veryslow --no-rskip
+News-4k.y4m,--preset veryslow --pme --crf 40
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
 ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
 ParkScene_1920x1080_24.y4m,--preset medium --qp 40 --rdpenalty 2 --tu-intra-depth 3
+ParkScene_1920x1080_24.y4m,--preset medium --pme --tskip-fast --tskip --min-keyint 48 --weightb --limit-refs 3
 ParkScene_1920x1080_24.y4m,--preset slower --no-weightp
 RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
 RaceHorses_416x240_30.y4m,--preset medium --tskip-fast --tskip
-RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0
-RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3
+RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0 --limit-tu 2
+RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3 --limit-tu 3
 RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr --limit-refs 1
 RaceHorses_416x240_30_10bit.yuv,--preset veryfast --weightb
 RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
@@ -108,7 +113,7 @@
 ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
 mobile_calendar_422_ntsc.y4m,--preset superfast --weightp
 mobile_calendar_422_ntsc.y4m,--preset medium --bitrate 500 -F4
-mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast
+mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast --limit-tu 4
 mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip --limit-refs 2
 old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
 old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16 --limit-modes
@@ -118,6 +123,7 @@
 old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6
 old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid
 old_town_cross_444_720p50.y4m,--preset slower --crf 4 --cu-lossless
+old_town_cross_444_720p50.y4m,--preset veryslow --max-tu-size 4 --min-cu-size 32 --limit-tu 4
 parkrun_ter_720p50.y4m,--preset medium --no-open-gop --sao-non-deblock --crf 4 --cu-lossless
 parkrun_ter_720p50.y4m,--preset slower --fast-intra --no-rect --tune grain
 silent_cif_420.y4m,--preset superfast --weightp --rect
@@ -133,6 +139,11 @@
 vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
 washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
 washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3 --limit-modes
+washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3 --limit-modes --slices 2
+Kimono1_1920x1080_24_400.yuv,--preset ultrafast --slices 1 --weightp --tu-intra-depth 4
+Kimono1_1920x1080_24_400.yuv,--preset medium --rdoq-level 0 --limit-refs 3 --slices 2
+Kimono1_1920x1080_24_400.yuv,--preset veryslow --crf 4 --cu-lossless --slices 2 --limit-refs 3 --limit-modes
+Kimono1_1920x1080_24_400.yuv,--preset placebo --ctu 32 --max-tu-size 8 --limit-tu 2
 
 # Main12 intraCost overflow bug test
 720p50_parkrun_ter.y4m,--preset medium
@@ -141,4 +152,7 @@
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --interlace tff
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
 
+#SEA Implementation Test
+silent_cif_420.y4m,--preset veryslow --me 4
+big_buck_bunny_360p24.y4m,--preset superfast --me 4
 # vim: tw=200
​

x265_2.1.tar.gz/source/test/smoke-tests.txt -> x265_2.2.tar.gz/source/test/smoke-tests.txt Changed

@@ -3,10 +3,9 @@
 # consider VBV tests a failure if new bitrate is more than 5% different
 # from the old bitrate
 # vbv-tolerance = 0.05
-
 big_buck_bunny_360p24.y4m,--preset=superfast --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 400 --hrd --aud --repeat-headers
 big_buck_bunny_360p24.y4m,--preset=medium --bitrate 1000 -F4 --cu-lossless --scaling-list default
-big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --pme --qg-size 16
+big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --qg-size 16
 washdc_422_ntsc.y4m,--preset=faster --no-strong-intra-smoothing --keyint 1 --qg-size 16
 washdc_422_ntsc.y4m,--preset=medium --qp 40 --nr-inter 400 -F4
 washdc_422_ntsc.y4m,--preset=veryslow --pmode --tskip --rdoq-level 0
@@ -16,9 +15,10 @@
 RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --max-tu-size 8
 RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --constrained-intra --min-keyint 5 --keyint 10
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16 --tu-inter-depth 2 --limit-tu 3
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=fast --weightb --interlace bff
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryslow --limit-ref 1 --limit-mode --tskip --limit-tu 1
 
 # Main12 intraCost overflow bug test
 720p50_parkrun_ter.y4m,--preset medium

 
@@ -3,10 +3,9 @@
 # consider VBV tests a failure if new bitrate is more than 5% different
 # from the old bitrate
 # vbv-tolerance = 0.05
-
 big_buck_bunny_360p24.y4m,--preset=superfast --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 400 --hrd --aud --repeat-headers
 big_buck_bunny_360p24.y4m,--preset=medium --bitrate 1000 -F4 --cu-lossless --scaling-list default
-big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --pme --qg-size 16
+big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --qg-size 16
 washdc_422_ntsc.y4m,--preset=faster --no-strong-intra-smoothing --keyint 1 --qg-size 16
 washdc_422_ntsc.y4m,--preset=medium --qp 40 --nr-inter 400 -F4
 washdc_422_ntsc.y4m,--preset=veryslow --pmode --tskip --rdoq-level 0
@@ -16,9 +15,10 @@
 RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --max-tu-size 8
 RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --constrained-intra --min-keyint 5 --keyint 10
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16 --tu-inter-depth 2 --limit-tu 3
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=fast --weightb --interlace bff
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryslow --limit-ref 1 --limit-mode --tskip --limit-tu 1
 
 # Main12 intraCost overflow bug test
 720p50_parkrun_ter.y4m,--preset medium
​

x265_2.1.tar.gz/source/x265-extras.cpp -> x265_2.2.tar.gz/source/x265-extras.cpp Changed

@@ -64,6 +64,8 @@
                 fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
                 if (param.rc.rateControlMode == X265_RC_CRF)
                     fprintf(csvfp, "RateFactor, ");
+                if (param.rc.vbvBufferSize)
+                    fprintf(csvfp, "BufferFill, ");
                 if (param.bEnablePsnr)
                     fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
                 if (param.bEnableSsim)
@@ -132,6 +134,8 @@
     fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
     if (param.rc.rateControlMode == X265_RC_CRF)
         fprintf(csvfp, "%.3lf,", frameStats->rateFactor);
+    if (param.rc.vbvBufferSize)
+        fprintf(csvfp, "%.3lf,", frameStats->bufferFill);
     if (param.bEnablePsnr)
         fprintf(csvfp, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
     if (param.bEnableSsim)
@@ -187,7 +191,7 @@
     fflush(stderr);
 }
 
-void x265_csvlog_encode(FILE* csvfp, const x265_api& api, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv)
+void x265_csvlog_encode(FILE* csvfp, const char* version, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv)
 {
     if (!csvfp)
         return;
@@ -277,7 +281,7 @@
     else
         fprintf(csvfp, " -, -, -, -, -, -, -,");
 
-    fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, api.version_str);
+    fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, version);
 }
 
 /* The dithering algorithm is based on Sierra-2-4A error diffusion.

 
@@ -64,6 +64,8 @@
                 fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
                 if (param.rc.rateControlMode == X265_RC_CRF)
                     fprintf(csvfp, "RateFactor, ");
+                if (param.rc.vbvBufferSize)
+                    fprintf(csvfp, "BufferFill, ");
                 if (param.bEnablePsnr)
                     fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
                 if (param.bEnableSsim)
@@ -132,6 +134,8 @@
     fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
     if (param.rc.rateControlMode == X265_RC_CRF)
         fprintf(csvfp, "%.3lf,", frameStats->rateFactor);
+    if (param.rc.vbvBufferSize)
+        fprintf(csvfp, "%.3lf,", frameStats->bufferFill);
     if (param.bEnablePsnr)
         fprintf(csvfp, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
     if (param.bEnableSsim)
@@ -187,7 +191,7 @@
     fflush(stderr);
 }
 
-void x265_csvlog_encode(FILE* csvfp, const x265_api& api, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv)
+void x265_csvlog_encode(FILE* csvfp, const char* version, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv)
 {
     if (!csvfp)
         return;
@@ -277,7 +281,7 @@
     else
         fprintf(csvfp, " -, -, -, -, -, -, -,");
 
-    fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, api.version_str);
+    fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, version);
 }
 
 /* The dithering algorithm is based on Sierra-2-4A error diffusion.
​

x265_2.1.tar.gz/source/x265-extras.h -> x265_2.2.tar.gz/source/x265-extras.h Changed

 
@@ -53,7 +53,7 @@
 /* Log final encode statistics to the CSV file handle. 'argc' and 'argv' are
  * intended to be command line arguments passed to the encoder. Encode
  * statistics should be queried from the encoder just prior to closing it. */
-LIBAPI void x265_csvlog_encode(FILE* csvfp, const x265_api& api, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv);
+LIBAPI void x265_csvlog_encode(FILE* csvfp, const char* version, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv);
 
 /* In-place downshift from a bit-depth greater than 8 to a bit-depth of 8, using
  * the residual bits to dither each row. */
​

x265_2.1.tar.gz/source/x265.cpp -> x265_2.2.tar.gz/source/x265.cpp Changed

 
@@ -746,7 +746,7 @@
 
     api->encoder_get_stats(encoder, &stats, sizeof(stats));
     if (cliopt.csvfpt && !b_ctrl_c)
-        x265_csvlog_encode(cliopt.csvfpt, *api, *param, stats, cliopt.csvLogLevel, argc, argv);
+        x265_csvlog_encode(cliopt.csvfpt, api->version_str, *param, stats, cliopt.csvLogLevel, argc, argv);
     api->encoder_close(encoder);
 
     int64_t second_largest_pts = 0;
​

x265_2.1.tar.gz/source/x265.h -> x265_2.2.tar.gz/source/x265.h Changed

@@ -137,6 +137,7 @@
     double           avgPsyEnergy;
     double           avgResEnergy;
     double           avgLumaLevel;
+    double           bufferFill;
     uint64_t         bits;
     int              encoderOrder;
     int              poc;
@@ -289,6 +290,7 @@
     X265_HEX_SEARCH,
     X265_UMH_SEARCH,
     X265_STAR_SEARCH,
+    X265_SEA,
     X265_FULL_SEARCH
 } X265_ME_METHODS;
 
@@ -334,6 +336,9 @@
 #define X265_CPU_NEON            0x0000002  /* ARM NEON */
 #define X265_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
 
+/* IBM Power8 */
+#define X265_CPU_ALTIVEC         0x0000001
+
 #define X265_MAX_SUBPEL_LEVEL   7
 
 /* Log level */
@@ -351,6 +356,10 @@
 #define X265_REF_LIMIT_DEPTH    1
 #define X265_REF_LIMIT_CU       2
 
+#define X265_TU_LIMIT_BFS       1
+#define X265_TU_LIMIT_DFS       2
+#define X265_TU_LIMIT_NEIGH     4
+
 #define X265_BFRAME_MAX         16
 #define X265_MAX_FRAME_THREADS  16
 
@@ -456,7 +465,7 @@
 } x265_stats;
 
 /* String values accepted by x265_param_parse() (and CLI) for various parameters */
-static const char * const x265_motion_est_names[] = { "dia", "hex", "umh", "star", "full", 0 };
+static const char * const x265_motion_est_names[] = { "dia", "hex", "umh", "star", "sea", "full", 0 };
 static const char * const x265_source_csp_names[] = { "i400", "i420", "i422", "i444", "nv12", "nv16", 0 };
 static const char * const x265_video_format_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 };
 static const char * const x265_fullrange_names[] = { "limited", "full", 0 };
@@ -823,6 +832,10 @@
      * compressed by the DCT transforms, at the expense of much more compute */
     uint32_t  tuQTMaxIntraDepth;
 
+    /* Enable early exit decisions for inter coded blocks to avoid recursing to
+     * higher TU depths. Default: 0 */
+    uint32_t  limitTU;
+
     /* Set the amount of rate-distortion analysis to use within quant. 0 implies
      * no rate-distortion optimization. At level 1 rate-distortion cost is used to
      * find optimal rounding values for each level (and allows psy-rdoq to be
@@ -898,9 +911,9 @@
     /* Limit modes analyzed for each CU using cost metrics from the 4 sub-CUs */
     uint32_t limitModes;
 
-    /* ME search method (DIA, HEX, UMH, STAR, FULL). The search patterns
+    /* ME search method (DIA, HEX, UMH, STAR, SEA, FULL). The search patterns
      * (methods) are sorted in increasing complexity, with diamond being the
-     * simplest and fastest and full being the slowest.  DIA, HEX, and UMH were
+     * simplest and fastest and full being the slowest.  DIA, HEX, UMH and SEA were
      * adapted from x264 directly. STAR is an adaption of the HEVC reference
      * encoder's three step search, while full is a naive exhaustive search. The
      * default is the star search, it has a good balance of performance and
@@ -1300,15 +1313,28 @@
     /* Maximum of the picture order count */
     int log2MaxPocLsb;
 
-    /* Dicard SEI messages when printing */
-    int bDiscardSEI;
-    
-    /* Control removing optional vui information (timing, HRD info) to get low bitrate */
-    int       bDiscardOptionalVUI;
+    /* Emit VUI Timing info, an optional VUI field */
+    int bEmitVUITimingInfo;
+
+    /* Emit HRD Timing info */
+    int bEmitVUIHRDInfo;
 
     /* Maximum count of Slices of picture, the value range is [1, maximum rows] */
     unsigned int maxSlices;
 
+    /* Optimize QP in PPS based on statistics from previous GOP */
+    int bOptQpPPS;
+
+    /* Optimize ref list length in PPS based on stats from previous GOP */
+    int bOptRefListLengthPPS;
+
+    /* Enable storing commonly used RPS in SPS in multi pass mode */
+    int       bMultiPassOptRPS;
+
+    /* This value represents the percentage difference between the inter cost and
+    * intra cost of a frame used in scenecut detection. Default 5. */
+    double     scenecutBias;
+
 } x265_param;
 
 /* x265_param_alloc:

 
@@ -137,6 +137,7 @@
     double           avgPsyEnergy;
     double           avgResEnergy;
     double           avgLumaLevel;
+    double           bufferFill;
     uint64_t         bits;
     int              encoderOrder;
     int              poc;
@@ -289,6 +290,7 @@
     X265_HEX_SEARCH,
     X265_UMH_SEARCH,
     X265_STAR_SEARCH,
+    X265_SEA,
     X265_FULL_SEARCH
 } X265_ME_METHODS;
 
@@ -334,6 +336,9 @@
 #define X265_CPU_NEON            0x0000002  /* ARM NEON */
 #define X265_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
 
+/* IBM Power8 */
+#define X265_CPU_ALTIVEC         0x0000001
+
 #define X265_MAX_SUBPEL_LEVEL   7
 
 /* Log level */
@@ -351,6 +356,10 @@
 #define X265_REF_LIMIT_DEPTH    1
 #define X265_REF_LIMIT_CU       2
 
+#define X265_TU_LIMIT_BFS       1
+#define X265_TU_LIMIT_DFS       2
+#define X265_TU_LIMIT_NEIGH     4
+
 #define X265_BFRAME_MAX         16
 #define X265_MAX_FRAME_THREADS  16
 
@@ -456,7 +465,7 @@
 } x265_stats;
 
 /* String values accepted by x265_param_parse() (and CLI) for various parameters */
-static const char * const x265_motion_est_names[] = { "dia", "hex", "umh", "star", "full", 0 };
+static const char * const x265_motion_est_names[] = { "dia", "hex", "umh", "star", "sea", "full", 0 };
 static const char * const x265_source_csp_names[] = { "i400", "i420", "i422", "i444", "nv12", "nv16", 0 };
 static const char * const x265_video_format_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 };
 static const char * const x265_fullrange_names[] = { "limited", "full", 0 };
@@ -823,6 +832,10 @@
      * compressed by the DCT transforms, at the expense of much more compute */
     uint32_t  tuQTMaxIntraDepth;
 
+    /* Enable early exit decisions for inter coded blocks to avoid recursing to
+     * higher TU depths. Default: 0 */
+    uint32_t  limitTU;
+
     /* Set the amount of rate-distortion analysis to use within quant. 0 implies
      * no rate-distortion optimization. At level 1 rate-distortion cost is used to
      * find optimal rounding values for each level (and allows psy-rdoq to be
@@ -898,9 +911,9 @@
     /* Limit modes analyzed for each CU using cost metrics from the 4 sub-CUs */
     uint32_t limitModes;
 
-    /* ME search method (DIA, HEX, UMH, STAR, FULL). The search patterns
+    /* ME search method (DIA, HEX, UMH, STAR, SEA, FULL). The search patterns
      * (methods) are sorted in increasing complexity, with diamond being the
-     * simplest and fastest and full being the slowest.  DIA, HEX, and UMH were
+     * simplest and fastest and full being the slowest.  DIA, HEX, UMH and SEA were
      * adapted from x264 directly. STAR is an adaption of the HEVC reference
      * encoder's three step search, while full is a naive exhaustive search. The
      * default is the star search, it has a good balance of performance and
@@ -1300,15 +1313,28 @@
     /* Maximum of the picture order count */
     int log2MaxPocLsb;
 
-    /* Dicard SEI messages when printing */
-    int bDiscardSEI;
-    
-    /* Control removing optional vui information (timing, HRD info) to get low bitrate */
-    int       bDiscardOptionalVUI;
+    /* Emit VUI Timing info, an optional VUI field */
+    int bEmitVUITimingInfo;
+
+    /* Emit HRD Timing info */
+    int bEmitVUIHRDInfo;
 
     /* Maximum count of Slices of picture, the value range is [1, maximum rows] */
     unsigned int maxSlices;
 
+    /* Optimize QP in PPS based on statistics from previous GOP */
+    int bOptQpPPS;
+
+    /* Optimize ref list length in PPS based on stats from previous GOP */
+    int bOptRefListLengthPPS;
+
+    /* Enable storing commonly used RPS in SPS in multi pass mode */
+    int       bMultiPassOptRPS;
+
+    /* This value represents the percentage difference between the inter cost and
+    * intra cost of a frame used in scenecut detection. Default 5. */
+    double     scenecutBias;
+
 } x265_param;
 
 /* x265_param_alloc:
​

x265_2.1.tar.gz/source/x265cli.h -> x265_2.2.tar.gz/source/x265cli.h Changed

@@ -85,6 +85,7 @@
     { "max-tu-size",    required_argument, NULL, 0 },
     { "tu-intra-depth", required_argument, NULL, 0 },
     { "tu-inter-depth", required_argument, NULL, 0 },
+    { "limit-tu",       required_argument, NULL, 0 },
     { "me",             required_argument, NULL, 0 },
     { "subme",          required_argument, NULL, 'm' },
     { "merange",        required_argument, NULL, 0 },
@@ -120,6 +121,7 @@
     { "min-keyint",     required_argument, NULL, 'i' },
     { "scenecut",       required_argument, NULL, 0 },
     { "no-scenecut",          no_argument, NULL, 0 },
+    { "scenecut-bias",  required_argument, NULL, 0 },
     { "intra-refresh",        no_argument, NULL, 0 },
     { "rc-lookahead",   required_argument, NULL, 0 },
     { "lookahead-slices", required_argument, NULL, 0 },
@@ -208,8 +210,14 @@
     { "min-luma",       required_argument, NULL, 0 },
     { "max-luma",       required_argument, NULL, 0 },
     { "log2-max-poc-lsb", required_argument, NULL, 8 },
-    { "discard-sei",          no_argument, NULL, 0 },
-    { "discard-vui",          no_argument, NULL, 0 },
+    { "vui-timing-info",      no_argument, NULL, 0 },
+    { "no-vui-timing-info",   no_argument, NULL, 0 },
+    { "vui-hrd-info",         no_argument, NULL, 0 },
+    { "no-vui-hrd-info",      no_argument, NULL, 0 },
+    { "opt-qp-pps",           no_argument, NULL, 0 },
+    { "no-opt-qp-pps",        no_argument, NULL, 0 },
+    { "opt-ref-list-length-pps",         no_argument, NULL, 0 },
+    { "no-opt-ref-list-length-pps",      no_argument, NULL, 0 },
     { "no-dither",            no_argument, NULL, 0 },
     { "dither",               no_argument, NULL, 0 },
     { "no-repeat-headers",    no_argument, NULL, 0 },
@@ -229,6 +237,8 @@
     { "pass",           required_argument, NULL, 0 },
     { "slow-firstpass",       no_argument, NULL, 0 },
     { "no-slow-firstpass",    no_argument, NULL, 0 },
+    { "multi-pass-opt-rps",   no_argument, NULL, 0 },
+    { "no-multi-pass-opt-rps", no_argument, NULL, 0 },
     { "analysis-mode",  required_argument, NULL, 0 },
     { "analysis-file",  required_argument, NULL, 0 },
     { "strict-cbr",           no_argument, NULL, 0 },
@@ -317,6 +327,7 @@
     H0("   --max-tu-size <32|16|8|4>     Maximum TU size (WxH). Default %d\n", param->maxTUSize);
     H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
+    H0("   --limit-tu <0..4>             Enable early exit from TU recursion for inter coded blocks. Default %d\n", param->limitTU);
     H0("\nAnalysis:\n");
     H0("   --rd <1..6>                   Level of RDO in mode decision 1:least....6:full RDO. Default %d\n", param->rdLevel);
     H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
@@ -357,6 +368,7 @@
     H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
     H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
     H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
+    H1("   --scenecut-bias <0..100.0>    Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
     H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
     H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
     H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
@@ -448,8 +460,11 @@
     H0("   --[no-]aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
     H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
     H0("   --log2-max-poc-lsb <integer>  Maximum of the picture order count\n");
-    H0("   --discard-sei                 Discard SEI packets in bitstream. Default %s\n", OPT(param->bDiscardSEI));
-    H0("   --discard-vui                 Discard optional VUI information from the bistream. Default %s\n", OPT(param->bDiscardOptionalVUI));
+    H0("   --[no-]vui-timing-info        Emit VUI timing information in the bistream. Default %s\n", OPT(param->bEmitVUITimingInfo));
+    H0("   --[no-]vui-hrd-info           Emit VUI HRD information in the bistream. Default %s\n", OPT(param->bEmitVUIHRDInfo));
+    H0("   --[no-]opt-qp-pps             Dynamically optimize QP in PPS (instead of default 26) based on QPs in previous GOP. Default %s\n", OPT(param->bOptQpPPS));
+    H0("   --[no-]opt-ref-list-length-pps  Dynamically set L0 and L1 ref list length in PPS (instead of default 0) based on values in last GOP. Default %s\n", OPT(param->bOptRefListLengthPPS));
+    H0("   --[no-]multi-pass-opt-rps     Enable storing commonly used RPS in SPS in multi pass mode. Default %s\n", OPT(param->bMultiPassOptRPS));
     H1("\nReconstructed video options (debugging):\n");
     H1("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
     H1("   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");

 
@@ -85,6 +85,7 @@
     { "max-tu-size",    required_argument, NULL, 0 },
     { "tu-intra-depth", required_argument, NULL, 0 },
     { "tu-inter-depth", required_argument, NULL, 0 },
+    { "limit-tu",       required_argument, NULL, 0 },
     { "me",             required_argument, NULL, 0 },
     { "subme",          required_argument, NULL, 'm' },
     { "merange",        required_argument, NULL, 0 },
@@ -120,6 +121,7 @@
     { "min-keyint",     required_argument, NULL, 'i' },
     { "scenecut",       required_argument, NULL, 0 },
     { "no-scenecut",          no_argument, NULL, 0 },
+    { "scenecut-bias",  required_argument, NULL, 0 },
     { "intra-refresh",        no_argument, NULL, 0 },
     { "rc-lookahead",   required_argument, NULL, 0 },
     { "lookahead-slices", required_argument, NULL, 0 },
@@ -208,8 +210,14 @@
     { "min-luma",       required_argument, NULL, 0 },
     { "max-luma",       required_argument, NULL, 0 },
     { "log2-max-poc-lsb", required_argument, NULL, 8 },
-    { "discard-sei",          no_argument, NULL, 0 },
-    { "discard-vui",          no_argument, NULL, 0 },
+    { "vui-timing-info",      no_argument, NULL, 0 },
+    { "no-vui-timing-info",   no_argument, NULL, 0 },
+    { "vui-hrd-info",         no_argument, NULL, 0 },
+    { "no-vui-hrd-info",      no_argument, NULL, 0 },
+    { "opt-qp-pps",           no_argument, NULL, 0 },
+    { "no-opt-qp-pps",        no_argument, NULL, 0 },
+    { "opt-ref-list-length-pps",         no_argument, NULL, 0 },
+    { "no-opt-ref-list-length-pps",      no_argument, NULL, 0 },
     { "no-dither",            no_argument, NULL, 0 },
     { "dither",               no_argument, NULL, 0 },
     { "no-repeat-headers",    no_argument, NULL, 0 },
@@ -229,6 +237,8 @@
     { "pass",           required_argument, NULL, 0 },
     { "slow-firstpass",       no_argument, NULL, 0 },
     { "no-slow-firstpass",    no_argument, NULL, 0 },
+    { "multi-pass-opt-rps",   no_argument, NULL, 0 },
+    { "no-multi-pass-opt-rps", no_argument, NULL, 0 },
     { "analysis-mode",  required_argument, NULL, 0 },
     { "analysis-file",  required_argument, NULL, 0 },
     { "strict-cbr",           no_argument, NULL, 0 },
@@ -317,6 +327,7 @@
     H0("   --max-tu-size <32|16|8|4>     Maximum TU size (WxH). Default %d\n", param->maxTUSize);
     H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
+    H0("   --limit-tu <0..4>             Enable early exit from TU recursion for inter coded blocks. Default %d\n", param->limitTU);
     H0("\nAnalysis:\n");
     H0("   --rd <1..6>                   Level of RDO in mode decision 1:least....6:full RDO. Default %d\n", param->rdLevel);
     H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
@@ -357,6 +368,7 @@
     H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
     H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
     H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
+    H1("   --scenecut-bias <0..100.0>    Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
     H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
     H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
     H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
@@ -448,8 +460,11 @@
     H0("   --[no-]aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
     H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
     H0("   --log2-max-poc-lsb <integer>  Maximum of the picture order count\n");
-    H0("   --discard-sei                 Discard SEI packets in bitstream. Default %s\n", OPT(param->bDiscardSEI));
-    H0("   --discard-vui                 Discard optional VUI information from the bistream. Default %s\n", OPT(param->bDiscardOptionalVUI));
+    H0("   --[no-]vui-timing-info        Emit VUI timing information in the bistream. Default %s\n", OPT(param->bEmitVUITimingInfo));
+    H0("   --[no-]vui-hrd-info           Emit VUI HRD information in the bistream. Default %s\n", OPT(param->bEmitVUIHRDInfo));
+    H0("   --[no-]opt-qp-pps             Dynamically optimize QP in PPS (instead of default 26) based on QPs in previous GOP. Default %s\n", OPT(param->bOptQpPPS));
+    H0("   --[no-]opt-ref-list-length-pps  Dynamically set L0 and L1 ref list length in PPS (instead of default 0) based on values in last GOP. Default %s\n", OPT(param->bOptRefListLengthPPS));
+    H0("   --[no-]multi-pass-opt-rps     Enable storing commonly used RPS in SPS in multi pass mode. Default %s\n", OPT(param->bMultiPassOptRPS));
     H1("\nReconstructed video options (debugging):\n");
     H1("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
     H1("   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
​