Packman Build Service PMBS

x265.changes Changed

x265.spec Changed

arm.patch Changed

@@ -1,8 +1,8 @@
-Index: x265_2.0/source/CMakeLists.txt
+Index: x265_2.1/source/CMakeLists.txt
 ===================================================================
---- x265_2.0.orig/source/CMakeLists.txt
-+++ x265_2.0/source/CMakeLists.txt
-@@ -60,15 +60,22 @@
+--- x265_2.1.orig/source/CMakeLists.txt
++++ x265_2.1/source/CMakeLists.txt
+@@ -60,15 +60,22 @@ elseif(POWERMATCH GREATER "-1")
      message(STATUS "Detected POWER target processor")
      set(POWER 1)
      add_definitions(-DX265_ARCH_POWER=1)
@@ -34,8 +34,8 @@
  else()
      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
-@@ -186,18 +193,9 @@
-             add_definitions(-march=i686)
+@@ -190,18 +197,9 @@ if(GCC)
+             endif()
          endif()
      endif()
 -    if(ARM AND CROSS_COMPILE_ARM)
@@ -48,18 +48,17 @@
 -        else()
 -            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
 -        endif()
--    endif()
--    add_definitions(${ARM_ARGS})
 +    if(ARMV7)
 +        add_definitions(-fPIC)
-+    endif()
+     endif()
+-    add_definitions(${ARM_ARGS})
      if(FPROFILE_GENERATE)
          if(INTEL_CXX)
              add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
-Index: x265_2.0/source/common/cpu.cpp
+Index: x265_2.1/source/common/cpu.cpp
 ===================================================================
---- x265_2.0.orig/source/common/cpu.cpp
-+++ x265_2.0/source/common/cpu.cpp
+--- x265_2.1.orig/source/common/cpu.cpp
++++ x265_2.1/source/common/cpu.cpp
 @@ -37,7 +37,7 @@
  #include <machine/cpu.h>
  #endif
@@ -69,7 +68,7 @@
  #include <signal.h>
  #include <setjmp.h>
  static sigjmp_buf jmpbuf;
-@@ -340,7 +340,6 @@
+@@ -340,7 +340,6 @@ uint32_t cpu_detect(void)
      }
  
      canjump = 1;
@@ -77,7 +76,7 @@
      canjump = 0;
      signal(SIGILL, oldsig);
  #endif // if !HAVE_NEON
-@@ -356,7 +355,7 @@
+@@ -356,7 +355,7 @@ uint32_t cpu_detect(void)
      // which may result in incorrect detection and the counters stuck enabled.
      // right now Apple does not seem to support performance counters for this test
  #ifndef __MACH__

x265_2.0.tar.gz/.hg_archival.txt -> x265_2.1.tar.gz/.hg_archival.txt Changed

x265_2.0.tar.gz/.hgtags -> x265_2.1.tar.gz/.hgtags Changed

x265_2.0.tar.gz/doc/reST/cli.rst -> x265_2.1.tar.gz/doc/reST/cli.rst Changed

@@ -59,10 +59,9 @@
 
 .. option:: --log-level <integer|string>
 
-	Logging level. Debug level enables per-frame QP, metric, and bitrate
-	logging. If a CSV file is being generated, frame level makes the log
-	be per-frame rather than per-encode. Full level enables hash and
-	weight logging. -1 disables all logging, except certain fatal
+	Controls the level of information displayed on the console. Debug level
+	enables per-frame QP, metric, and bitrate logging. Full level enables
+	hash and weight logging. -1 disables all logging, except certain fatal
 	errors, and can be specified by the string "none".
 
 	0. error
@@ -79,8 +78,8 @@
 
 .. option:: --csv <filename>
 
-	Writes encoding results to a comma separated value log file. Creates
-	the file if it doesnt already exist. If :option:`--csv-log-level` is 0, 
+	Write encoding statistics to a Comma Separated Values log file. Creates
+	the file if it doesn't already exist. If :option:`--csv-log-level` is 0, 
 	it adds one line per run. If :option:`--csv-log-level` is greater than
 	0, it writes one line per frame. Default none
 
@@ -128,12 +127,13 @@
 
 .. option:: --csv-log-level <integer>
 
-        CSV logging level. Default 0
-        0. summary
-        1. frame level logging
-        2. frame level logging with performance statistics
+    Controls the level of detail (and size) of --csv log files
+		
+    0. summary **(default)**
+    1. frame level logging
+    2. frame level logging with performance statistics
 
-        **CLI ONLY**
+    **CLI ONLY**
 
 .. option:: --ssim, --no-ssim
 
@@ -334,6 +334,17 @@
 
 	**Values:** psnr, ssim, grain, zero-latency, fast-decode.
 
+.. option:: --slices <integer>
+
+	Encode each incoming frame as multiple parallel slices that may be decoded
+	independently. Support available only for rectangular slices that cover the
+	entire width of the image. 
+
+	Recommended for improving encoder performance only if frame-parallelism and
+	WPP are unable to maximize utilization on given hardware.
+
+	Default: 1 slice per frame. **Experimental feature**
+
 Input/Output File Options
 =========================
 
@@ -474,21 +485,22 @@
 
 	8bit profiles::
 
-	main, main-intra, mainstillpicture (or msp for short)
-	main444-8 main444-intra main444-stillpicture
+	* main, main-intra, mainstillpicture (or msp for short)
+	* main444-8, main444-intra, main444-stillpicture
+
 	See note below on signaling intra and stillpicture profiles.
 	
 	10bit profiles::
 
-	main10, main10-intra
-	main422-10, main422-10-intra
-	main444-10, main444-10-intra
+	* main10, main10-intra
+	* main422-10, main422-10-intra
+	* main444-10, main444-10-intra
 
 	12bit profiles::
 
-	main12, main12-intra
-	main422-12, main422-12-intra
-	main444-12, main444-12-intra
+	* main12, main12-intra
+	* main422-12, main422-12-intra
+	* main444-12, main444-12-intra
 
 
 	**CLI ONLY**
@@ -1009,6 +1021,11 @@
 
 	Enable weighted prediction in B slices. Default disabled
 
+.. option:: --analyze-src-pics, --no-analyze-src-pics
+
+    Enable motion estimation with source frame pixels. In this mode,
+    motion estimation can be computed independently. Default disabled.
+
 Spatial/intra options
 =====================
 
@@ -1123,11 +1140,9 @@
 
 .. option:: --min-keyint, -i <integer>
 
-	Minimum GOP size. Scenecuts closer together than this are coded as I
-	or P, not IDR. Minimum keyint is clamped to be at least half of
-	:option:`--keyint`. If you wish to force regular keyframe intervals
-	and disable adaptive I frame placement, you must use
-	:option:`--no-scenecut`.
+	Minimum GOP size. Scenecuts beyond this interval are coded as IDR and start
+	a new keyframe, while scenecuts closer together are coded as I or P. For
+	fixed keyframe interval, set value to be equal to keyint.
 
 	**Range of values:** >=0 (0: auto)
 
@@ -1314,20 +1329,25 @@
 	0. disabled
 	1. AQ enabled **(default)**
 	2. AQ enabled with auto-variance
-	3. AQ enabled with auto-variance and bias to dark scenes
+	3. AQ enabled with auto-variance and bias to dark scenes. This is
+	   recommended for 8-bit encodes or low-bitrate 10-bit encodes, to
+	   prevent color banding/blocking.
 
 .. option:: --aq-strength <float>
 
 	Adjust the strength of the adaptive quantization offsets. Setting
-	:option:`--aq-strength` to 0 disables AQ. Default 1.0.
+	:option:`--aq-strength` to 0 disables AQ. At aq-modes 2 and 3, high 
+	aq-strengths will lead to high QP offsets resulting in a large 
+	difference in achieved bitrates. 
 
+	Default 1.0.
 	**Range of values:** 0.0 to 3.0
 
-.. option:: --qg-size <64|32|16>
+.. option:: --qg-size <64|32|16|8>
 
 	Enable adaptive quantization for sub-CTUs. This parameter specifies 
 	the minimum CU size at which QP can be adjusted, ie. Quantization Group
-	size. Allowed range of values are 64, 32, 16 provided this falls within 
+	size. Allowed range of values are 64, 32, 16, 8 provided this falls within 
 	the inclusive range [maxCUSize, minCUSize]. Experimental.
 	Default: same as maxCUSize
 
@@ -1434,6 +1454,14 @@
 	The maximum single adjustment in QP allowed to rate control. Default
 	4
 	
+.. option:: --qpmin <integer>
+
+	Sets a hard lower limit on the QP that rate control is allowed to use. Default 0
+
+.. option:: --qpmax <integer>
+
+	Sets a hard upper limit on the QP that rate control is allowed to use. Default 69
+	
 .. option:: --rc-grain, --no-rc-grain
 
    Enables a specialised ratecontrol algorithm for film grain content. This 
@@ -1722,7 +1750,7 @@
 	Example for MaxCLL=1000 candela per square meter, MaxFALL=400
 	candela per square meter:
 
-		--max-cll “1000,400”
+		--max-cll 1000,400
 
 	Note that this string value will need to be escaped or quoted to
 	protect against shell expansion on many platforms. No default.
@@ -1801,6 +1829,20 @@
 	PbBbP. You probably also want :option:`--no-scenecut` and a keyframe
 	interval that is a multiple of 4.
 
+.. option:: --log2-max-poc-lsb <integer>
+
+  Number of bits used to signal the picture order count LSB, i.e. log2 of
+  the maximum POC LSB value. Default 8
+
+.. option:: --discard-sei
+
+  Discard generated SEI messages from the final bitstream. HDR-related SEI
+  messages are always written, regardless of this option. Default disabled.
+	
+.. option:: --discard-vui
+
+	Discard optional VUI information (timing, HRD info) from the
+	bitstream. Default disabled.
+
 Debugging options
 =================

x265_2.0.tar.gz/source/CMakeLists.txt -> x265_2.1.tar.gz/source/CMakeLists.txt Changed

x265_2.0.tar.gz/source/common/arm/asm-primitives.cpp -> x265_2.1.tar.gz/source/common/arm/asm-primitives.cpp Changed

x265_2.0.tar.gz/source/common/arm/dct-a.S -> x265_2.1.tar.gz/source/common/arm/dct-a.S Changed

x265_2.0.tar.gz/source/common/arm/pixel-util.S -> x265_2.1.tar.gz/source/common/arm/pixel-util.S Changed

@@ -2449,3 +2449,191 @@
     bx              lr
 endfunc
 
+function x265_psyCost_8x8_neon
+
+    vpush           {q4-q7}
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d1}, [r0], r1
+    vmovl.u8        q8, d0
+    vld1.8          {d2}, [r0], r1
+    vmovl.u8        q9, d1
+    vld1.8          {d3}, [r0], r1
+    vmovl.u8        q10, d2
+    vld1.8          {d4}, [r0], r1
+    vmovl.u8        q11, d3
+    vld1.8          {d5}, [r0], r1
+    vmovl.u8        q12, d4
+    vld1.8          {d6}, [r0], r1
+    vmovl.u8        q13, d5
+    vld1.8          {d7}, [r0], r1
+    vmovl.u8        q14, d6
+    vmovl.u8        q15, d7
+
+    // SAD Stage-0
+    vadd.u16        q4, q8, q9
+    vadd.u16        q5, q10, q11
+    vadd.u16        q6, q12, q13
+    vadd.u16        q7, q14, q15
+
+    // SAD Stage-1
+    vadd.u16        q4, q5
+    vadd.u16        q6, q7
+    vadd.u16        q4, q6
+    vpadd.u16       d8, d9
+    vpaddl.u16      d8, d8
+    vpadd.u32       d8, d8
+    vshr.u32        d8, #2
+
+    // sa8d
+    SUMSUB_AB       q0,  q1,  q8,  q9
+    SUMSUB_AB       q2,  q3,  q10, q11
+    SUMSUB_AB       q8,  q10, q0,  q2
+    SUMSUB_AB       q9,  q11, q1,  q3
+
+    HADAMARD4_V     q12, q13, q14, q15,  q0,  q1,  q2,  q3
+
+    SUMSUB_ABCD     q0,  q8,  q1,  q9,   q8,  q12, q9,  q13
+    SUMSUB_AB       q2,  q10, q10, q14
+    vtrn.16         q8,  q9
+    SUMSUB_AB       q3,  q11, q11, q15
+    vtrn.16         q0,  q1
+    SUMSUB_AB       q12, q13, q8,  q9
+    vtrn.16         q10, q11
+    SUMSUB_AB       q8,  q9,  q0,  q1
+    vtrn.16         q2,  q3
+    SUMSUB_AB       q14, q15, q10, q11
+    vadd.i16        q10, q2,  q3
+    vtrn.32         q12, q14
+    vsub.i16        q11, q2,  q3
+    vtrn.32         q13, q15
+    SUMSUB_AB       q0,  q2,  q12, q14
+    vtrn.32         q8,  q10
+    SUMSUB_AB       q1,  q3,  q13, q15
+    vtrn.32         q9,  q11
+    SUMSUB_AB       q12, q14, q8,  q10
+    SUMSUB_AB       q13, q15, q9,  q11
+
+    vswp            d1,  d24
+    ABS2            q0,  q12
+    vswp            d3,  d26
+    ABS2            q1,  q13
+    vswp            d5,  d28
+    ABS2            q2,  q14
+    vswp            d7,  d30
+    ABS2            q3,  q15
+    vmax.s16        q8,  q0,  q12
+    vmax.s16        q9,  q1,  q13
+    vmax.s16        q10, q2,  q14
+    vmax.s16        q11, q3,  q15
+    vadd.i16        q8,  q8,  q9
+    vadd.i16        q9,  q10, q11
+    vadd.u16        q0, q8, q9
+    vadd.u16        d0, d1
+    vpaddl.u16      d0, d0
+    vpadd.u32       d0, d0
+    vmov.32         r0, d0[0]
+    add             r0, r0, #1
+    lsr             r0, r0, #1
+//-------------------------------------------------------------
+    vld1.8          d0, [r2], r3
+    vld1.8          d1, [r2], r3
+    vmovl.u8        q8, d0
+    vld1.8          d2, [r2], r3
+    vmovl.u8        q9, d1
+    vld1.8          d3, [r2], r3
+    vmovl.u8        q10, d2
+    vld1.8          d4, [r2], r3
+    vmovl.u8        q11, d3
+    vld1.8          d5, [r2], r3
+    vmovl.u8        q12, d4
+    vld1.8          d6, [r2], r3
+    vmovl.u8        q13, d5
+    vld1.8          d7, [r2], r3
+    vmovl.u8        q14, d6
+    vmovl.u8        q15, d7
+
+    // SAD Stage-0
+    vadd.u16       q5, q8, q9
+    vadd.u16       q6, q10, q11
+    vadd.u16       q7, q12, q13
+    vadd.u16       q0, q14, q15
+
+    // SAD Stage-1
+    vadd.u16        q5, q6
+    vadd.u16        q7, q0
+    vadd.u16        q5, q7
+    vadd.u16        d10, d11
+    vpaddl.u16      d10, d10
+    vpadd.u32       d10, d10
+    vshr.u32        d10, #2
+
+    // sa8d
+    SUMSUB_AB       q0,  q1,  q8,  q9
+    SUMSUB_AB       q2,  q3,  q10, q11
+    SUMSUB_AB       q8,  q10, q0,  q2
+    SUMSUB_AB       q9,  q11, q1,  q3
+
+    HADAMARD4_V     q12, q13, q14, q15,  q0,  q1,  q2,  q3
+
+    SUMSUB_ABCD     q0,  q8,  q1,  q9,   q8,  q12, q9,  q13
+    SUMSUB_AB       q2,  q10, q10, q14
+    vtrn.16         q8,  q9
+    SUMSUB_AB       q3,  q11, q11, q15
+    vtrn.16         q0,  q1
+    SUMSUB_AB       q12, q13, q8,  q9
+    vtrn.16         q10, q11
+    SUMSUB_AB       q8,  q9,  q0,  q1
+    vtrn.16         q2,  q3
+    SUMSUB_AB       q14, q15, q10, q11
+    vadd.i16        q10, q2,  q3
+    vtrn.32         q12, q14
+    vsub.i16        q11, q2,  q3
+    vtrn.32         q13, q15
+    SUMSUB_AB       q0,  q2,  q12, q14
+    vtrn.32         q8,  q10
+    SUMSUB_AB       q1,  q3,  q13, q15
+    vtrn.32         q9,  q11
+    SUMSUB_AB       q12, q14, q8,  q10
+    SUMSUB_AB       q13, q15, q9,  q11
+
+    vswp            d1,  d24
+    ABS2            q0,  q12
+    vswp            d3,  d26
+    ABS2            q1,  q13
+    vswp            d5,  d28
+    ABS2            q2,  q14
+    vswp            d7,  d30
+    ABS2            q3,  q15
+    vmax.s16        q8,  q0,  q12
+    vmax.s16        q9,  q1,  q13
+    vmax.s16        q10, q2,  q14
+    vmax.s16        q11, q3,  q15
+    vadd.i16        q8,  q8,  q9
+    vadd.i16        q9,  q10, q11
+    vadd.u16        q0, q8, q9
+    vadd.u16        d0, d1
+    vpaddl.u16      d0, d0
+    vpadd.u32       d0, d0
+    vmov.32         r2, d0[0]
+    add             r2, r2, #1
+    lsr             r2, r2, #1
+
+    // SAD & SA8D Final Stage
+    vmov.32         r1, d8[0]
+    sub             r0, r1
+    vmov.32         r3, d10[0]
+    sub             r2, r3
+    cmp             r0, r2
+    bgt             subr0
+    sub             r0, r2, r0
+    b               end
+subr0:
+    sub             r0, r2
+end:
+
+    vpop            {q4-q7}
+    bx              lr
+endfunc
+
+

x265_2.0.tar.gz/source/common/arm/pixel-util.h -> x265_2.1.tar.gz/source/common/arm/pixel-util.h Changed

x265_2.0.tar.gz/source/common/common.h -> x265_2.1.tar.gz/source/common/common.h Changed

@@ -71,6 +71,7 @@
 #define NUM_INTRA_MODE 35
 
 #if defined(__GNUC__)
+#define ALIGN_VAR_4(T, var)  T var __attribute__((aligned(4)))
 #define ALIGN_VAR_8(T, var)  T var __attribute__((aligned(8)))
 #define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
 #define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
@@ -81,6 +82,7 @@
 
 #elif defined(_MSC_VER)
 
+#define ALIGN_VAR_4(T, var)  __declspec(align(4)) T var
 #define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
 #define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
 #define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
@@ -157,7 +159,6 @@
 #define MIN_QPSCALE     0.21249999999999999
 #define MAX_MAX_QPSCALE 615.46574234477100
 
-#define BITS_FOR_POC 8
 
 template<typename T>
 inline T x265_min(T a, T b) { return a < b ? a : b; }
@@ -255,7 +256,9 @@
 #define LOG2_UNIT_SIZE          2                           // log2(unitSize)
 #define UNIT_SIZE               (1 << LOG2_UNIT_SIZE)       // unit size of CU partition
 
-#define MAX_NUM_PARTITIONS      256
+#define LOG2_RASTER_SIZE        (MAX_LOG2_CU_SIZE - LOG2_UNIT_SIZE)
+#define RASTER_SIZE             (1 << LOG2_RASTER_SIZE)
+#define MAX_NUM_PARTITIONS      (RASTER_SIZE * RASTER_SIZE)
 #define NUM_4x4_PARTITIONS      (1U << (g_unitSizeDepth << 1)) // number of 4x4 units in max CU size
 
 #define MIN_PU_SIZE             4

x265_2.0.tar.gz/source/common/constants.cpp -> x265_2.1.tar.gz/source/common/constants.cpp Changed

@@ -166,9 +166,48 @@
 uint32_t g_maxCUSize     = MAX_CU_SIZE;
 uint32_t g_unitSizeDepth = NUM_CU_DEPTH;
 uint32_t g_maxCUDepth    = NUM_CU_DEPTH - 1;
-uint32_t g_zscanToRaster[MAX_NUM_PARTITIONS] = { 0, };
-uint32_t g_rasterToZscan[MAX_NUM_PARTITIONS] = { 0, };
-
+uint32_t g_maxSlices     = 1;
+
+const uint32_t g_zscanToRaster[MAX_NUM_PARTITIONS] =
+{
+    0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, 0x20, 0x21, 0x30, 0x31, 0x22, 0x23, 0x32, 0x33,
+    0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17, 0x24, 0x25, 0x34, 0x35, 0x26, 0x27, 0x36, 0x37,
+    0x40, 0x41, 0x50, 0x51, 0x42, 0x43, 0x52, 0x53, 0x60, 0x61, 0x70, 0x71, 0x62, 0x63, 0x72, 0x73,
+    0x44, 0x45, 0x54, 0x55, 0x46, 0x47, 0x56, 0x57, 0x64, 0x65, 0x74, 0x75, 0x66, 0x67, 0x76, 0x77,
+    0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B, 0x28, 0x29, 0x38, 0x39, 0x2A, 0x2B, 0x3A, 0x3B,
+    0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F, 0x2C, 0x2D, 0x3C, 0x3D, 0x2E, 0x2F, 0x3E, 0x3F,
+    0x48, 0x49, 0x58, 0x59, 0x4A, 0x4B, 0x5A, 0x5B, 0x68, 0x69, 0x78, 0x79, 0x6A, 0x6B, 0x7A, 0x7B,
+    0x4C, 0x4D, 0x5C, 0x5D, 0x4E, 0x4F, 0x5E, 0x5F, 0x6C, 0x6D, 0x7C, 0x7D, 0x6E, 0x6F, 0x7E, 0x7F,
+    0x80, 0x81, 0x90, 0x91, 0x82, 0x83, 0x92, 0x93, 0xA0, 0xA1, 0xB0, 0xB1, 0xA2, 0xA3, 0xB2, 0xB3,
+    0x84, 0x85, 0x94, 0x95, 0x86, 0x87, 0x96, 0x97, 0xA4, 0xA5, 0xB4, 0xB5, 0xA6, 0xA7, 0xB6, 0xB7,
+    0xC0, 0xC1, 0xD0, 0xD1, 0xC2, 0xC3, 0xD2, 0xD3, 0xE0, 0xE1, 0xF0, 0xF1, 0xE2, 0xE3, 0xF2, 0xF3,
+    0xC4, 0xC5, 0xD4, 0xD5, 0xC6, 0xC7, 0xD6, 0xD7, 0xE4, 0xE5, 0xF4, 0xF5, 0xE6, 0xE7, 0xF6, 0xF7,
+    0x88, 0x89, 0x98, 0x99, 0x8A, 0x8B, 0x9A, 0x9B, 0xA8, 0xA9, 0xB8, 0xB9, 0xAA, 0xAB, 0xBA, 0xBB,
+    0x8C, 0x8D, 0x9C, 0x9D, 0x8E, 0x8F, 0x9E, 0x9F, 0xAC, 0xAD, 0xBC, 0xBD, 0xAE, 0xAF, 0xBE, 0xBF,
+    0xC8, 0xC9, 0xD8, 0xD9, 0xCA, 0xCB, 0xDA, 0xDB, 0xE8, 0xE9, 0xF8, 0xF9, 0xEA, 0xEB, 0xFA, 0xFB,
+    0xCC, 0xCD, 0xDC, 0xDD, 0xCE, 0xCF, 0xDE, 0xDF, 0xEC, 0xED, 0xFC, 0xFD, 0xEE, 0xEF, 0xFE, 0xFF
+};
+
+const uint32_t g_rasterToZscan[MAX_NUM_PARTITIONS] =
+{
+    0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15, 0x40, 0x41, 0x44, 0x45, 0x50, 0x51, 0x54, 0x55,
+    0x02, 0x03, 0x06, 0x07, 0x12, 0x13, 0x16, 0x17, 0x42, 0x43, 0x46, 0x47, 0x52, 0x53, 0x56, 0x57,
+    0x08, 0x09, 0x0C, 0x0D, 0x18, 0x19, 0x1C, 0x1D, 0x48, 0x49, 0x4C, 0x4D, 0x58, 0x59, 0x5C, 0x5D,
+    0x0A, 0x0B, 0x0E, 0x0F, 0x1A, 0x1B, 0x1E, 0x1F, 0x4A, 0x4B, 0x4E, 0x4F, 0x5A, 0x5B, 0x5E, 0x5F,
+    0x20, 0x21, 0x24, 0x25, 0x30, 0x31, 0x34, 0x35, 0x60, 0x61, 0x64, 0x65, 0x70, 0x71, 0x74, 0x75,
+    0x22, 0x23, 0x26, 0x27, 0x32, 0x33, 0x36, 0x37, 0x62, 0x63, 0x66, 0x67, 0x72, 0x73, 0x76, 0x77,
+    0x28, 0x29, 0x2C, 0x2D, 0x38, 0x39, 0x3C, 0x3D, 0x68, 0x69, 0x6C, 0x6D, 0x78, 0x79, 0x7C, 0x7D,
+    0x2A, 0x2B, 0x2E, 0x2F, 0x3A, 0x3B, 0x3E, 0x3F, 0x6A, 0x6B, 0x6E, 0x6F, 0x7A, 0x7B, 0x7E, 0x7F,
+    0x80, 0x81, 0x84, 0x85, 0x90, 0x91, 0x94, 0x95, 0xC0, 0xC1, 0xC4, 0xC5, 0xD0, 0xD1, 0xD4, 0xD5,
+    0x82, 0x83, 0x86, 0x87, 0x92, 0x93, 0x96, 0x97, 0xC2, 0xC3, 0xC6, 0xC7, 0xD2, 0xD3, 0xD6, 0xD7,
+    0x88, 0x89, 0x8C, 0x8D, 0x98, 0x99, 0x9C, 0x9D, 0xC8, 0xC9, 0xCC, 0xCD, 0xD8, 0xD9, 0xDC, 0xDD,
+    0x8A, 0x8B, 0x8E, 0x8F, 0x9A, 0x9B, 0x9E, 0x9F, 0xCA, 0xCB, 0xCE, 0xCF, 0xDA, 0xDB, 0xDE, 0xDF,
+    0xA0, 0xA1, 0xA4, 0xA5, 0xB0, 0xB1, 0xB4, 0xB5, 0xE0, 0xE1, 0xE4, 0xE5, 0xF0, 0xF1, 0xF4, 0xF5,
+    0xA2, 0xA3, 0xA6, 0xA7, 0xB2, 0xB3, 0xB6, 0xB7, 0xE2, 0xE3, 0xE6, 0xE7, 0xF2, 0xF3, 0xF6, 0xF7,
+    0xA8, 0xA9, 0xAC, 0xAD, 0xB8, 0xB9, 0xBC, 0xBD, 0xE8, 0xE9, 0xEC, 0xED, 0xF8, 0xF9, 0xFC, 0xFD,
+    0xAA, 0xAB, 0xAE, 0xAF, 0xBA, 0xBB, 0xBE, 0xBF, 0xEA, 0xEB, 0xEE, 0xEF, 0xFA, 0xFB, 0xFE, 0xFF
+};
+    
 const uint8_t g_zscanToPelX[MAX_NUM_PARTITIONS] =
 {
     0, 4, 0, 4, 8, 12, 8, 12, 0, 4, 0, 4, 8, 12, 8, 12,
@@ -209,33 +248,6 @@
     48, 48, 52, 52, 48, 48, 52, 52, 56, 56, 60, 60, 56, 56, 60, 60
 };
 
-void initZscanToRaster(uint32_t maxFullDepth, uint32_t depth, uint32_t startVal, uint32_t*& curIdx)
-{
-    uint32_t stride = 1 << maxFullDepth;
-
-    if (depth > maxFullDepth)
-    {
-        curIdx[0] = startVal;
-        curIdx++;
-    }
-    else
-    {
-        int step = stride >> depth;
-        initZscanToRaster(maxFullDepth, depth + 1, startVal,                        curIdx);
-        initZscanToRaster(maxFullDepth, depth + 1, startVal + step,                 curIdx);
-        initZscanToRaster(maxFullDepth, depth + 1, startVal + step * stride,        curIdx);
-        initZscanToRaster(maxFullDepth, depth + 1, startVal + step * stride + step, curIdx);
-    }
-}
-
-void initRasterToZscan(uint32_t maxFullDepth)
-{
-    uint32_t numPartitions = 1 << (maxFullDepth * 2);
-
-    for (uint32_t i = 0; i < numPartitions; i++)
-        g_rasterToZscan[g_zscanToRaster[i]] = i;
-}
-
 const int16_t g_lumaFilter[4][NTAPS_LUMA] =
 {
     {  0, 0,   0, 64,  0,   0, 0,  0 },

x265_2.0.tar.gz/source/common/constants.h -> x265_2.1.tar.gz/source/common/constants.h Changed

@@ -32,9 +32,6 @@
 
 extern int g_ctuSizeConfigured;
 
-void initZscanToRaster(uint32_t maxFullDepth, uint32_t depth, uint32_t startVal, uint32_t*& curIdx);
-void initRasterToZscan(uint32_t maxFullDepth);
-
 extern double x265_lambda_tab[QP_MAX_MAX + 1];
 extern double x265_lambda2_tab[QP_MAX_MAX + 1];
 extern const uint16_t x265_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET + 1];
@@ -46,8 +43,8 @@
 extern const uint8_t g_chroma422IntraAngleMappingTable[AngleMapping422TableSize];
 
 // flexible conversion from relative to absolute index
-extern uint32_t g_zscanToRaster[MAX_NUM_PARTITIONS];
-extern uint32_t g_rasterToZscan[MAX_NUM_PARTITIONS];
+extern const uint32_t g_zscanToRaster[MAX_NUM_PARTITIONS];
+extern const uint32_t g_rasterToZscan[MAX_NUM_PARTITIONS];
 
 // conversion of partition index to picture pel position
 extern const uint8_t g_zscanToPelX[MAX_NUM_PARTITIONS];
@@ -59,6 +56,7 @@
 extern uint32_t g_maxCUSize;
 extern uint32_t g_maxCUDepth;
 extern uint32_t g_unitSizeDepth; // Depth at which 4x4 unit occurs from max CU size
+extern uint32_t g_maxSlices; // number of Slices
 
 extern const int16_t g_t4[4][4];
 extern const int16_t g_t8[8][8];
@@ -84,8 +82,7 @@
 extern const uint16_t* const g_scanOrder[NUM_SCAN_TYPE][NUM_SCAN_SIZE];
 extern const uint16_t* const g_scanOrderCG[NUM_SCAN_TYPE][NUM_SCAN_SIZE];
 extern const uint16_t g_scan8x8diag[8 * 8];
-extern const uint16_t g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4];  // +1 for safe buffer area for codeCoeffNxN assembly optimize, there have up to 15 bytes beyond bound read
-
+ALIGN_VAR_16(extern const uint16_t, g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4]);  // +1 for safe buffer area for codeCoeffNxN assembly optimize, there have up to 15 bytes beyond bound read
 extern const uint8_t g_lastCoeffTable[32];
 extern const uint8_t g_goRiceRange[5]; // maximum value coded with Rice codes

x265_2.0.tar.gz/source/common/cudata.cpp -> x265_2.1.tar.gz/source/common/cudata.cpp Changed

@@ -58,51 +58,46 @@
 // file private namespace
 
 /* Check whether 2 addresses point to the same column */
-inline bool isEqualCol(int addrA, int addrB, int numUnits)
+inline bool isEqualCol(int addrA, int addrB)
 {
-    // addrA % numUnits == addrB % numUnits
-    return ((addrA ^ addrB) &  (numUnits - 1)) == 0;
+    return ((addrA ^ addrB) & (RASTER_SIZE - 1)) == 0;
 }
 
 /* Check whether 2 addresses point to the same row */
-inline bool isEqualRow(int addrA, int addrB, int numUnits)
+inline bool isEqualRow(int addrA, int addrB)
 {
-    // addrA / numUnits == addrB / numUnits
-    return ((addrA ^ addrB) & ~(numUnits - 1)) == 0;
+    return ((addrA ^ addrB) < RASTER_SIZE);
 }
 
 /* Check whether 2 addresses point to the same row or column */
-inline bool isEqualRowOrCol(int addrA, int addrB, int numUnits)
+inline bool isEqualRowOrCol(int addrA, int addrB)
 {
-    return isEqualCol(addrA, addrB, numUnits) | isEqualRow(addrA, addrB, numUnits);
+    return isEqualCol(addrA, addrB) | isEqualRow(addrA, addrB);
 }
 
 /* Check whether one address points to the first column */
-inline bool isZeroCol(int addr, int numUnits)
+inline bool isZeroCol(int addr)
 {
-    // addr % numUnits == 0
-    return (addr & (numUnits - 1)) == 0;
+    return (addr & (RASTER_SIZE - 1)) == 0;
 }
 
 /* Check whether one address points to the first row */
-inline bool isZeroRow(int addr, int numUnits)
+inline bool isZeroRow(int addr)
 {
-    // addr / numUnits == 0
-    return (addr & ~(numUnits - 1)) == 0;
+    return (addr < RASTER_SIZE);
 }
 
 /* Check whether one address points to a column whose index is smaller than a given value */
-inline bool lessThanCol(int addr, int val, int numUnits)
+inline bool lessThanCol(int addr, int val)
 {
-    // addr % numUnits < val
-    return (addr & (numUnits - 1)) < val;
+    return (addr & (RASTER_SIZE - 1)) < val;
 }
 
 /* Check whether one address points to a row whose index is smaller than a given value */
-inline bool lessThanRow(int addr, int val, int numUnits)
+inline bool lessThanRow(int addr, int val)
 {
     // addr / numUnits < val
-    return addr < val * numUnits;
+    return (addr >> LOG2_RASTER_SIZE) < val;
 }
 
 inline MV scaleMv(MV mv, int scale)
@@ -271,7 +266,7 @@
     }
 }
 
-void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp)
+void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp, uint32_t firstRowInSlice, uint32_t lastRowInSlice, uint32_t lastCuInSlice)
 {
     m_encData       = frame.m_encData;
     m_slice         = m_encData->m_slice;
@@ -280,6 +275,9 @@
     m_cuPelY        = (cuAddr / m_slice->m_sps->numCuInWidth) << g_maxLog2CUSize;
     m_absIdxInCTU   = 0;
     m_numPartitions = NUM_4x4_PARTITIONS;
+    m_bFirstRowInSlice = (uint8_t)firstRowInSlice;
+    m_bLastRowInSlice  = (uint8_t)lastRowInSlice;
+    m_bLastCuInSlice   = (uint8_t)lastCuInSlice;
 
     /* sequential memsets */
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
@@ -300,7 +298,7 @@
 
     uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
     m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : NULL;
-    m_cuAbove = (m_cuAddr / widthInCU) ? m_encData->getPicCTU(m_cuAddr - widthInCU) : NULL;
+    m_cuAbove = (m_cuAddr >= widthInCU) && !m_bFirstRowInSlice ? m_encData->getPicCTU(m_cuAddr - widthInCU) : NULL;
     m_cuAboveLeft = (m_cuLeft && m_cuAbove) ? m_encData->getPicCTU(m_cuAddr - widthInCU - 1) : NULL;
     m_cuAboveRight = (m_cuAbove && ((m_cuAddr % widthInCU) < (widthInCU - 1))) ? m_encData->getPicCTU(m_cuAddr - widthInCU + 1) : NULL;
 }
@@ -318,6 +316,10 @@
     m_cuAbove       = ctu.m_cuAbove;
     m_cuAboveLeft   = ctu.m_cuAboveLeft;
     m_cuAboveRight  = ctu.m_cuAboveRight;
+    m_bFirstRowInSlice = ctu.m_bFirstRowInSlice;
+    m_bLastRowInSlice = ctu.m_bLastRowInSlice;
+    m_bLastCuInSlice = ctu.m_bLastCuInSlice;
+
     X265_CHECK(m_numPartitions == cuGeom.numPartitions, "initSubCU() size mismatch\n");
 
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
@@ -341,6 +343,9 @@
 
     uint32_t offset = childGeom.numPartitions * subPartIdx;
 
+    m_bFirstRowInSlice = subCU.m_bFirstRowInSlice;
+    m_bLastCuInSlice = subCU.m_bLastCuInSlice;
+
     m_subPartCopy((uint8_t*)m_qp + offset, (uint8_t*)subCU.m_qp);
     m_subPartCopy(m_log2CUSize + offset, subCU.m_log2CUSize);
     m_subPartCopy(m_lumaIntraDir + offset, subCU.m_lumaIntraDir);
@@ -561,11 +566,11 @@
 {
     uint32_t absPartIdx = g_zscanToRaster[curPartUnitIdx];
 
-    if (!isZeroCol(absPartIdx, s_numPartInCUSize))
+    if (!isZeroCol(absPartIdx))
     {
         uint32_t absZorderCUIdx   = g_zscanToRaster[m_absIdxInCTU];
         lPartUnitIdx = g_rasterToZscan[absPartIdx - 1];
-        if (isEqualCol(absPartIdx, absZorderCUIdx, s_numPartInCUSize))
+        if (isEqualCol(absPartIdx, absZorderCUIdx))
             return m_encData->getPicCTU(m_cuAddr);
         else
         {
@@ -582,18 +587,18 @@
 {
     uint32_t absPartIdx = g_zscanToRaster[curPartUnitIdx];
 
-    if (!isZeroRow(absPartIdx, s_numPartInCUSize))
+    if (!isZeroRow(absPartIdx))
     {
         uint32_t absZorderCUIdx = g_zscanToRaster[m_absIdxInCTU];
-        aPartUnitIdx = g_rasterToZscan[absPartIdx - s_numPartInCUSize];
-        if (isEqualRow(absPartIdx, absZorderCUIdx, s_numPartInCUSize))
+        aPartUnitIdx = g_rasterToZscan[absPartIdx - RASTER_SIZE];
+        if (isEqualRow(absPartIdx, absZorderCUIdx))
             return m_encData->getPicCTU(m_cuAddr);
         else
             aPartUnitIdx -= m_absIdxInCTU;
         return this;
     }
 
-    aPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_4x4_PARTITIONS - s_numPartInCUSize];
+    aPartUnitIdx = g_rasterToZscan[absPartIdx + ((s_numPartInCUSize - 1) << LOG2_RASTER_SIZE)];
     return m_cuAbove;
 }
 
@@ -601,13 +606,13 @@
 {
     uint32_t absPartIdx = g_zscanToRaster[curPartUnitIdx];
 
-    if (!isZeroCol(absPartIdx, s_numPartInCUSize))
+    if (!isZeroCol(absPartIdx))
     {
-        if (!isZeroRow(absPartIdx, s_numPartInCUSize))
+        if (!isZeroRow(absPartIdx))
         {
             uint32_t absZorderCUIdx  = g_zscanToRaster[m_absIdxInCTU];
-            alPartUnitIdx = g_rasterToZscan[absPartIdx - s_numPartInCUSize - 1];
-            if (isEqualRowOrCol(absPartIdx, absZorderCUIdx, s_numPartInCUSize))
+            alPartUnitIdx = g_rasterToZscan[absPartIdx - RASTER_SIZE - 1];
+            if (isEqualRowOrCol(absPartIdx, absZorderCUIdx))
                 return m_encData->getPicCTU(m_cuAddr);
             else
             {
@@ -615,17 +620,17 @@
                 return this;
             }
         }
-        alPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_4x4_PARTITIONS - s_numPartInCUSize - 1];
+        alPartUnitIdx = g_rasterToZscan[absPartIdx + ((s_numPartInCUSize - 1) << LOG2_RASTER_SIZE) - 1];
         return m_cuAbove;
     }
 
-    if (!isZeroRow(absPartIdx, s_numPartInCUSize))
+    if (!isZeroRow(absPartIdx))
     {
-        alPartUnitIdx = g_rasterToZscan[absPartIdx - 1];
+        alPartUnitIdx = g_rasterToZscan[absPartIdx - RASTER_SIZE + s_numPartInCUSize - 1];
         return m_cuLeft;
     }
 
-    alPartUnitIdx = g_rasterToZscan[NUM_4x4_PARTITIONS - 1];
+    alPartUnitIdx = NUM_4x4_PARTITIONS - 1;
     return m_cuAboveLeft;
 }
 
@@ -636,15 +641,15 @@
 
     uint32_t absPartIdxRT = g_zscanToRaster[curPartUnitIdx];
 
-    if (lessThanCol(absPartIdxRT, s_numPartInCUSize - 1, s_numPartInCUSize))
+    if (lessThanCol(absPartIdxRT, s_numPartInCUSize - 1))
     {
-        if (!isZeroRow(absPartIdxRT, s_numPartInCUSize))

x265_2.0.tar.gz/source/common/cudata.h -> x265_2.1.tar.gz/source/common/cudata.h Changed

@@ -180,6 +180,11 @@
     uint32_t      m_hChromaShift;
     uint32_t      m_vChromaShift;
 
+    /* multi-slice information */
+    uint8_t      m_bFirstRowInSlice;
+    uint8_t      m_bLastRowInSlice;
+    uint8_t      m_bLastCuInSlice;
+
     /* Per-part data, stored contiguously */
     int8_t*       m_qp;               // array of QP values
     uint8_t*      m_log2CUSize;       // array of cu log2Size TODO: seems redundant to depth
@@ -214,7 +219,7 @@
     void     initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance);
     static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]);
 
-    void     initCTU(const Frame& frame, uint32_t cuAddr, int qp);
+    void     initCTU(const Frame& frame, uint32_t cuAddr, int qp, uint32_t firstRowInSlice, uint32_t lastRowInSlice, uint32_t lastCUInSlice);
     void     initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp);
     void     initLosslessCU(const CUData& cu, const CUGeom& cuGeom);

x265_2.0.tar.gz/source/common/deblock.cpp -> x265_2.1.tar.gz/source/common/deblock.cpp Changed

@@ -90,7 +90,7 @@
     uint32_t numUnits = 1 << (cuGeom.log2CUSize - LOG2_UNIT_SIZE);
     setEdgefilterPU(cu, absPartIdx, dir, blockStrength, numUnits);
     setEdgefilterTU(cu, absPartIdx, 0, dir, blockStrength);
-    setEdgefilterMultiple(cu, absPartIdx, dir, 0, bsCuEdge(cu, absPartIdx, dir), blockStrength, numUnits);
+    setEdgefilterMultiple(absPartIdx, dir, 0, bsCuEdge(cu, absPartIdx, dir), blockStrength, numUnits);
 
     uint32_t numParts = cuGeom.numPartitions;
     for (uint32_t partIdx = absPartIdx; partIdx < absPartIdx + numParts; partIdx++)
@@ -114,22 +114,20 @@
     }
 }
 
-static inline uint32_t calcBsIdx(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx)
+static inline uint32_t calcBsIdx(uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, int32_t baseUnitIdx)
 {
-    uint32_t numUnits = cu->m_slice->m_sps->numPartInCUSize;
-
     if (dir)
-        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + edgeIdx * numUnits + baseUnitIdx];
+        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + (edgeIdx << LOG2_RASTER_SIZE) + baseUnitIdx];
     else
-        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * numUnits + edgeIdx];
+        return g_rasterToZscan[g_zscanToRaster[absPartIdx] + (baseUnitIdx << LOG2_RASTER_SIZE) + edgeIdx];
 }
 
-void Deblock::setEdgefilterMultiple(const CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits)
+void Deblock::setEdgefilterMultiple(uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits)
 {
     X265_CHECK(numUnits > 0, "numUnits edge filter check\n");
     for (uint32_t i = 0; i < numUnits; i++)
     {
-        const uint32_t bsidx = calcBsIdx(cu, scanIdx, dir, edgeIdx, i);
+        const uint32_t bsidx = calcBsIdx(scanIdx, dir, edgeIdx, i);
         blockStrength[bsidx] = value;
     }
 }
@@ -145,8 +143,8 @@
         return;
     }
 
-    uint32_t numUnits  = 1 << (log2TrSize - LOG2_UNIT_SIZE);
-    setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockStrength, numUnits);
+    uint32_t numUnits = 1 << (log2TrSize - LOG2_UNIT_SIZE);
+    setEdgefilterMultiple(absPartIdx, dir, 0, 2, blockStrength, numUnits);
 }
 
 void Deblock::setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits)
@@ -158,30 +156,30 @@
     {
     case SIZE_2NxN:
         if (EDGE_HOR == dir)
-            setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
+            setEdgefilterMultiple(absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
         break;
     case SIZE_Nx2N:
         if (EDGE_VER == dir)
-            setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
+            setEdgefilterMultiple(absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
         break;
     case SIZE_NxN:
-        setEdgefilterMultiple(cu, absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
+        setEdgefilterMultiple(absPartIdx, dir, hNumUnits, 1, blockStrength, numUnits);
         break;
     case SIZE_2NxnU:
         if (EDGE_HOR == dir)
-            setEdgefilterMultiple(cu, absPartIdx, dir, qNumUnits, 1, blockStrength, numUnits);
+            setEdgefilterMultiple(absPartIdx, dir, qNumUnits, 1, blockStrength, numUnits);
         break;
     case SIZE_nLx2N:
         if (EDGE_VER == dir)
-            setEdgefilterMultiple(cu, absPartIdx, dir, qNumUnits, 1, blockStrength, numUnits);
+            setEdgefilterMultiple(absPartIdx, dir, qNumUnits, 1, blockStrength, numUnits);
         break;
     case SIZE_2NxnD:
         if (EDGE_HOR == dir)
-            setEdgefilterMultiple(cu, absPartIdx, dir, numUnits - qNumUnits, 1, blockStrength, numUnits);
+            setEdgefilterMultiple(absPartIdx, dir, numUnits - qNumUnits, 1, blockStrength, numUnits);
         break;
     case SIZE_nRx2N:
         if (EDGE_VER == dir)
-            setEdgefilterMultiple(cu, absPartIdx, dir, numUnits - qNumUnits, 1, blockStrength, numUnits);
+            setEdgefilterMultiple(absPartIdx, dir, numUnits - qNumUnits, 1, blockStrength, numUnits);
         break;
 
     case SIZE_2Nx2N:
@@ -350,7 +348,7 @@
     uint32_t numUnits = cuQ->m_slice->m_sps->numPartInCUSize >> depth;
     for (uint32_t idx = 0; idx < numUnits; idx++)
     {
-        uint32_t partQ = calcBsIdx(cuQ, absPartIdx, dir, edge, idx);
+        uint32_t partQ = calcBsIdx(absPartIdx, dir, edge, idx);
         uint32_t bs = blockStrength[partQ];
 
         if (!bs)
@@ -461,7 +459,7 @@
     uint32_t numUnits = cuQ->m_slice->m_sps->numPartInCUSize >> (depth + chromaShift);
     for (uint32_t idx = 0; idx < numUnits; idx++)
     {
-        uint32_t partQ = calcBsIdx(cuQ, absPartIdx, dir, edge, idx << chromaShift);
+        uint32_t partQ = calcBsIdx(absPartIdx, dir, edge, idx << chromaShift);
         uint32_t bs = blockStrength[partQ];
 
         if (bs <= 1)

x265_2.0.tar.gz/source/common/deblock.h -> x265_2.1.tar.gz/source/common/deblock.h Changed

x265_2.0.tar.gz/source/common/frame.cpp -> x265_2.1.tar.gz/source/common/frame.cpp Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Author: Steve Borho <steve@borho.org>
+*         Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -32,7 +33,7 @@
 {
     m_bChromaExtended = false;
     m_lowresInit = false;
-    m_reconRowCount.set(0);
+    m_reconRowFlag = NULL;
     m_reconColCount = NULL;
     m_countRefEncoders = 0;
     m_encData = NULL;
@@ -41,6 +42,8 @@
     m_next = NULL;
     m_prev = NULL;
     m_param = NULL;
+    m_userSEI.numPayloads = 0;
+    m_userSEI.payloads = NULL;
     memset(&m_lowres, 0, sizeof(m_lowres));
     m_rcData = NULL;
 }
@@ -52,15 +55,20 @@
     CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1);
 
     if (m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
-        m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode))
+        m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode, param->rc.qgSize))
     {
         X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
         m_numRows = (m_fencPic->m_picHeight + g_maxCUSize - 1)  / g_maxCUSize;
+        m_reconRowFlag = new ThreadSafeInteger[m_numRows];
         m_reconColCount = new ThreadSafeInteger[m_numRows];
 
         if (quantOffsets)
         {
-            int32_t cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
+            int32_t cuCount;
+            if (param->rc.qgSize == 8)
+                cuCount = m_lowres.maxBlocksInRowFullRes * m_lowres.maxBlocksInColFullRes;
+            else
+                cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
             m_quantOffsets = new float[cuCount];
         }
         return true;
@@ -132,6 +140,12 @@
         m_reconPic = NULL;
     }
 
+    if (m_reconRowFlag)
+    {
+        delete[] m_reconRowFlag;
+        m_reconRowFlag = NULL;
+    }
+
     if (m_reconColCount)
     {
         delete[] m_reconColCount;
@@ -143,6 +157,13 @@
         delete[] m_quantOffsets;
     }
 
+    if (m_userSEI.numPayloads)
+    {
+        for (int i = 0; i < m_userSEI.numPayloads; i++)
+            delete[] m_userSEI.payloads[i].payload;
+        delete[] m_userSEI.payloads;
+    }
+
     m_lowres.destroy();
     X265_FREE(m_rcData);
 }

x265_2.0.tar.gz/source/common/frame.h -> x265_2.1.tar.gz/source/common/frame.h Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Author: Steve Borho <steve@borho.org>
+*         Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -49,6 +50,9 @@
     double   pCuCount;
     double   skipCuCount;
     double   qScale;
+    double   cumulativePQp;
+    double   cumulativePNorm;
+    double   lastQScaleFor[3];
     int      mvBits;
     int      miscBits;
     int      coeffBits;
@@ -82,9 +86,10 @@
     bool                   m_bChromaExtended;    // orig chroma planes motion extended for weight analysis
 
     float*                 m_quantOffsets;       // points to quantOffsets in x265_picture
+    x265_sei               m_userSEI;
 
     /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
-    ThreadSafeInteger      m_reconRowCount;      // count of CTU rows completely reconstructed and extended for motion reference
+    ThreadSafeInteger*     m_reconRowFlag;       // flag of CTU rows completely reconstructed and extended for motion reference
     ThreadSafeInteger*     m_reconColCount;      // count of CTU cols completely reconstructed and extended for motion reference
     int32_t                m_numRows;
     volatile uint32_t      m_countRefEncoders;   // count of FrameEncoder threads monitoring m_reconRowCount

x265_2.0.tar.gz/source/common/lowres.cpp -> x265_2.1.tar.gz/source/common/lowres.cpp Changed

@@ -27,7 +27,7 @@
 
 using namespace X265_NS;
 
-bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled)
+bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled, uint32_t qgSize)
 {
     isLowres = true;
     bframes = _bframes;
@@ -38,7 +38,14 @@
         lumaStride += 32 - (lumaStride & 31);
     maxBlocksInRow = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     maxBlocksInCol = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    maxBlocksInRowFullRes = maxBlocksInRow * 2;
+    maxBlocksInColFullRes = maxBlocksInCol * 2;
     int cuCount = maxBlocksInRow * maxBlocksInCol;
+    int cuCountFullRes;
+    if (qgSize == 8)
+        cuCountFullRes = maxBlocksInRowFullRes * maxBlocksInColFullRes;
+    else
+        cuCountFullRes = cuCount;
 
     /* rounding the width to multiple of lowres CU size */
     width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
@@ -46,13 +53,14 @@
 
     size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
     size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
-
     if (bAQEnabled)
     {
-        CHECKED_MALLOC(qpAqOffset, double, cuCount);
-        CHECKED_MALLOC(invQscaleFactor, int, cuCount);
-        CHECKED_MALLOC(qpCuTreeOffset, double, cuCount);
-        CHECKED_MALLOC(blockVariance, uint32_t, cuCount);
+        CHECKED_MALLOC_ZERO(qpAqOffset, double, cuCountFullRes);
+        CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes);
+        CHECKED_MALLOC_ZERO(qpCuTreeOffset, double, cuCountFullRes);
+        CHECKED_MALLOC_ZERO(blockVariance, uint32_t, cuCountFullRes);
+        if (qgSize == 8)
+            CHECKED_MALLOC_ZERO(invQscaleFactor8x8, int, cuCount);
     }
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
 
@@ -122,6 +130,7 @@
     X265_FREE(qpCuTreeOffset);
     X265_FREE(propagateCost);
     X265_FREE(blockVariance);
+    X265_FREE(invQscaleFactor8x8);
 }
 
 // (re) initialize lowres state

x265_2.0.tar.gz/source/common/lowres.h -> x265_2.1.tar.gz/source/common/lowres.h Changed

@@ -132,6 +132,8 @@
     MV*       lowresMvs[2][X265_BFRAME_MAX + 1];
     uint32_t  maxBlocksInRow;
     uint32_t  maxBlocksInCol;
+    uint32_t  maxBlocksInRowFullRes;
+    uint32_t  maxBlocksInColFullRes;
 
     /* used for vbvLookahead */
     int       plannedType[X265_LOOKAHEAD_MAX + 1];
@@ -143,6 +145,7 @@
     double*   qpAqOffset;      // AQ QP offset values for each 16x16 CU
     double*   qpCuTreeOffset;  // cuTree QP offset values for each 16x16 CU
     int*      invQscaleFactor; // qScale values for qp Aq Offsets
+    int*      invQscaleFactor8x8; // temporary buffer for qg-size 8
     uint32_t* blockVariance;
     uint64_t  wp_ssd[3];       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
     uint64_t  wp_sum[3];
@@ -153,7 +156,7 @@
     double    weightedCostDelta[X265_BFRAME_MAX + 2];
     ReferencePlanes weightedRef[X265_BFRAME_MAX + 2];
 
-    bool create(PicYuv *origPic, int _bframes, bool bAqEnabled);
+    bool create(PicYuv *origPic, int _bframes, bool bAqEnabled, uint32_t qgSize);
     void destroy();
     void init(PicYuv *origPic, int poc);
 };

x265_2.0.tar.gz/source/common/param.cpp -> x265_2.1.tar.gz/source/common/param.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
+ *          Min Chen <min.chen@multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -174,6 +175,7 @@
     param->bEnableTSkipFast = 0;
     param->maxNumReferences = 3;
     param->bEnableTemporalMvp = 1;
+    param->bSourceReferenceEstimation = 0;
 
     /* Loop Filter */
     param->bEnableLoopFilter = 1;
@@ -224,6 +226,10 @@
     param->rc.bEnableSlowFirstPass = 1;
     param->rc.bStrictCbr = 0;
     param->rc.bEnableGrain = 0;
+    param->rc.qpMin = 0;
+    param->rc.qpMax = QP_MAX_MAX;
+
+    param->bDiscardOptionalVUI = 0;
 
     /* Video Usability Information (VUI) */
     param->vui.aspectRatioIdc = 0;
@@ -249,6 +255,9 @@
     param->maxFALL = 0;
     param->minLuma = 0;
     param->maxLuma = PIXEL_MAX;
+    param->log2MaxPocLsb = 8;
+    param->bDiscardSEI = false;
+    param->maxSlices = 1;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -509,6 +518,7 @@
     bool bError = false;
     bool bNameWasBool = false;
     bool bValueWasNull = !value;
+    bool bExtraParams = false;
     char nameBuf[64];
 
     if (!name)
@@ -747,6 +757,7 @@
     OPT("vbv-init")    p->rc.vbvBufferInit = atof(value);
     OPT("crf-max")     p->rc.rfConstantMax = atof(value);
     OPT("crf-min")     p->rc.rfConstantMin = atof(value);
+    OPT("qpmax")       p->rc.qpMax = atoi(value);
     OPT("crf")
     {
         p->rc.rfConstant = atof(value);
@@ -885,7 +896,29 @@
     OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
     OPT("uhd-bd") p->uhdBluray = atobool(value);
     else
-        return X265_PARAM_BAD_NAME;
+        bExtraParams = true;
+
+    // solve "fatal error C1061: compiler limit : blocks nested too deeply"
+    if (bExtraParams)
+    {
+        bExtraParams = false;
+        if (0) ;
+        OPT("slices") p->maxSlices = atoi(value);
+        else
+            bExtraParams = true;
+    }
+
+    if (bExtraParams)
+    {
+        if (0) ;
+        OPT("qpmin") p->rc.qpMin = atoi(value);
+        OPT("analyze-src-pics") p->bSourceReferenceEstimation = atobool(value);
+        OPT("log2-max-poc-lsb") p->log2MaxPocLsb = atoi(value);
+        OPT("discard-sei") p->bDiscardSEI = atobool(value);
+        OPT("discard-vui") p->bDiscardOptionalVUI = atobool(value);
+        else
+            return X265_PARAM_BAD_NAME;
+    }
 #undef OPT
 #undef atobool
 #undef atoi
@@ -1041,6 +1074,8 @@
     uint32_t tuQTMaxLog2Size = X265_MIN(maxLog2CUSize, 5);
     uint32_t tuQTMinLog2Size = 2; //log2(4)
 
+    CHECK((param->maxSlices > 1) && !param->bEnableWavefront,
+        "Multiple-Slices mode must be enable Wavefront Parallel Processing (--wpp)");
     CHECK(param->internalBitDepth != X265_DEPTH,
           "internalBitDepth must match compiled bit depth");
     CHECK(param->minCUSize != 64 && param->minCUSize != 32 && param->minCUSize != 16 && param->minCUSize != 8,
@@ -1208,6 +1243,14 @@
           "Strict-cbr cannot be applied without specifying target bitrate or vbv bufsize");
     CHECK(param->analysisMode && (param->analysisMode < X265_ANALYSIS_OFF || param->analysisMode > X265_ANALYSIS_LOAD),
         "Invalid analysis mode. Analysis mode 0: OFF 1: SAVE : 2 LOAD");
+    CHECK(param->rc.qpMax < QP_MIN || param->rc.qpMax > QP_MAX_MAX,
+        "qpmax exceeds supported range (0 to 69)");
+    CHECK(param->rc.qpMin < QP_MIN || param->rc.qpMin > QP_MAX_MAX,
+        "qpmin exceeds supported range (0 to 69)");
+    CHECK(param->log2MaxPocLsb < 4,
+        "maximum of the picture order count can not be less than 4");
+    CHECK(1 > param->maxSlices || param->maxSlices > ((param->sourceHeight + param->maxCUSize - 1) / param->maxCUSize),
+        "The slices can not be more than number of rows");
     return check_failed;
 }
 
@@ -1258,12 +1301,9 @@
         // compute actual CU depth with respect to config depth and max transform size
         g_maxCUDepth    = maxLog2CUSize - minLog2CUSize;
         g_unitSizeDepth = maxLog2CUSize - LOG2_UNIT_SIZE;
-
-        // initialize partition order
-        uint32_t* tmp = &g_zscanToRaster[0];
-        initZscanToRaster(g_unitSizeDepth, 1, 0, tmp);
-        initRasterToZscan(g_unitSizeDepth);
     }
+
+    g_maxSlices = param->maxSlices;
     return 0;
 }
 
@@ -1363,6 +1403,8 @@
     TOOLOPT(param->bEnableFastIntra, "fast-intra");
     TOOLOPT(param->bEnableStrongIntraSmoothing, "strong-intra-smoothing");
     TOOLVAL(param->lookaheadSlices, "lslices=%d");
+    if (param->maxSlices > 1)
+        TOOLVAL(param->maxSlices, "slices=%d");
     if (param->bEnableLoopFilter)
     {
         if (param->deblockingFilterBetaOffset || param->deblockingFilterTCOffset)
@@ -1443,6 +1485,7 @@
     s += sprintf(s, " psy-rd=%.2f", p->psyRd);
     s += sprintf(s, " rdoq-level=%d", p->rdoqLevel);
     s += sprintf(s, " psy-rdoq=%.2f", p->psyRdoq);
+    s += sprintf(s, " log2-max-poc-lsb=%d", p->log2MaxPocLsb);
     BOOL(p->bEnableRdRefine, "rd-refine");
     BOOL(p->bEnableSignHiding, "signhide");
     BOOL(p->bEnableLoopFilter, "deblock");
@@ -1463,7 +1506,7 @@
         else
             s += sprintf(s, " bitrate=%d", p->rc.bitrate);
         s += sprintf(s, " qcomp=%.2f qpmin=%d qpmax=%d qpstep=%d",
-                     p->rc.qCompress, QP_MIN, QP_MAX_SPEC, p->rc.qpStep);
+                     p->rc.qCompress, p->rc.qpMin, p->rc.qpMax, p->rc.qpStep);
         if (p->rc.bStatRead)
             s += sprintf( s, " cplxblur=%.1f qblur=%.1f",
                           p->rc.complexityBlur, p->rc.qblur);

x265_2.0.tar.gz/source/common/pixel.cpp -> x265_2.1.tar.gz/source/common/pixel.cpp Changed

@@ -848,14 +848,13 @@
                                     const int32_t* invQscales, const double* fpsFactor, int len)
 {
     double fps = *fpsFactor / 256;  // range[0.01, 1.00]
-
     for (int i = 0; i < len; i++)
     {
         int intraCost = intraCosts[i];
         int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
-        double propagateIntra  = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
+        double propagateIntra = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
         double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
-        double propagateNum    = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
+        double propagateNum = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
 
 #if 0
         // algorithm that output match to asm
@@ -866,10 +865,11 @@
         float propagateDenom = intraRcpError2 - intraRcpError1;
         dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
 #else
-        double propagateDenom  = (double)intraCost;             // Q32
+        double propagateDenom = (double)intraCost;             // Q32
         dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
 #endif
-    }
+        }
+    //}
 }
 
 /* Conversion between double and Q8.8 fixed point (big-endian) for storage */

x265_2.0.tar.gz/source/common/predict.cpp -> x265_2.1.tar.gz/source/common/predict.cpp Changed

@@ -671,17 +671,14 @@
     int numIntraNeighbor;
     bool* bNeighborFlags = intraNeighbors->bNeighborFlags;
 
-    uint32_t numPartInWidth = 1 << (cu.m_log2CUSize[0] - LOG2_UNIT_SIZE - tuDepth);
-    uint32_t partIdxLT = cu.m_absIdxInCTU + absPartIdx;
-    uint32_t partIdxRT = g_rasterToZscan[g_zscanToRaster[partIdxLT] + numPartInWidth - 1];
-
     uint32_t tuSize = 1 << log2TrSize;
     int  tuWidthInUnits = tuSize >> log2UnitWidth;
     int  tuHeightInUnits = tuSize >> log2UnitHeight;
     int  aboveUnits = tuWidthInUnits << 1;
     int  leftUnits = tuHeightInUnits << 1;
-    int  partIdxStride = cu.m_slice->m_sps->numPartInCUSize;
-    uint32_t partIdxLB = g_rasterToZscan[g_zscanToRaster[partIdxLT] + ((tuHeightInUnits - 1) * partIdxStride)];
+    uint32_t partIdxLT = cu.m_absIdxInCTU + absPartIdx;
+    uint32_t partIdxRT = g_rasterToZscan[g_zscanToRaster[partIdxLT] + tuWidthInUnits - 1];
+    uint32_t partIdxLB = g_rasterToZscan[g_zscanToRaster[partIdxLT] + ((tuHeightInUnits - 1) << LOG2_RASTER_SIZE)];
 
     if (cu.m_slice->isIntra() || !cu.m_slice->m_pps->bConstrainedIntraPred)
     {
@@ -910,7 +907,7 @@
 {
     const uint32_t rasterPartBegin = g_zscanToRaster[partIdxLT];
     const uint32_t rasterPartEnd = g_zscanToRaster[partIdxLB];
-    const uint32_t idxStep = cu.m_slice->m_sps->numPartInCUSize;
+    const uint32_t idxStep = RASTER_SIZE;
     int numIntra = 0;
 
     for (uint32_t rasterPart = rasterPartBegin; rasterPart <= rasterPartEnd; rasterPart += idxStep, bValidFlags--) // opposite direction

x265_2.0.tar.gz/source/common/slice.h -> x265_2.1.tar.gz/source/common/slice.h Changed

x265_2.0.tar.gz/source/common/threadpool.cpp -> x265_2.1.tar.gz/source/common/threadpool.cpp Changed

x265_2.0.tar.gz/source/common/x86/dct8.asm -> x265_2.1.tar.gz/source/common/x86/dct8.asm Changed

x265_2.0.tar.gz/source/encoder/analysis.cpp -> x265_2.1.tar.gz/source/encoder/analysis.cpp Changed

@@ -255,7 +255,7 @@
             cuPrevCost = origCUCost;
 
             int modCUQP = qp + dir;
-            while (modCUQP >= QP_MIN && modCUQP <= QP_MAX_SPEC)
+            while (modCUQP >= m_param->rc.qpMin && modCUQP <= QP_MAX_SPEC)
             {
                 recodeCU(parentCTU, cuGeom, modCUQP, qp);
                 cuCost = md.bestMode->rdCost;
@@ -1731,19 +1731,19 @@
                     ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
                 }
             }
-            if ((md.bestMode->cu.isInter(0) && !(md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)) && (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400))
-            {
-                uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
+        }
 
-                for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
-                {
-                    PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
-                    motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, m_csp != X265_CSP_I400);
-                }
-                encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
+        if ((md.bestMode->cu.isInter(0) && !(md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)) && (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400))
+        {
+            uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
+
+            for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
+            {
+                PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
+                motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, m_csp != X265_CSP_I400);
             }
+            encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
         }
-
         if (m_bTryLossless)
             tryLossless(cuGeom);
 
@@ -1936,10 +1936,26 @@
     }
     for (uint32_t i = 0; i < numMergeCand; ++i)
     {
-        if (m_bFrameParallel &&
-            (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
-            candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4))
-            continue;
+        if (m_bFrameParallel)
+        {
+            // Parallel slices bound check
+            if (m_param->maxSlices > 1)
+            {
+                // NOTE: First row in slice can't negative
+                if ((candMvField[i][0].mv.y < m_sliceMinY) | (candMvField[i][1].mv.y < m_sliceMinY))
+                    continue;
+
+                // Last row in slice can't reference beyond bound since it is another slice area
+                // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
+                if ((candMvField[i][0].mv.y > m_sliceMaxY) | (candMvField[i][1].mv.y > m_sliceMaxY))
+                    continue;
+            }
+
+            if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
+                candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
+                continue;
+        }
+
         if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
             tempPred->cu.m_cuPelX / g_maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
             candMvField[i][0].mv.x > maxSafeMv)
@@ -2050,10 +2066,25 @@
     }
     for (uint32_t i = 0; i < numMergeCand; i++)
     {
-        if (m_bFrameParallel &&
-            (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
-            candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4))
-            continue;
+        if (m_bFrameParallel)
+        {
+            // Parallel slices bound check
+            if (m_param->maxSlices > 1)
+            {
+                // NOTE: First row in slice can't negative
+                if ((candMvField[i][0].mv.y < m_sliceMinY) | (candMvField[i][1].mv.y < m_sliceMinY))
+                    continue;
+
+                // Last row in slice can't reference beyond bound since it is another slice area
+                // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
+                if ((candMvField[i][0].mv.y > m_sliceMaxY) | (candMvField[i][1].mv.y > m_sliceMaxY))
+                    continue;
+            }
+
+            if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
+                candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
+                continue;
+        }
 
         /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
         if (candDir[i] == 1 && !candMvField[i][0].mv.word && !candMvField[i][0].refIdx)
@@ -2637,7 +2668,11 @@
 {
     FrameData& curEncData = *m_frame->m_encData;
     double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
-
+    int loopIncr;
+    if (m_param->rc.qgSize == 8)
+        loopIncr = 8;
+    else
+        loopIncr = 16;
     /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
     bool isReferenced = IS_REFERENCED(m_frame);
     double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
@@ -2647,17 +2682,17 @@
         uint32_t height = m_frame->m_fencPic->m_picHeight;
         uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
         uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
-        uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
+        uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
         uint32_t blockSize = g_maxCUSize >> cuGeom.depth;
         double qp_offset = 0;
         uint32_t cnt = 0;
         uint32_t idx;
 
-        for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += 16)
+        for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
         {
-            for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += 16)
+            for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
             {
-                idx = ((block_yy / 16) * (maxCols)) + (block_xx / 16);
+                idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
                 qp_offset += qpoffs[idx];
                 cnt++;
             }
@@ -2667,5 +2702,5 @@
         qp += qp_offset;
     }
 
-    return x265_clip3(QP_MIN, QP_MAX_MAX, (int)(qp + 0.5));
+    return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)(qp + 0.5));
 }

x265_2.0.tar.gz/source/encoder/analysis.h -> x265_2.1.tar.gz/source/encoder/analysis.h Changed

x265_2.0.tar.gz/source/encoder/api.cpp -> x265_2.1.tar.gz/source/encoder/api.cpp Changed

x265_2.0.tar.gz/source/encoder/dpb.cpp -> x265_2.1.tar.gz/source/encoder/dpb.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -71,14 +72,18 @@
         iterFrame = iterFrame->m_next;
         if (!curFrame->m_encData->m_bHasReferences && !curFrame->m_countRefEncoders)
         {
-            curFrame->m_reconRowCount.set(0);
             curFrame->m_bChromaExtended = false;
 
             // Reset column counter
+            X265_CHECK(curFrame->m_reconRowFlag != NULL, "curFrame->m_reconRowFlag check failure");
             X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
             X265_CHECK(curFrame->m_numRows > 0, "curFrame->m_numRows check failure");
-            for(int32_t col = 0; col < curFrame->m_numRows; col++)
-                curFrame->m_reconColCount[col].set(0);
+
+            for(int32_t row = 0; row < curFrame->m_numRows; row++)
+            {
+                curFrame->m_reconRowFlag[row].set(0);
+                curFrame->m_reconColCount[row].set(0);
+            }
 
             // iterator is invalidated by remove, restart scan
             m_picList.remove(*curFrame);
@@ -167,7 +172,9 @@
         slice->m_colFromL0Flag = true;
         slice->m_colRefIdx = 0;
     }
-    slice->m_sLFaseFlag = (SLFASE_CONSTANT & (1 << (pocCurr % 31))) > 0;
+
+    // Disable Loopfilter in bound area, because we will do slice-parallelism in future
+    slice->m_sLFaseFlag = (g_maxSlices > 1) ? false : ((SLFASE_CONSTANT & (1 << (pocCurr % 31))) > 0);
 
     /* Increment reference count of all motion-referenced frames to prevent them
      * from being recycled. These counts are decremented at the end of

x265_2.0.tar.gz/source/encoder/encoder.cpp -> x265_2.1.tar.gz/source/encoder/encoder.cpp Changed

@@ -151,6 +151,8 @@
         m_aborted = true;
     }
 
+    x265_log(p, X265_LOG_INFO, "Slices                              : %d\n", p->maxSlices);
+
     char buf[128];
     int len = 0;
     if (p->bEnableWavefront)
@@ -589,10 +591,27 @@
         inFrame->m_pts       = pic_in->pts;
         inFrame->m_forceqp   = pic_in->forceqp;
         inFrame->m_param     = m_reconfigure ? m_latestParam : m_param;
-        
+
+        if (pic_in->userSEI.numPayloads)
+        {
+            int numPayloads = inFrame->m_userSEI.numPayloads = pic_in->userSEI.numPayloads;
+            inFrame->m_userSEI.payloads = new x265_sei_payload[numPayloads];
+            for (int i = 0; i < numPayloads; i++)
+            {
+                int size = inFrame->m_userSEI.payloads[i].payloadSize = pic_in->userSEI.payloads[i].payloadSize;
+                inFrame->m_userSEI.payloads[i].payloadType = pic_in->userSEI.payloads[i].payloadType;
+                inFrame->m_userSEI.payloads[i].payload = new uint8_t[size];
+                memcpy(inFrame->m_userSEI.payloads[i].payload, pic_in->userSEI.payloads[i].payload, size);
+            }
+        }
+
         if (pic_in->quantOffsets != NULL)
         {
-            int cuCount = inFrame->m_lowres.maxBlocksInRow * inFrame->m_lowres.maxBlocksInCol;
+            int cuCount;
+            if (m_param->rc.qgSize == 8)
+                cuCount = inFrame->m_lowres.maxBlocksInRowFullRes * inFrame->m_lowres.maxBlocksInColFullRes;
+            else
+                cuCount = inFrame->m_lowres.maxBlocksInRow * inFrame->m_lowres.maxBlocksInCol;
             memcpy(inFrame->m_quantOffsets, pic_in->quantOffsets, cuCount * sizeof(float));
         }
 
@@ -776,9 +795,8 @@
             if (m_param->rc.bStatWrite)
                 if (m_rateControl->writeRateControlFrameStats(outFrame, &curEncoder->m_rce))
                     m_aborted = true;
-
-            if (pic_out && m_param->rc.bStatWrite)
-            {
+            if (pic_out)
+            { 
                 /* m_rcData is allocated for every frame */
                 pic_out->rcData = outFrame->m_rcData;
                 outFrame->m_rcData->qpaRc = outFrame->m_encData->m_avgQpRc;
@@ -1320,7 +1338,7 @@
     m_analyzeAll.m_maxFALL += curFrame->m_fencPic->m_avgLumaLevel;
     m_analyzeAll.m_maxCLL = X265_MAX(m_analyzeAll.m_maxCLL, curFrame->m_fencPic->m_maxLumaLevel);
 
-    char c = (slice->isIntra() ? 'I' : slice->isInterP() ? 'P' : 'B');
+    char c = (slice->isIntra() ? (curFrame->m_lowres.sliceType == X265_TYPE_IDR ? 'I' : 'i') : slice->isInterP() ? 'P' : 'B');
     int poc = slice->m_poc;
     if (!IS_REFERENCED(curFrame))
         c += 32; // lower case if unreferenced
@@ -1411,7 +1429,7 @@
     list.serialize(NAL_UNIT_SPS, bs);
 
     bs.resetBits();
-    sbacCoder.codePPS(m_pps);
+    sbacCoder.codePPS(m_pps, (m_param->maxSlices <= 1));
     bs.writeByteAlignment();
     list.serialize(NAL_UNIT_PPS, bs);
 
@@ -1440,7 +1458,7 @@
         list.serialize(NAL_UNIT_PREFIX_SEI, bs);
     }
 
-    if (m_param->bEmitInfoSEI)
+    if (!m_param->bDiscardSEI && m_param->bEmitInfoSEI)
     {
         char *opts = x265_param2string(m_param);
         if (opts)
@@ -1456,6 +1474,7 @@
                 
                 bs.resetBits();
                 SEIuserDataUnregistered idsei;
+                idsei.m_payloadType = USER_DATA_UNREGISTERED;
                 idsei.m_userData = (uint8_t*)buffer;
                 idsei.m_userDataLength = (uint32_t)strlen(buffer);
                 idsei.write(bs, m_sps);
@@ -1469,7 +1488,7 @@
         }
     }
 
-    if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
+    if (!m_param->bDiscardSEI && (m_param->bEmitHRDSEI || !!m_param->interlaceMode))
     {
         /* Picture Timing and Buffering Period SEI require the SPS to be "activated" */
         SEIActiveParameterSets sei;
@@ -1524,6 +1543,14 @@
 
     sps->bUseStrongIntraSmoothing = m_param->bEnableStrongIntraSmoothing;
     sps->bTemporalMVPEnabled = m_param->bEnableTemporalMvp;
+    sps->bDiscardOptionalVUI = m_param->bDiscardOptionalVUI;
+    sps->log2MaxPocLsb = m_param->log2MaxPocLsb;
+    int maxDeltaPOC = (m_param->bframes + 2) * (!!m_param->bBPyramid + 1) * 2;
+    while ((1 << sps->log2MaxPocLsb) <= maxDeltaPOC * 2)
+        sps->log2MaxPocLsb++;
+
+    if (sps->log2MaxPocLsb != m_param->log2MaxPocLsb)
+        x265_log(m_param, X265_LOG_WARNING, "Reset log2MaxPocLsb to %d to account for all POC values\n", sps->log2MaxPocLsb);
 
     VUI& vui = sps->vuiParameters;
     vui.aspectRatioInfoPresentFlag = !!m_param->vui.aspectRatioIdc;
@@ -1570,7 +1597,7 @@
     {
         pps->bUseDQP = true;
         pps->maxCuDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
-        X265_CHECK(pps->maxCuDQPDepth <= 2, "max CU DQP depth cannot be greater than 2\n");
+        X265_CHECK(pps->maxCuDQPDepth <= 3, "max CU DQP depth cannot be greater than 3\n");
     }
     else
     {
@@ -1633,7 +1660,7 @@
         double fps = (double)p->fpsNum / p->fpsDenom;
         p->keyframeMin = X265_MIN((int)fps, p->keyframeMax / 10);
     }
-    p->keyframeMin = X265_MAX(1, X265_MIN(p->keyframeMin, p->keyframeMax / 2 + 1));
+    p->keyframeMin = X265_MAX(1, p->keyframeMin);
 
     if (!p->bframes)
         p->bBPyramid = 0;
@@ -1854,10 +1881,10 @@
     bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
     if (!m_param->bLossless && (m_param->rc.aqMode || bIsVbv))
     {
-        if (p->rc.qgSize < X265_MAX(16, p->minCUSize))
+        if (p->rc.qgSize < X265_MAX(8, p->minCUSize))
         {
-            p->rc.qgSize = X265_MAX(16, p->minCUSize);
-            x265_log(p, X265_LOG_WARNING, "QGSize should be greater than or equal to 16 and minCUSize, setting QGSize = %d\n", p->rc.qgSize);
+            p->rc.qgSize = X265_MAX(8, p->minCUSize);
+            x265_log(p, X265_LOG_WARNING, "QGSize should be greater than or equal to 8 and minCUSize, setting QGSize = %d\n", p->rc.qgSize);
         }
         if (p->rc.qgSize > p->maxCUSize)
         {
@@ -1979,6 +2006,13 @@
 
     if (p->csvfn)
         x265_log(p, X265_LOG_WARNING, "libx265 no longer supports CSV file statistics\n");
+
+    if (p->log2MaxPocLsb < 4)
+    {
+        x265_log(p, X265_LOG_WARNING, "maximum of the picture order count can not be less than 4\n");
+        p->log2MaxPocLsb = 4;
+    }
+
 }
 
 void Encoder::allocAnalysis(x265_analysis_data* analysis)

x265_2.0.tar.gz/source/encoder/entropy.cpp -> x265_2.1.tar.gz/source/encoder/entropy.cpp Changed

@@ -285,7 +285,7 @@
 
     WRITE_UVLC(X265_DEPTH - 8,   "bit_depth_luma_minus8");
     WRITE_UVLC(X265_DEPTH - 8,   "bit_depth_chroma_minus8");
-    WRITE_UVLC(BITS_FOR_POC - 4, "log2_max_pic_order_cnt_lsb_minus4");
+    WRITE_UVLC(sps.log2MaxPocLsb - 4, "log2_max_pic_order_cnt_lsb_minus4");
     WRITE_FLAG(true,             "sps_sub_layer_ordering_info_present_flag");
 
     for (uint32_t i = 0; i < sps.maxTempSubLayers; i++)
@@ -319,12 +319,12 @@
     WRITE_FLAG(sps.bUseStrongIntraSmoothing, "sps_strong_intra_smoothing_enable_flag");
 
     WRITE_FLAG(1, "vui_parameters_present_flag");
-    codeVUI(sps.vuiParameters, sps.maxTempSubLayers);
+    codeVUI(sps.vuiParameters, sps.maxTempSubLayers, sps.bDiscardOptionalVUI);
 
     WRITE_FLAG(0, "sps_extension_flag");
 }
 
-void Entropy::codePPS(const PPS& pps)
+void Entropy::codePPS(const PPS& pps, bool filerAcross)
 {
     WRITE_UVLC(0,                          "pps_pic_parameter_set_id");
     WRITE_UVLC(0,                          "pps_seq_parameter_set_id");
@@ -353,7 +353,7 @@
     WRITE_FLAG(pps.bTransquantBypassEnabled,  "transquant_bypass_enable_flag");
     WRITE_FLAG(0,                             "tiles_enabled_flag");
     WRITE_FLAG(pps.bEntropyCodingSyncEnabled, "entropy_coding_sync_enabled_flag");
-    WRITE_FLAG(1,                             "loop_filter_across_slices_enabled_flag");
+    WRITE_FLAG(filerAcross,                   "loop_filter_across_slices_enabled_flag");
 
     WRITE_FLAG(pps.bDeblockingFilterControlPresent, "deblocking_filter_control_present_flag");
     if (pps.bDeblockingFilterControlPresent)
@@ -422,65 +422,75 @@
     }
 }
 
-void Entropy::codeVUI(const VUI& vui, int maxSubTLayers)
+void Entropy::codeVUI(const VUI& vui, int maxSubTLayers, bool bDiscardOptionalVUI)
 {
-    WRITE_FLAG(vui.aspectRatioInfoPresentFlag,  "aspect_ratio_info_present_flag");
+    WRITE_FLAG(vui.aspectRatioInfoPresentFlag, "aspect_ratio_info_present_flag");
     if (vui.aspectRatioInfoPresentFlag)
     {
-        WRITE_CODE(vui.aspectRatioIdc, 8,       "aspect_ratio_idc");
+        WRITE_CODE(vui.aspectRatioIdc, 8, "aspect_ratio_idc");
         if (vui.aspectRatioIdc == 255)
         {
-            WRITE_CODE(vui.sarWidth, 16,        "sar_width");
-            WRITE_CODE(vui.sarHeight, 16,       "sar_height");
+            WRITE_CODE(vui.sarWidth, 16, "sar_width");
+            WRITE_CODE(vui.sarHeight, 16, "sar_height");
         }
     }
 
-    WRITE_FLAG(vui.overscanInfoPresentFlag,     "overscan_info_present_flag");
+    WRITE_FLAG(vui.overscanInfoPresentFlag, "overscan_info_present_flag");
     if (vui.overscanInfoPresentFlag)
         WRITE_FLAG(vui.overscanAppropriateFlag, "overscan_appropriate_flag");
 
-    WRITE_FLAG(vui.videoSignalTypePresentFlag,  "video_signal_type_present_flag");
+    WRITE_FLAG(vui.videoSignalTypePresentFlag, "video_signal_type_present_flag");
     if (vui.videoSignalTypePresentFlag)
     {
-        WRITE_CODE(vui.videoFormat, 3,          "video_format");
-        WRITE_FLAG(vui.videoFullRangeFlag,      "video_full_range_flag");
+        WRITE_CODE(vui.videoFormat, 3, "video_format");
+        WRITE_FLAG(vui.videoFullRangeFlag, "video_full_range_flag");
         WRITE_FLAG(vui.colourDescriptionPresentFlag, "colour_description_present_flag");
         if (vui.colourDescriptionPresentFlag)
         {
-            WRITE_CODE(vui.colourPrimaries, 8,         "colour_primaries");
+            WRITE_CODE(vui.colourPrimaries, 8, "colour_primaries");
             WRITE_CODE(vui.transferCharacteristics, 8, "transfer_characteristics");
-            WRITE_CODE(vui.matrixCoefficients, 8,      "matrix_coefficients");
+            WRITE_CODE(vui.matrixCoefficients, 8, "matrix_coefficients");
         }
     }
 
-    WRITE_FLAG(vui.chromaLocInfoPresentFlag,           "chroma_loc_info_present_flag");
+    WRITE_FLAG(vui.chromaLocInfoPresentFlag, "chroma_loc_info_present_flag");
     if (vui.chromaLocInfoPresentFlag)
     {
-        WRITE_UVLC(vui.chromaSampleLocTypeTopField,    "chroma_sample_loc_type_top_field");
+        WRITE_UVLC(vui.chromaSampleLocTypeTopField, "chroma_sample_loc_type_top_field");
         WRITE_UVLC(vui.chromaSampleLocTypeBottomField, "chroma_sample_loc_type_bottom_field");
     }
 
-    WRITE_FLAG(0,                                     "neutral_chroma_indication_flag");
-    WRITE_FLAG(vui.fieldSeqFlag,                      "field_seq_flag");
-    WRITE_FLAG(vui.frameFieldInfoPresentFlag,         "frame_field_info_present_flag");
+    WRITE_FLAG(0, "neutral_chroma_indication_flag");
+    WRITE_FLAG(vui.fieldSeqFlag, "field_seq_flag");
+    WRITE_FLAG(vui.frameFieldInfoPresentFlag, "frame_field_info_present_flag");
 
-    WRITE_FLAG(vui.defaultDisplayWindow.bEnabled,    "default_display_window_flag");
+    WRITE_FLAG(vui.defaultDisplayWindow.bEnabled, "default_display_window_flag");
     if (vui.defaultDisplayWindow.bEnabled)
     {
-        WRITE_UVLC(vui.defaultDisplayWindow.leftOffset,   "def_disp_win_left_offset");
-        WRITE_UVLC(vui.defaultDisplayWindow.rightOffset,  "def_disp_win_right_offset");
-        WRITE_UVLC(vui.defaultDisplayWindow.topOffset,    "def_disp_win_top_offset");
+        WRITE_UVLC(vui.defaultDisplayWindow.leftOffset, "def_disp_win_left_offset");
+        WRITE_UVLC(vui.defaultDisplayWindow.rightOffset, "def_disp_win_right_offset");
+        WRITE_UVLC(vui.defaultDisplayWindow.topOffset, "def_disp_win_top_offset");
         WRITE_UVLC(vui.defaultDisplayWindow.bottomOffset, "def_disp_win_bottom_offset");
     }
 
-    WRITE_FLAG(1,                                 "vui_timing_info_present_flag");
-    WRITE_CODE(vui.timingInfo.numUnitsInTick, 32, "vui_num_units_in_tick");
-    WRITE_CODE(vui.timingInfo.timeScale, 32,      "vui_time_scale");
-    WRITE_FLAG(0,                                 "vui_poc_proportional_to_timing_flag");
-
-    WRITE_FLAG(vui.hrdParametersPresentFlag,  "vui_hrd_parameters_present_flag");
-    if (vui.hrdParametersPresentFlag)
-        codeHrdParameters(vui.hrdParameters, maxSubTLayers);
+    if (bDiscardOptionalVUI)
+        WRITE_FLAG(0, "vui_timing_info_present_flag");
+    else
+    {
+        WRITE_FLAG(1, "vui_timing_info_present_flag");
+        WRITE_CODE(vui.timingInfo.numUnitsInTick, 32, "vui_num_units_in_tick");
+        WRITE_CODE(vui.timingInfo.timeScale, 32, "vui_time_scale");
+        WRITE_FLAG(0, "vui_poc_proportional_to_timing_flag");
+    }
+
+    if (bDiscardOptionalVUI)
+        WRITE_FLAG(0, "vui_hrd_parameters_present_flag");
+    else
+    {
+        WRITE_FLAG(vui.hrdParametersPresentFlag, "vui_hrd_parameters_present_flag");
+        if (vui.hrdParametersPresentFlag)
+            codeHrdParameters(vui.hrdParameters, maxSubTLayers);
+    }
 
     WRITE_FLAG(0, "bitstream_restriction_flag");
 }
@@ -570,22 +580,28 @@
     WRITE_CODE(picType, 3, "pic_type");
 }
 
-void Entropy::codeSliceHeader(const Slice& slice, FrameData& encData)
+void Entropy::codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp)
 {
-    WRITE_FLAG(1, "first_slice_segment_in_pic_flag");
+    WRITE_FLAG((slice_addr == 0 ? 1 : 0), "first_slice_segment_in_pic_flag");
     if (slice.getRapPicFlag())
         WRITE_FLAG(0, "no_output_of_prior_pics_flag");
 
     WRITE_UVLC(0, "slice_pic_parameter_set_id");
 
     /* x265 does not use dependent slices, so always write all this data */
+    if (slice_addr)
+    {
+        // if( dependent_slice_segments_enabled_flag )
+        //     dependent_slice_segment_flag             u(1)
+        WRITE_CODE(slice_addr, slice_addr_bits, "slice_segment_address");
+    }
 
     WRITE_UVLC(slice.m_sliceType, "slice_type");
 
     if (!slice.getIdrPicFlag())
     {
-        int picOrderCntLSB = (slice.m_poc - slice.m_lastIDR + (1 << BITS_FOR_POC)) % (1 << BITS_FOR_POC);
-        WRITE_CODE(picOrderCntLSB, BITS_FOR_POC, "pic_order_cnt_lsb");
+        int picOrderCntLSB = (slice.m_poc - slice.m_lastIDR + (1 << slice.m_sps->log2MaxPocLsb)) % (1 << slice.m_sps->log2MaxPocLsb);
+        WRITE_CODE(picOrderCntLSB, slice.m_sps->log2MaxPocLsb, "pic_order_cnt_lsb");
 
 #if _DEBUG || CHECKED_BUILD
         // check for bitstream restriction stating that:
@@ -657,18 +673,24 @@
     if (!slice.isIntra())
         WRITE_UVLC(MRG_MAX_NUM_CANDS - slice.m_maxNumMergeCand, "five_minus_max_num_merge_cand");
 
-    int code = slice.m_sliceQp - 26;
+    int code = sliceQp - 26;
     WRITE_SVLC(code, "slice_qp_delta");
 
-    bool isSAOEnabled = slice.m_sps->bUseSAO ? saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1] : false;
-    bool isDBFEnabled = !slice.m_pps->bPicDisableDeblockingFilter;
+    // TODO: Enable when pps_loop_filter_across_slices_enabled_flag==1
+    //       Filtering across slice boundaries is not supported yet, so it is disabled for now
 
-    if (isSAOEnabled || isDBFEnabled)
-        WRITE_FLAG(slice.m_sLFaseFlag, "slice_loop_filter_across_slices_enabled_flag");
+    if (g_maxSlices <= 1)
+    {
+        bool isSAOEnabled = slice.m_sps->bUseSAO ? saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1] : false;
+        bool isDBFEnabled = !slice.m_pps->bPicDisableDeblockingFilter;
+
+        if (isSAOEnabled || isDBFEnabled)
+            WRITE_FLAG(slice.m_sLFaseFlag, "slice_loop_filter_across_slices_enabled_flag");
+    }
 }
 
 /** write wavefront substreams sizes for the slice header */
-void Entropy::codeSliceHeaderWPPEntryPoints(const Slice& slice, const uint32_t *substreamSizes, uint32_t maxOffset)
+void Entropy::codeSliceHeaderWPPEntryPoints(const uint32_t *substreamSizes, uint32_t numSubStreams, uint32_t maxOffset)
 {
     uint32_t offsetLen = 1;

x265_2.0.tar.gz/source/encoder/entropy.h -> x265_2.1.tar.gz/source/encoder/entropy.h Changed

@@ -142,13 +142,13 @@
 
     void codeVPS(const VPS& vps);
     void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl);
-    void codePPS(const PPS& pps);
-    void codeVUI(const VUI& vui, int maxSubTLayers);
+    void codePPS(const PPS& pps, bool filerAcross);
+    void codeVUI(const VUI& vui, int maxSubTLayers, bool discardOptionalVUI);
     void codeAUD(const Slice& slice);
     void codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers);
 
-    void codeSliceHeader(const Slice& slice, FrameData& encData);
-    void codeSliceHeaderWPPEntryPoints(const Slice& slice, const uint32_t *substreamSizes, uint32_t maxOffset);
+    void codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp);
+    void codeSliceHeaderWPPEntryPoints(const uint32_t *substreamSizes, uint32_t numSubStreams, uint32_t maxOffset);
     void codeShortTermRefPicSet(const RPS& rps);
     void finishSlice()                 { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }

x265_2.0.tar.gz/source/encoder/frameencoder.cpp -> x265_2.1.tar.gz/source/encoder/frameencoder.cpp Changed

@@ -85,6 +85,7 @@
 
     delete[] m_rows;
     delete[] m_outStreams;
+    X265_FREE(m_sliceBaseRow);
     X265_FREE(m_cuGeoms);
     X265_FREE(m_ctuGeomMap);
     X265_FREE(m_substreamSizes);
@@ -113,12 +114,15 @@
     m_rows = new CTURow[m_numRows];
     bool ok = !!m_numRows;
 
+    m_sliceBaseRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
+    ok &= !!m_sliceBaseRow;
+
     /* determine full motion search range */
     int range  = m_param->searchRange;       /* fpel search */
     range += !!(m_param->searchMethod < 2);  /* diamond/hex range check lag */
     range += NTAPS_LUMA / 2;                 /* subpel filter half-length */
     range += 2 + MotionEstimate::hpelIterationCount(m_param->subpelRefine) / 2; /* subpel refine steps */
-    m_refLagRows = 1 + ((range + g_maxCUSize - 1) / g_maxCUSize);
+    m_refLagRows = /*(m_param->maxSlices > 1 ? 1 : 0) +*/ 1 + ((range + g_maxCUSize - 1) / g_maxCUSize);
 
     // NOTE: 2 times of numRows because both Encoder and Filter in same queue
     if (!WaveFront::init(m_numRows * 2))
@@ -145,6 +149,13 @@
     else
         m_param->noiseReductionIntra = m_param->noiseReductionInter = 0;
 
+    // 7.4.7.1 - Ceil( Log2( PicSizeInCtbsY ) ) bits
+    {
+        unsigned long tmp;
+        CLZ(tmp, (numRows * numCols));
+        m_sliceAddrBits = (uint16_t)(tmp + 1);
+    }
+
     return ok;
 }
 
@@ -444,12 +455,33 @@
     /* ensure all rows are blocked prior to initializing row CTU counters */
     WaveFront::clearEnabledRowMask();
 
-    /* reset entropy coders */
+    /* reset entropy coders and compute slice id */
     m_entropyCoder.load(m_initSliceContext);
+    const uint32_t sliceGroupSize = (m_numRows + m_param->maxSlices - 1) / m_param->maxSlices;
+    const uint32_t sliceGroupSizeAccu = (m_numRows << 8) / m_param->maxSlices;
+    m_sliceGroupSize = (uint16_t)sliceGroupSize;
+
+    uint32_t rowSum = sliceGroupSizeAccu;
+    uint32_t sidx = 0;
     for (uint32_t i = 0; i < m_numRows; i++)
-        m_rows[i].init(m_initSliceContext);
+    {
+        const uint32_t rowRange = (rowSum >> 8);
 
-    uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1;
+        if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
+        {
+            rowSum += sliceGroupSizeAccu;
+            m_sliceBaseRow[++sidx] = i;
+        }
+
+        m_rows[i].init(m_initSliceContext, sidx);
+    }
+    X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
+
+    m_sliceBaseRow[0] = 0;
+    m_sliceBaseRow[m_param->maxSlices] = m_numRows;
+
+    uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : m_param->maxSlices;
+    X265_CHECK(m_param->bEnableWavefront || (m_param->maxSlices == 1), "Multiple slices without WPP unsupport now!");
     if (!m_outStreams)
     {
         m_outStreams = new Bitstream[numSubstreams];
@@ -466,7 +498,7 @@
 
     if (m_frame->m_lowres.bKeyframe)
     {
-        if (m_param->bEmitHRDSEI)
+        if (!m_param->bDiscardSEI && m_param->bEmitHRDSEI)
         {
             SEIBufferingPeriod* bpSei = &m_top->m_rateControl->m_bufPeriodSEI;
 
@@ -488,7 +520,7 @@
         }
     }
 
-    if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
+    if (!m_param->bDiscardSEI && (m_param->bEmitHRDSEI || !!m_param->interlaceMode))
     {
         SEIPictureTiming *sei = m_rce.picTimingSEI;
         const VUI *vui = &slice->m_sps->vuiParameters;
@@ -523,6 +555,25 @@
         m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
     }
 
+    /* Write user SEI */
+    if (!m_param->bDiscardSEI)
+    {
+        for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
+        {
+            x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
+            SEIuserDataUnregistered sei;
+
+            sei.m_payloadType = payload->payloadType;
+            sei.m_userDataLength = payload->payloadSize;
+            sei.m_userData = payload->payload;
+
+            m_bs.resetBits();
+            sei.write(m_bs, *slice->m_sps);
+            m_bs.writeByteAlignment();
+            m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
+        }
+    }
+
     /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to 
      * tune RateControl parameters for other frames.
      * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
@@ -540,35 +591,54 @@
      * compressed in a wave-front pattern if WPP is enabled. Row based loop
      * filters runs behind the CTU compression and reconstruction */
 
-    m_rows[0].active = true;
+    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
+    {
+        m_rows[m_sliceBaseRow[sliceId]].active = true;
+    }
+
     if (m_param->bEnableWavefront)
     {
-        for (uint32_t row = 0; row < m_numRows; row++)
+        for (uint32_t rowInSlice = 0; rowInSlice < m_sliceGroupSize; rowInSlice++)
         {
-            // block until all reference frames have reconstructed the rows we need
-            for (int l = 0; l < numPredDir; l++)
+            for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
             {
-                for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
+                const uint32_t sliceStartRow = m_sliceBaseRow[sliceId];
+                const uint32_t sliceEndRow = m_sliceBaseRow[sliceId + 1] - 1;
+                const uint32_t row = sliceStartRow + rowInSlice;
+
+                if (row >= m_numRows)
+                    break;
+
+                if (row > sliceEndRow)
+                    continue;
+
+                // block until all reference frames have reconstructed the rows we need
+                for (int l = 0; l < numPredDir; l++)
                 {
-                    Frame *refpic = slice->m_refFrameList[l][ref];
+                    for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
+                    {
+                        Frame *refpic = slice->m_refFrameList[l][ref];
 
-                    uint32_t reconRowCount = refpic->m_reconRowCount.get();
-                    while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows))
-                        reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount);
+                        // NOTE: there is no need to wait for rows beyond the current slice boundary
+                        const int rowIdx = X265_MIN(sliceEndRow, (row + m_refLagRows));
 
-                    if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
-                        m_mref[l][ref].applyWeight(row + m_refLagRows, m_numRows);
+                        while (refpic->m_reconRowFlag[rowIdx].get() == 0)
+                            refpic->m_reconRowFlag[rowIdx].waitForChange(0);
+
+                        if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
+                            m_mref[l][ref].applyWeight(row + m_refLagRows, m_numRows, sliceEndRow + 1, sliceId);
+                    }
                 }
-            }
 
-            enableRowEncoder(row); /* clear external dependency for this row */
-            if (!row)
-            {
-                m_row0WaitTime = x265_mdate();
-                enqueueRowEncoder(0); /* clear internal dependency, start wavefront */
-            }
-            tryWakeOne();
-        }
+                enableRowEncoder(row); /* clear external dependency for this row */
+                if (!rowInSlice)
+                {
+                    m_row0WaitTime = x265_mdate();
+                    enqueueRowEncoder(row); /* clear internal dependency, start wavefront */
+                }
+                tryWakeOne();
+            } // end of loop sliceId
+        } // end of loop rowInSlice
 
         m_allRowsAvailableTime = x265_mdate();
         tryWakeOne(); /* ensure one thread is active or help-wanted flag is set prior to blocking */
@@ -591,12 +661,12 @@
                     {
                         Frame *refpic = slice->m_refFrameList[list][ref];
 
-                        uint32_t reconRowCount = refpic->m_reconRowCount.get();

x265_2.0.tar.gz/source/encoder/frameencoder.h -> x265_2.1.tar.gz/source/encoder/frameencoder.h Changed

@@ -73,6 +73,7 @@
 {
     Entropy           bufferedEntropy;  /* store CTU2 context for next row CTU0 */
     Entropy           rowGoOnCoder;     /* store context between CTUs, code bitstream if !SAO */
+    unsigned int      sliceId;          /* store current row slice id */
 
     FrameStats        rowStats;
 
@@ -96,11 +97,12 @@
     volatile uint32_t completed;
 
     /* called at the start of each frame to initialize state */
-    void init(Entropy& initContext)
+    void init(Entropy& initContext, unsigned int sid)
     {
         active = false;
         busy = false;
         completed = 0;
+        sliceId = sid;
         memset(&rowStats, 0, sizeof(rowStats));
         rowGoOnCoder.load(initContext);
     }
@@ -142,6 +144,9 @@
     uint32_t                 m_refLagRows;
 
     CTURow*                  m_rows;
+    uint16_t                 m_sliceAddrBits;
+    uint16_t                 m_sliceGroupSize;
+    uint32_t*                m_sliceBaseRow;
     RateControlEntry         m_rce;
     SEIDecodedPictureHash    m_seiReconPictureDigest;
 
@@ -214,7 +219,7 @@
     void compressFrame();
 
     /* called by compressFrame to generate final per-row bitstreams */
-    void encodeSlice();
+    void encodeSlice(uint32_t sliceAddr);
 
     void threadMain();
     int  collectCTUStatistics(const CUData& ctu, FrameStats* frameLog);

x265_2.0.tar.gz/source/encoder/framefilter.cpp -> x265_2.1.tar.gz/source/encoder/framefilter.cpp Changed

@@ -174,11 +174,11 @@
         restoreOrigLosslessYuv(cu, frame, absPartIdx);
 }
 
-void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col)
+void FrameFilter::ParallelFilter::copySaoAboveRef(const CUData *ctu, PicYuv* reconPic, uint32_t cuAddr, int col)
 {
     // Copy SAO Top Reference Pixels
     int ctuWidth  = g_maxCUSize;
-    const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_stride);
+    const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (ctu->m_bFirstRowInSlice ? 0 : reconPic->m_stride);
 
     // Luma
     memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth * sizeof(pixel));
@@ -189,8 +189,8 @@
     {
         ctuWidth  >>= m_sao.m_hChromaShift;
 
-        const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
-        const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
+        const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (ctu->m_bFirstRowInSlice ? 0 : reconPic->m_strideC);
+        const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (ctu->m_bFirstRowInSlice ? 0 : reconPic->m_strideC);
         memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth * sizeof(pixel));
         memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth * sizeof(pixel));
 
@@ -325,7 +325,7 @@
     int colEnd = m_allowedCol.get();
 
     // Avoid threading conflict
-    if (m_prevRow && colEnd > m_prevRow->m_lastDeblocked.get())
+    if (!m_encData->getPicCTU(m_rowAddr)->m_bFirstRowInSlice && colEnd > m_prevRow->m_lastDeblocked.get())
         colEnd = m_prevRow->m_lastDeblocked.get();
 
     if (colStart >= colEnd)
@@ -334,29 +334,29 @@
     for (uint32_t col = (uint32_t)colStart; col < (uint32_t)colEnd; col++)
     {
         const uint32_t cuAddr = m_rowAddr + col;
+        const CUData* ctu = m_encData->getPicCTU(cuAddr);
 
         if (m_frameFilter->m_param->bEnableLoopFilter)
         {
-            const CUData* ctu = m_encData->getPicCTU(cuAddr);
             deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
         }
 
         if (col >= 1)
         {
+            const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1);
             if (m_frameFilter->m_param->bEnableLoopFilter)
             {
-                const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1);
                 deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
 
                 // When SAO Disable, setting column counter here
-                if ((!m_frameFilter->m_param->bEnableSAO) & (m_row >= 1))
+                if (!m_frameFilter->m_param->bEnableSAO & !ctuPrev->m_bFirstRowInSlice)
                     m_prevRow->processPostCu(col - 1);
             }
 
             if (m_frameFilter->m_param->bEnableSAO)
             {
                 // Save SAO bottom row reference pixels
-                copySaoAboveRef(reconPic, cuAddr - 1, col - 1);
+                copySaoAboveRef(ctuPrev, reconPic, cuAddr - 1, col - 1);
 
                 // SAO Decide
                 if (col >= 2)
@@ -364,11 +364,11 @@
                     // NOTE: Delay 2 column to avoid mistake on below case, it is Deblock sync logic issue, less probability but still alive
                     //       ... H V |
                     //       ..S H V |
-                    m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, col - 2, cuAddr - 2);
+                    m_sao.rdoSaoUnitCu(saoParam, (ctu->m_bFirstRowInSlice ? 0 : m_rowAddr), col - 2, cuAddr - 2);
                 }
 
                 // Process Previous Row SAO CU
-                if (m_row >= 1 && col >= 3)
+                if (!ctu->m_bFirstRowInSlice && col >= 3)
                 {
                     // Must delay 1 row to avoid thread data race conflict
                     m_prevRow->processSaoCTU(saoParam, col - 3);
@@ -384,52 +384,54 @@
     if (colEnd == numCols)
     {
         const uint32_t cuAddr = m_rowAddr + numCols - 1;
+        const CUData* ctuPrev = m_encData->getPicCTU(cuAddr);
 
         if (m_frameFilter->m_param->bEnableLoopFilter)
         {
-            const CUData* ctuPrev = m_encData->getPicCTU(cuAddr);
             deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR);
 
             // When SAO Disable, setting column counter here
-            if ((!m_frameFilter->m_param->bEnableSAO) & (m_row >= 1))
+            if (!m_frameFilter->m_param->bEnableSAO & !ctuPrev->m_bFirstRowInSlice)
                 m_prevRow->processPostCu(numCols - 1);
         }
 
         // TODO: move processPostCu() into processSaoUnitCu()
         if (m_frameFilter->m_param->bEnableSAO)
         {
+            const CUData* ctu = m_encData->getPicCTU(m_rowAddr + numCols - 2);
+
             // Save SAO bottom row reference pixels
-            copySaoAboveRef(reconPic, cuAddr, numCols - 1);
+            copySaoAboveRef(ctuPrev, reconPic, cuAddr, numCols - 1);
 
             // SAO Decide
             // NOTE: reduce condition check for 1 CU only video, Why someone play with it?
             if (numCols >= 2)
-                m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, numCols - 2, cuAddr - 1);
+                m_sao.rdoSaoUnitCu(saoParam, (ctu->m_bFirstRowInSlice ? 0 : m_rowAddr), numCols - 2, cuAddr - 1);
 
             if (numCols >= 1)
-                m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, numCols - 1, cuAddr);
+                m_sao.rdoSaoUnitCu(saoParam, (ctuPrev->m_bFirstRowInSlice ? 0 : m_rowAddr), numCols - 1, cuAddr);
 
             // Process Previous Rows SAO CU
-            if (m_row >= 1 && numCols >= 3)
+            if (!ctuPrev->m_bFirstRowInSlice & (numCols >= 3))
             {
                 m_prevRow->processSaoCTU(saoParam, numCols - 3);
                 m_prevRow->processPostCu(numCols - 3);
             }
 
-            if (m_row >= 1 && numCols >= 2)
+            if (!ctuPrev->m_bFirstRowInSlice & (numCols >= 2))
             {
                 m_prevRow->processSaoCTU(saoParam, numCols - 2);
                 m_prevRow->processPostCu(numCols - 2);
             }
 
-            if (m_row >= 1 && numCols >= 1)
+            if (!ctuPrev->m_bFirstRowInSlice & (numCols >= 1))
             {
                 m_prevRow->processSaoCTU(saoParam, numCols - 1);
                 m_prevRow->processPostCu(numCols - 1);
             }
 
             // Setting column sync counter
-            if (m_row >= 1)
+            if (!ctuPrev->m_bFirstRowInSlice)
                 m_frameFilter->m_frame->m_reconColCount[m_row - 1].set(numCols - 1);
         }
         m_lastDeblocked.set(numCols);
@@ -454,6 +456,7 @@
 
     // SAO: was integrate into encode loop
     SAOParam* saoParam = encData.m_saoParam;
+    CUData* ctu = encData.getPicCTU(m_parallelFilter[row].m_rowAddr);
 
     /* Processing left block Deblock with current threading */
     {
@@ -461,15 +464,15 @@
         m_parallelFilter[row].waitForExit();
 
         /* Check to avoid previous row process slower than current row */
-        X265_CHECK((row < 1) || m_parallelFilter[row - 1].m_lastDeblocked.get() == m_numCols, "previous row not finish");
+        X265_CHECK(ctu->m_bFirstRowInSlice || m_parallelFilter[row - 1].m_lastDeblocked.get() == m_numCols, "previous row not finish");
 
         m_parallelFilter[row].m_allowedCol.set(m_numCols);
         m_parallelFilter[row].processTasks(-1);
 
-        if (row == m_numRows - 1)
+        if (ctu->m_bLastRowInSlice)
         {
             /* TODO: Early start last row */
-            if ((row >= 1) && (m_parallelFilter[row - 1].m_lastDeblocked.get() != m_numCols))
+            if ((!ctu->m_bFirstRowInSlice) && (m_parallelFilter[row - 1].m_lastDeblocked.get() != m_numCols))
                 x265_log(m_param, X265_LOG_WARNING, "detected ParallelFilter race condition on last row\n");
 
             /* Apply SAO on last row of CUs, because we always apply SAO on row[X-1] */
@@ -493,10 +496,19 @@
 
     // this row of CTUs has been encoded
 
-    if (row > 0)
+    if (!ctu->m_bFirstRowInSlice)
         processPostRow(row - 1);
 
-    if (row == m_numRows - 1)
+    if (ctu->m_bLastRowInSlice)
+        processPostRow(row);
+
+    // NOTE: with slice parallelism, rows are executed out of order
+    int numRowFinished;
+    for(numRowFinished = 0; numRowFinished < m_numRows; numRowFinished++)
+        if (!m_frame->m_reconRowFlag[numRowFinished].get())
+            break;
+
+    if (numRowFinished == m_numRows)
     {
         if (m_param->bEnableSAO)
         {
@@ -509,7 +521,6 @@
 
             m_parallelFilter[0].m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
         }
-        processPostRow(row);

x265_2.0.tar.gz/source/encoder/framefilter.h -> x265_2.1.tar.gz/source/encoder/framefilter.h Changed

x265_2.0.tar.gz/source/encoder/motion.cpp -> x265_2.1.tar.gz/source/encoder/motion.cpp Changed

@@ -581,14 +581,15 @@
                                    int              numCandidates,
                                    const MV *       mvc,
                                    int              merange,
-                                   MV &             outQMv)
+                                   MV &             outQMv,
+                                   pixel *          srcReferencePlane)
 {
     ALIGN_VAR_16(int, costs[16]);
     if (ctuAddr >= 0)
         blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
     intptr_t stride = ref->lumaStride;
     pixel* fenc = fencPUYuv.m_buf[0];
-    pixel* fref = ref->fpelPlane[0] + blockOffset;
+    pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] + blockOffset : srcReferencePlane + blockOffset;
 
     setMVP(qmvp);
 
@@ -1094,6 +1095,12 @@
 
     const SubpelWorkload& wl = workload[this->subpelRefine];
 
+    // check mv range for slice bound
+    if ((g_maxSlices > 1) & ((bmv.y < qmvmin.y) | (bmv.y > qmvmax.y)))
+    {
+        bmv.y = x265_min(x265_max(bmv.y, qmvmin.y), qmvmax.y);
+    }
+
     if (!bcost)
     {
         /* if there was zero residual at the clipped MVP, we can skip subpel
@@ -1141,6 +1148,11 @@
             for (int i = 1; i <= wl.hpel_dirs; i++)
             {
                 MV qmv = bmv + square1[i] * 2;
+
+                // check mv range for slice bound
+                if ((g_maxSlices > 1) & ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y)))
+                    continue;
+
                 int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
                 COPY2_IF_LT(bcost, cost, bdir, i);
             }
@@ -1161,6 +1173,11 @@
             for (int i = 1; i <= wl.qpel_dirs; i++)
             {
                 MV qmv = bmv + square1[i];
+
+                // check mv range for slice bound
+                if ((g_maxSlices > 1) & ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y)))
+                    continue;
+
                 int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
                 COPY2_IF_LT(bcost, cost, bdir, i);
             }

x265_2.0.tar.gz/source/encoder/motion.h -> x265_2.1.tar.gz/source/encoder/motion.h Changed

x265_2.0.tar.gz/source/encoder/ratecontrol.cpp -> x265_2.1.tar.gz/source/encoder/ratecontrol.cpp Changed

@@ -284,7 +284,11 @@
 #define ABR_SCENECUT_INIT_QP_MIN (12)
 #define CRF_INIT_QP (int)m_param->rc.rfConstant
     for (int i = 0; i < 3; i++)
+    {
         m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
+        m_lmin[i] = x265_qp2qScale(m_param->rc.qpMin);
+        m_lmax[i] = x265_qp2qScale(m_param->rc.qpMax);
+    }
 
     if (m_param->rc.rateControlMode == X265_RC_CQP)
     {
@@ -543,8 +547,11 @@
                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
                        &rce->skipCuCount);
                 rce->keptAsRef = true;
+                rce->isIdr = false;
                 if (picType == 'b' || picType == 'p')
                     rce->keptAsRef = false;
+                if (picType == 'I')
+                    rce->isIdr = true;
                 if (picType == 'I' || picType == 'i')
                     rce->sliceType = I_SLICE;
                 else if (picType == 'P' || picType == 'p')
@@ -611,9 +618,18 @@
         }
         if (m_param->rc.cuTree)
         {
-            m_cuTreeStats.qpBuffer[0] = X265_MALLOC(uint16_t, m_ncu * sizeof(uint16_t));
-            if (m_param->bBPyramid && m_param->rc.bStatRead)
-                m_cuTreeStats.qpBuffer[1] = X265_MALLOC(uint16_t, m_ncu * sizeof(uint16_t));
+            if (m_param->rc.qgSize == 8)
+            {
+                m_cuTreeStats.qpBuffer[0] = X265_MALLOC(uint16_t, m_ncu * 4 * sizeof(uint16_t));
+                if (m_param->bBPyramid && m_param->rc.bStatRead)
+                    m_cuTreeStats.qpBuffer[1] = X265_MALLOC(uint16_t, m_ncu * 4 * sizeof(uint16_t));
+            }
+            else
+            {
+                m_cuTreeStats.qpBuffer[0] = X265_MALLOC(uint16_t, m_ncu * sizeof(uint16_t));
+                if (m_param->bBPyramid && m_param->rc.bStatRead)
+                    m_cuTreeStats.qpBuffer[1] = X265_MALLOC(uint16_t, m_ncu * sizeof(uint16_t));
+            }
             m_cuTreeStats.qpBufPos = -1;
         }
     }
@@ -808,13 +824,19 @@
                  (double)m_param->rc.bitrate,
                  expectedBits * m_fps / (m_numEntries * 1000.),
                  avgq);
-        if (expectedBits < allAvailableBits && avgq < QP_MIN + 2)
+        if (expectedBits < allAvailableBits && avgq < m_param->rc.qpMin + 2)
         {
-            x265_log(m_param, X265_LOG_WARNING, "try reducing target bitrate\n");
+            if (m_param->rc.qpMin > 0)
+                x265_log(m_param, X265_LOG_WARNING, "try reducing target bitrate or reducing qp_min (currently %d)\n", m_param->rc.qpMin);
+            else
+                x265_log(m_param, X265_LOG_WARNING, "try reducing target bitrate\n");
         }
-        else if (expectedBits > allAvailableBits && avgq > QP_MAX_SPEC - 2)
+        else if (expectedBits > allAvailableBits && avgq > m_param->rc.qpMax - 2)
         {
-            x265_log(m_param, X265_LOG_WARNING, "try increasing target bitrate\n");
+            if (m_param->rc.qpMax < QP_MAX_MAX)
+                x265_log(m_param, X265_LOG_WARNING, "try increasing target bitrate or increasing qp_max (currently %d)\n", m_param->rc.qpMax);
+            else
+                x265_log(m_param, X265_LOG_WARNING, "try increasing target bitrate\n");
         }
         else if (!(m_2pass && m_isVbv))
             x265_log(m_param, X265_LOG_WARNING, "internal error\n");
@@ -966,6 +988,8 @@
     double adjustment;
     double prevBits = 0;
     int t0, t1;
+    double qScaleMin = x265_qp2qScale(m_param->rc.qpMin);
+    double qScaleMax = x265_qp2qScale(m_param->rc.qpMax);
     int iterations = 0 , adjMin, adjMax;
     CHECKED_MALLOC(fills, double, m_numEntries + 1);
     fills++;
@@ -985,7 +1009,7 @@
             adjMin = 1;
             while (adjMin && findUnderflow(fills, &t0, &t1, 1, endPos))
             {
-                adjMin = fixUnderflow(t0, t1, adjustment, MIN_QPSCALE, MAX_MAX_QPSCALE);
+                adjMin = fixUnderflow(t0, t1, adjustment, qScaleMin, qScaleMax);
                 t0 = t1;
             }
         }
@@ -995,7 +1019,7 @@
         /* fix underflows -- should be done after overflow, as we'd better undersize target than underflowing VBV */
         adjMax = 1;
         while (adjMax && findUnderflow(fills, &t0, &t1, 0, endPos))
-            adjMax = fixUnderflow(t0, t1, 1.001, MIN_QPSCALE, MAX_MAX_QPSCALE );
+            adjMax = fixUnderflow(t0, t1, 1.001, qScaleMin, qScaleMax);
         expectedBits = countExpectedBits(startPos, endPos);
     }
     while ((expectedBits < .995 * allAvailableBits) && ((int64_t)(expectedBits+.5) > (int64_t)(prevBits+.5)) && !(m_param->rc.rateControlMode == X265_RC_CRF));
@@ -1044,7 +1068,7 @@
             return X265_TYPE_AUTO;
         }
         int index = m_encOrder[frameNum];
-        int frameType = m_rce2Pass[index].sliceType == I_SLICE ? (frameNum > 0 && m_param->bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR)
+        int frameType = m_rce2Pass[index].sliceType == I_SLICE ? (m_rce2Pass[index].isIdr ? X265_TYPE_IDR : X265_TYPE_I)
                         : m_rce2Pass[index].sliceType == P_SLICE ? X265_TYPE_P
                         : (m_rce2Pass[index].sliceType == B_SLICE && m_rce2Pass[index].keptAsRef ? X265_TYPE_BREF : X265_TYPE_B);
         return frameType;
@@ -1216,13 +1240,17 @@
          * the scene-transition mini-gop */
 
         double q = x265_qScale2qp(rateEstimateQscale(curFrame, rce));
-        q = x265_clip3((double)QP_MIN, (double)QP_MAX_MAX, q);
+        q = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, q);
         m_qp = int(q + 0.5);
         q = m_isGrainEnabled ? m_qp : q;
         rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = q;
         /* copy value of lastRceq into thread local rce struct *to be used in RateControlEnd() */
         rce->qRceq = m_lastRceq;
         accumPQpUpdate();
+        curFrame->m_rcData->cumulativePQp = m_accumPQp;
+        curFrame->m_rcData->cumulativePNorm = m_accumPNorm;
+        for (int i = 0; i < 3; i++)
+            curFrame->m_rcData->lastQScaleFor[i] = m_lastQScaleFor[i];
     }
     else // CQP
     {
@@ -1250,7 +1278,7 @@
     if (curFrame->m_forceqp)
     {
         m_qp = (int32_t)(curFrame->m_forceqp + 0.5) - 1;
-        m_qp = x265_clip3(QP_MIN, QP_MAX_MAX, m_qp);
+        m_qp = x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, m_qp);
         rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = m_qp;
         if (m_isAbr || m_2pass)
         {
@@ -1408,6 +1436,11 @@
 {
     int index = m_encOrder[frame->m_poc];
     uint8_t sliceTypeActual = (uint8_t)m_rce2Pass[index].sliceType;
+    int ncu;
+    if (m_param->rc.qgSize == 8)
+        ncu = m_ncu * 4;
+    else
+        ncu = m_ncu;
     if (m_rce2Pass[index].keptAsRef)
     {
         /* TODO: We don't need pre-lookahead to measure AQ offsets, but there is currently
@@ -1421,7 +1454,7 @@
 
                 if (!fread(&type, 1, 1, m_cutreeStatFileIn))
                     goto fail;
-                if (fread(m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], sizeof(uint16_t), m_ncu, m_cutreeStatFileIn) != (size_t)m_ncu)
+                if (fread(m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], sizeof(uint16_t), ncu, m_cutreeStatFileIn) != (size_t)ncu)
                     goto fail;
 
                 if (type != sliceTypeActual && m_cuTreeStats.qpBufPos == 1)
@@ -1432,8 +1465,8 @@
             }
             while(type != sliceTypeActual);
         }
-        primitives.fix8Unpack(frame->m_lowres.qpCuTreeOffset, m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], m_ncu);
-        for (int i = 0; i < m_ncu; i++)
+        primitives.fix8Unpack(frame->m_lowres.qpCuTreeOffset, m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], ncu);
+        for (int i = 0; i < ncu; i++)
             frame->m_lowres.invQscaleFactor[i] = x265_exp2fix8(frame->m_lowres.qpCuTreeOffset[i]);
         m_cuTreeStats.qpBufPos--;
     }
@@ -1643,6 +1676,8 @@
         double abrBuffer = 2 * m_rateTolerance * m_bitrate;
         if (m_2pass)
         {
+            double lmin = m_lmin[m_sliceType];
+            double lmax = m_lmax[m_sliceType];
             int64_t diff;
             if (!m_isVbv)
             {
@@ -1681,8 +1716,16 @@
             }
             if (m_framesDone == 0 && m_param->rc.rateControlMode == X265_RC_ABR && m_isGrainEnabled)
                 q = X265_MIN(x265_qp2qScale(ABR_INIT_QP_GRAIN_MAX), q);
-
             rce->qpNoVbv = x265_qScale2qp(q);
+            if ((m_sliceType == I_SLICE && m_param->keyframeMax > 1
+                && m_lastNonBPictType != I_SLICE && !m_isAbrReset) || (m_isNextGop && !m_framesDone))
+                m_avgPFrameQp = 0;
+            if (m_sliceType == P_SLICE)
+            {
+                m_avgPFrameQp = m_avgPFrameQp == 0 ? rce->qpNoVbv : m_avgPFrameQp;
+                m_avgPFrameQp = (m_avgPFrameQp + rce->qpNoVbv) / 2;
+            }
+
             if (m_isVbv)
             {
                 /* Do not overflow vbv */
@@ -1693,17 +1736,17 @@
                 double sizeConstraint = 1 + expectedFullness;
                 qmax = X265_MAX(qmax, rce->newQScale);
                 if (expectedFullness < .05)
-                    qmax = MAX_MAX_QPSCALE;
-                qmax = X265_MIN(qmax, MAX_MAX_QPSCALE);
+                    qmax = lmax;

x265_2.0.tar.gz/source/encoder/ratecontrol.h -> x265_2.1.tar.gz/source/encoder/ratecontrol.h Changed

x265_2.0.tar.gz/source/encoder/reference.cpp -> x265_2.1.tar.gz/source/encoder/reference.cpp Changed

@@ -3,6 +3,7 @@
  *
  * Authors: Steve Borho <steve@borho.org>
  *          Deepthi Devaki <deepthidevaki@multicorewareinc.com>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -36,10 +37,12 @@
     weightBuffer[0] = NULL;
     weightBuffer[1] = NULL;
     weightBuffer[2] = NULL;
+    numSliceWeightedRows = NULL;
 }
 
 MotionReference::~MotionReference()
 {
+    X265_FREE(numSliceWeightedRows);
     X265_FREE(weightBuffer[0]);
     X265_FREE(weightBuffer[1]);
     X265_FREE(weightBuffer[2]);
@@ -48,11 +51,19 @@
 int MotionReference::init(PicYuv* recPic, WeightParam *wp, const x265_param& p)
 {
     reconPic = recPic;
-    numWeightedRows = 0;
     lumaStride = recPic->m_stride;
     chromaStride = recPic->m_strideC;
     numInterpPlanes = p.subpelRefine > 2 ? 3 : 1; /* is chroma satd possible? */
 
+    if (numSliceWeightedRows)
+    {
+        // Not strictly necessary, but avoids risk if parameters are modified dynamically in the future.
+        X265_FREE(numSliceWeightedRows);
+        numSliceWeightedRows = NULL;
+    }
+    numSliceWeightedRows = X265_MALLOC(uint32_t, p.maxSlices);
+    memset(numSliceWeightedRows, 0, p.maxSlices * sizeof(uint32_t));
+
     /* directly reference the extended integer pel planes */
     fpelPlane[0] = recPic->m_picOrg[0];
     fpelPlane[1] = recPic->m_picOrg[1];
@@ -105,9 +116,10 @@
     return 0;
 }
 
-void MotionReference::applyWeight(int finishedRows, int maxNumRows)
+void MotionReference::applyWeight(uint32_t finishedRows, uint32_t maxNumRows, uint32_t maxNumRowsInSlice, uint32_t sliceId)
 {
-    finishedRows = X265_MIN(finishedRows, maxNumRows);
+    const uint32_t numWeightedRows = numSliceWeightedRows[sliceId];
+    finishedRows = X265_MIN(finishedRows, maxNumRowsInSlice);
     if (numWeightedRows >= finishedRows)
         return;
 
@@ -116,7 +128,7 @@
     intptr_t stride = reconPic->m_stride;
     int width   = reconPic->m_picWidth;
     int height  = (finishedRows - numWeightedRows) * g_maxCUSize;
-    if (finishedRows == maxNumRows && (reconPic->m_picHeight % g_maxCUSize))
+    if ((finishedRows == maxNumRows) && (reconPic->m_picHeight % g_maxCUSize))
     {
         /* the last row may be partial height */
         height -= g_maxCUSize;
@@ -170,5 +182,5 @@
         }
     }
 
-    numWeightedRows = finishedRows;
+    numSliceWeightedRows[sliceId] = finishedRows;
 }

x265_2.0.tar.gz/source/encoder/reference.h -> x265_2.1.tar.gz/source/encoder/reference.h Changed

x265_2.0.tar.gz/source/encoder/sao.cpp -> x265_2.1.tar.gz/source/encoder/sao.cpp Changed

@@ -283,6 +283,16 @@
     int ctuHeight = g_maxCUSize;
     uint32_t lpelx = cu->m_cuPelX;
     uint32_t tpely = cu->m_cuPelY;
+    const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice;
+    const uint32_t lastRowInSlice = cu->m_bLastRowInSlice;
+    const uint32_t bAboveUnavail = (!tpely) | firstRowInSlice;
+
+    // NOTE: Careful! picHeight is used for equality comparisons only, so it should be safe to override it here
+    if (lastRowInSlice)
+    {
+        picHeight = x265_min(picHeight, (tpely + ctuHeight));
+    }
+
     if (plane)
     {
         picWidth  >>= m_hChromaShift;
@@ -367,9 +377,9 @@
     }
     case SAO_EO_1: // dir: |
     {
-        int startY = !tpely;
+        int startY = bAboveUnavail;
         int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
-        if (!tpely)
+        if (startY)
             rec += stride;
 
         if (ctuWidth & 15)
@@ -408,10 +418,10 @@
         int startX = !lpelx;
         int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
 
-        int startY = !tpely;
+        int startY = bAboveUnavail;
         int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
 
-        if (!tpely)
+        if (startY)
             rec += stride;
 
         if (!(ctuWidth & 15))
@@ -474,10 +484,10 @@
         int startX = !lpelx;
         int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
 
-        int startY = !tpely;
+        int startY = bAboveUnavail;
         int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
 
-        if (!tpely)
+        if (startY)
             rec += stride;
 
         if (ctuWidth & 15)
@@ -737,6 +747,10 @@
     int ctuHeight = g_maxCUSize;
     uint32_t lpelx = cu->m_cuPelX;
     uint32_t tpely = cu->m_cuPelY;
+    const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice;
+    const uint32_t lastRowInSlice = cu->m_bLastRowInSlice;
+    const uint32_t bAboveUnavail = (!tpely) | firstRowInSlice;
+
     if (plane)
     {
         picWidth  >>= m_hChromaShift;
@@ -751,6 +765,12 @@
     ctuWidth  = rpelx - lpelx;
     ctuHeight = bpely - tpely;
 
+    // NOTE: Careful! picHeight is used only for equality comparisons below, so it should be safe to override it here
+    if (lastRowInSlice)
+    {
+        picHeight = bpely;
+    }
+
     int startX;
     int startY;
     int endX;
@@ -825,10 +845,10 @@
 
             rec  = rec0;
 
-            startY = !tpely;
+            startY = bAboveUnavail;
             endX   = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset;
             endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
-            if (!tpely)
+            if (startY)
             {
                 rec += stride;
             }
@@ -852,9 +872,9 @@
             startX = !lpelx;
             endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
 
-            startY = !tpely;
+            startY = bAboveUnavail;
             endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
-            if (!tpely)
+            if (startY)
             {
                 fenc += stride;
                 rec += stride;
@@ -879,10 +899,10 @@
             startX = !lpelx;
             endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
 
-            startY = !tpely;
+            startY = bAboveUnavail;
             endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
 
-            if (!tpely)
+            if (startY)
             {
                 fenc += stride;
                 rec += stride;
@@ -911,6 +931,16 @@
     int ctuHeight = g_maxCUSize;
     uint32_t lpelx = cu->m_cuPelX;
     uint32_t tpely = cu->m_cuPelY;
+    const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice;
+    const uint32_t lastRowInSlice = cu->m_bLastRowInSlice;
+    const uint32_t bAboveAvail = (!tpely) | firstRowInSlice;
+
+    // NOTE: Careful! picHeight is used for equality comparisons only, so it should be safe to override it here
+    if (lastRowInSlice)
+    {
+        picHeight = x265_min(picHeight, (tpely + ctuHeight));
+    }
+
     uint32_t rpelx = x265_min(lpelx + ctuWidth,  picWidth);
     uint32_t bpely = x265_min(tpely + ctuHeight, picHeight);
     ctuWidth  = rpelx - lpelx;
@@ -1028,10 +1058,10 @@
 
             startX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
             startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
-            firstY = !tpely;
+            firstY = bAboveAvail;
             // endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
             endY   = ctuHeight - 1; // not refer below CTU
-            if (!tpely)
+            if (firstY)
             {
                 fenc += stride;
                 rec += stride;
@@ -1074,12 +1104,12 @@
             startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
             startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
             firstX = !lpelx;
-            firstY = !tpely;
+            firstY = bAboveAvail;
             // endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
             // endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
             endX   = ctuWidth - 1;  // not refer right CTU
             endY   = ctuHeight - 1; // not refer below CTU
-            if (!tpely)
+            if (firstY)
             {
                 fenc += stride;
                 rec += stride;
@@ -1126,12 +1156,12 @@
             startX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
             startY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
             firstX = !lpelx;
-            firstY = !tpely;
+            firstY = bAboveAvail;
             // endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
             // endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
             endX   = ctuWidth - 1;  // not refer right CTU
             endY   = ctuHeight - 1; // not refer below CTU
-            if (!tpely)
+            if (firstY)
             {
                 fenc += stride;
                 rec += stride;
@@ -1197,7 +1227,7 @@
 
     int qpCb = qp;
     if (m_param->internalCsp == X265_CSP_I420)
-        qpCb = x265_clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice->m_pps->chromaQpOffset[0]]);
+        qpCb = x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)g_chromaScale[qp + slice->m_pps->chromaQpOffset[0]]);
     else
         qpCb = X265_MIN(qp + slice->m_pps->chromaQpOffset[0], QP_MAX_SPEC);

x265_2.0.tar.gz/source/encoder/search.cpp -> x265_2.1.tar.gz/source/encoder/search.cpp Changed

@@ -1854,10 +1854,26 @@
     for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
     {
         /* Prevent TMVP candidates from using unavailable reference pixels */
-        if (m_bFrameParallel &&
-            (candMvField[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
-             candMvField[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4))
-            continue;
+        if (m_bFrameParallel)
+        {
+            // Parallel slices bound check
+            if (m_param->maxSlices > 1)
+            {
+                if (cu.m_bFirstRowInSlice &
+                    ((candMvField[mergeCand][0].mv.y < (2 * 4)) | (candMvField[mergeCand][1].mv.y < (2 * 4))))
+                    continue;
+
+                // Last row in slice can't reference beyond bound since it is another slice area
+                // TODO: we may allow going beyond this bound in the future, since with parallel slices that area has a chance of being finished already; this needs research on load balancing first
+                if (cu.m_bLastRowInSlice &&
+                    ((candMvField[mergeCand][0].mv.y > -3 * 4) | (candMvField[mergeCand][1].mv.y > -3 * 4)))
+                    continue;
+            }
+
+            if (candMvField[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
+                candMvField[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4)
+                continue;
+        }
 
         cu.m_mv[0][pu.puAbsPartIdx] = candMvField[mergeCand][0].mv;
         cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][0].refIdx;
@@ -1925,17 +1941,24 @@
         MV mvCand = amvp[i];
 
         // NOTE: skip mvCand if Y is > merange and -FN>1
-        if (m_bFrameParallel && (mvCand.y >= (m_param->searchRange + 1) * 4))
-            costs[i] = m_me.COST_MAX;
-        else
+        if (m_bFrameParallel)
         {
-            cu.clipMv(mvCand);
-            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand);
-            costs[i] = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
+            costs[i] = m_me.COST_MAX;
+
+            if (mvCand.y >= (m_param->searchRange + 1) * 4)
+                continue;
+
+            if ((m_param->maxSlices > 1) &
+                ((mvCand.y < m_sliceMinY)
+              |  (mvCand.y > m_sliceMaxY)))
+                continue;
         }
+        cu.clipMv(mvCand);
+        predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand);
+        costs[i] = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
     }
 
-    return costs[0] <= costs[1] ? 0 : 1;
+    return (costs[0] <= costs[1]) ? 0 : 1;
 }
 
 void Search::PME::processTasks(int workerThreadId)
@@ -2023,7 +2046,8 @@
 
     setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
 
-    int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);
+    int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, 
+      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
 
     /* Get total cost of partition, but only include MV bit cost once */
     bits += m_me.bitcost(outmv);
@@ -2106,9 +2130,10 @@
                 const MV* amvp = interMode.amvpCand[list][ref];
                 int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                 MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
-                
+
                 setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
-                int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);
+                int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv,
+                  m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
 
                 /* Get total cost of partition, but only include MV bit cost once */
                 bits += m_me.bitcost(outmv);
@@ -2206,7 +2231,8 @@
                     }
 
                     setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
-                    int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);
+                    int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, 
+                      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
 
                     /* Get total cost of partition, but only include MV bit cost once */
                     bits += m_me.bitcost(outmv);
@@ -2497,6 +2523,13 @@
         mvmin.x = X265_MIN(mvmin.x, maxSafeMv);
     }
 
+    // apply restrict on slices
+    if ((m_param->maxSlices > 1) & m_bFrameParallel)
+    {
+        mvmin.y = X265_MAX(mvmin.y, m_sliceMinY);
+        mvmax.y = X265_MIN(mvmax.y, m_sliceMaxY);
+    }
+
     /* Clip search range to signaled maximum MV length.
      * We do not support this VUI field being changed from the default */
     const int maxMvLen = (1 << 15) - 1;

x265_2.0.tar.gz/source/encoder/search.h -> x265_2.1.tar.gz/source/encoder/search.h Changed

x265_2.0.tar.gz/source/encoder/sei.h -> x265_2.1.tar.gz/source/encoder/sei.h Changed

@@ -46,36 +46,7 @@
 
 protected:
 
-    enum PayloadType
-    {
-        BUFFERING_PERIOD                     = 0,
-        PICTURE_TIMING                       = 1,
-        PAN_SCAN_RECT                        = 2,
-        FILLER_PAYLOAD                       = 3,
-        USER_DATA_REGISTERED_ITU_T_T35       = 4,
-        USER_DATA_UNREGISTERED               = 5,
-        RECOVERY_POINT                       = 6,
-        SCENE_INFO                           = 9,
-        FULL_FRAME_SNAPSHOT                  = 15,
-        PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
-        PROGRESSIVE_REFINEMENT_SEGMENT_END   = 17,
-        FILM_GRAIN_CHARACTERISTICS           = 19,
-        POST_FILTER_HINT                     = 22,
-        TONE_MAPPING_INFO                    = 23,
-        FRAME_PACKING                        = 45,
-        DISPLAY_ORIENTATION                  = 47,
-        SOP_DESCRIPTION                      = 128,
-        ACTIVE_PARAMETER_SETS                = 129,
-        DECODING_UNIT_INFO                   = 130,
-        TEMPORAL_LEVEL0_INDEX                = 131,
-        DECODED_PICTURE_HASH                 = 132,
-        SCALABLE_NESTING                     = 133,
-        REGION_REFRESH_INFO                  = 134,
-        MASTERING_DISPLAY_INFO               = 137,
-        CONTENT_LIGHT_LEVEL_INFO             = 144,
-    };
-
-    virtual PayloadType payloadType() const = 0;
+    virtual SEIPayloadType payloadType() const = 0;
 
     virtual void writeSEI(const SPS&) { X265_CHECK(0, "empty writeSEI method called\n");  }
 
@@ -86,11 +57,12 @@
 {
 public:
 
-    PayloadType payloadType() const { return USER_DATA_UNREGISTERED; }
+    SEIPayloadType payloadType() const { return m_payloadType; }
 
     SEIuserDataUnregistered() : m_userData(NULL) {}
 
     static const uint8_t m_uuid_iso_iec_11578[16];
+    SEIPayloadType m_payloadType;
     uint32_t m_userDataLength;
     uint8_t *m_userData;
 
@@ -98,7 +70,7 @@
     {
         m_bitIf = &bs;
 
-        WRITE_CODE(USER_DATA_UNREGISTERED, 8, "payload_type");
+        WRITE_CODE(m_payloadType, 8, "payload_type");
 
         uint32_t payloadSize = 16 + m_userDataLength;
         for (; payloadSize >= 0xff; payloadSize -= 0xff)
@@ -123,7 +95,7 @@
     uint32_t maxDisplayMasteringLuminance;
     uint32_t minDisplayMasteringLuminance;
 
-    PayloadType payloadType() const { return MASTERING_DISPLAY_INFO; }
+    SEIPayloadType payloadType() const { return MASTERING_DISPLAY_INFO; }
 
     bool parse(const char* value)
     {
@@ -161,7 +133,7 @@
     uint16_t max_content_light_level;
     uint16_t max_pic_average_light_level;
 
-    PayloadType payloadType() const { return CONTENT_LIGHT_LEVEL_INFO; }
+    SEIPayloadType payloadType() const { return CONTENT_LIGHT_LEVEL_INFO; }
 
     void write(Bitstream& bs, const SPS&)
     {
@@ -178,7 +150,7 @@
 {
 public:
 
-    PayloadType payloadType() const { return DECODED_PICTURE_HASH; }
+    SEIPayloadType payloadType() const { return DECODED_PICTURE_HASH; }
 
     enum Method
     {
@@ -238,7 +210,7 @@
 {
 public:
 
-    PayloadType payloadType() const { return ACTIVE_PARAMETER_SETS; }
+    SEIPayloadType payloadType() const { return ACTIVE_PARAMETER_SETS; }
 
     bool m_selfContainedCvsFlag;
     bool m_noParamSetUpdateFlag;
@@ -258,7 +230,7 @@
 {
 public:
 
-    PayloadType payloadType() const { return BUFFERING_PERIOD; }
+    SEIPayloadType payloadType() const { return BUFFERING_PERIOD; }
 
     SEIBufferingPeriod()
         : m_cpbDelayOffset(0)
@@ -292,7 +264,7 @@
 {
 public:
 
-    PayloadType payloadType() const { return PICTURE_TIMING; }
+    SEIPayloadType payloadType() const { return PICTURE_TIMING; }
 
     uint32_t  m_picStruct;
     uint32_t  m_sourceScanType;
@@ -327,7 +299,7 @@
 {
 public:
 
-    PayloadType payloadType() const { return RECOVERY_POINT; }
+    SEIPayloadType payloadType() const { return RECOVERY_POINT; }
 
     int  m_recoveryPocCnt;
     bool m_exactMatchingFlag;

x265_2.0.tar.gz/source/encoder/slicetype.cpp -> x265_2.1.tar.gz/source/encoder/slicetype.cpp Changed

@@ -56,22 +56,36 @@
 }
 
 /* Find the energy of each block in Y/Cb/Cr plane */
-inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int plane, int colorFormat)
+inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int plane, int colorFormat, uint32_t qgSize)
 {
     if ((colorFormat != X265_CSP_I444) && plane)
     {
-        ALIGN_VAR_8(pixel, pix[8 * 8]);
-        primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
-        return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, plane);
+        if (qgSize == 8)
+        {
+            ALIGN_VAR_4(pixel, pix[4 * 4]);
+            primitives.cu[BLOCK_4x4].copy_pp(pix, 4, src, srcStride);
+            return acEnergyVar(curFrame, primitives.cu[BLOCK_4x4].var(pix, 4), 4, plane);
+        }
+        else
+        {
+            ALIGN_VAR_8(pixel, pix[8 * 8]);
+            primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
+            return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, plane);
+        }
     }
     else
-        return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, plane);
+    {
+        if (qgSize == 8)
+            return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(src, srcStride), 6, plane);
+        else
+            return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, plane);
+    }
 }
 
 } // end anonymous namespace
 
 /* Find the total AC energy of each block in all planes */
-uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp)
+uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize)
 {
     intptr_t stride = curFrame->m_fencPic->m_stride;
     intptr_t cStride = curFrame->m_fencPic->m_strideC;
@@ -82,11 +96,11 @@
 
     uint32_t var;
 
-    var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
+    var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp, qgSize);
     if (csp != X265_CSP_I400 && curFrame->m_fencPic->m_picCsp != X265_CSP_I400)
     {
-        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
-        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp, qgSize);
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp, qgSize);
     }
     x265_emms();
     return var;
@@ -97,7 +111,22 @@
     /* Actual adaptive quantization */
     int maxCol = curFrame->m_fencPic->m_picWidth;
     int maxRow = curFrame->m_fencPic->m_picHeight;
-    int blockCount = curFrame->m_lowres.maxBlocksInRow * curFrame->m_lowres.maxBlocksInCol;
+    int blockCount, loopIncr;
+    float modeOneConst, modeTwoConst;
+    if (param->rc.qgSize == 8)
+    {
+        blockCount = curFrame->m_lowres.maxBlocksInRowFullRes * curFrame->m_lowres.maxBlocksInColFullRes;
+        modeOneConst = 11.427f;
+        modeTwoConst = 8.f;
+        loopIncr = 8;
+    }
+    else
+    {
+        blockCount = widthInCU * heightInCU;
+        modeOneConst = 14.427f;
+        modeTwoConst = 11.f;
+        loopIncr = 16;
+    }
 
     float* quantOffsets = curFrame->m_quantOffsets;
     for (int y = 0; y < 3; y++)
@@ -106,14 +135,14 @@
         curFrame->m_lowres.wp_sum[y] = 0;
     }
 
-    /* Calculate Qp offset for each 16x16 block in the frame */
+    /* Calculate Qp offset for each 16x16 or 8x8 block in the frame */
     int blockXY = 0;
     int blockX = 0, blockY = 0;
     double strength = 0.f;
     if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0)
     {
         /* Need to init it anyways for CU tree */
-        int cuCount = widthInCU * heightInCU;
+        int cuCount = blockCount;
 
         if (param->rc.aqMode && param->rc.aqStrength == 0)
         {
@@ -137,9 +166,9 @@
         /* Need variance data for weighted prediction */
         if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
         {
-            for (blockY = 0; blockY < maxRow; blockY += 16)
-                for (blockX = 0; blockX < maxCol; blockX += 16)
-                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+            for (blockY = 0; blockY < maxRow; blockY += loopIncr)
+                for (blockX = 0; blockX < maxCol; blockX += loopIncr)
+                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
         }
     }
     else
@@ -152,12 +181,12 @@
             double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
             curFrame->m_lowres.frameVariance = 0;
             uint64_t rowVariance = 0;
-            for (blockY = 0; blockY < maxRow; blockY += 16)
+            for (blockY = 0; blockY < maxRow; blockY += loopIncr)
             {
                 rowVariance = 0;
-                for (blockX = 0; blockX < maxCol; blockX += 16)
+                for (blockX = 0; blockX < maxCol; blockX += loopIncr)
                 {
-                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
                     curFrame->m_lowres.blockVariance[blockXY] = energy;
                     rowVariance += energy;
                     qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
@@ -172,21 +201,21 @@
             avg_adj /= blockCount;
             avg_adj_pow2 /= blockCount;
             strength = param->rc.aqStrength * avg_adj;
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f)) / avg_adj;
+            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (modeTwoConst)) / avg_adj;
             bias_strength = param->rc.aqStrength;
         }
         else
             strength = param->rc.aqStrength * 1.0397f;
 
         blockXY = 0;
-        for (blockY = 0; blockY < maxRow; blockY += 16)
+        for (blockY = 0; blockY < maxRow; blockY += loopIncr)
         {
-            for (blockX = 0; blockX < maxCol; blockX += 16)
+            for (blockX = 0; blockX < maxCol; blockX += loopIncr)
             {
                 if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
                 {
                     qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
-                    qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - 11.f / (qp_adj * qp_adj));
+                    qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - modeTwoConst / (qp_adj * qp_adj));
                 }
                 else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
                 {
@@ -195,8 +224,8 @@
                 }
                 else
                 {
-                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
-                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
+                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp,param->rc.qgSize);
+                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
                 }
                 if (quantOffsets != NULL)
                     qp_adj += quantOffsets[blockXY];
@@ -208,6 +237,21 @@
         }
     }
 
+    if (param->rc.qgSize == 8)
+    {
+        for (int cuY = 0; cuY < heightInCU; cuY++)
+        {
+            for (int cuX = 0; cuX < widthInCU; cuX++)
+            {
+                const int cuXY = cuX + cuY * widthInCU;
+                curFrame->m_lowres.invQscaleFactor8x8[cuXY] = (curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
+                                                               curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
+                                                               curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
+                                                               curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes + 1]) / 4;
+            }
+        }
+    }
+
     if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
     {
         int hShift = CHROMA_H_SHIFT(param->internalCsp);
@@ -227,7 +271,7 @@
     }
 }
 
-void LookaheadTLD::lowresIntraEstimate(Lowres& fenc)
+void LookaheadTLD::lowresIntraEstimate(Lowres& fenc, uint32_t qgSize)
 {
     ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
     pixel fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
@@ -314,12 +358,15 @@
             fenc.lowresCosts[0][0][cuXY] = (uint16_t)(X265_MIN(icost, LOWRES_COST_MASK) | (0 << LOWRES_COST_SHIFT));
             fenc.intraCost[cuXY] = icost;
             fenc.intraMode[cuXY] = (uint8_t)ilowmode;

x265_2.0.tar.gz/source/encoder/slicetype.h -> x265_2.1.tar.gz/source/encoder/slicetype.h Changed

x265_2.0.tar.gz/source/test/regression-tests.txt -> x265_2.1.tar.gz/source/test/regression-tests.txt Changed

@@ -49,6 +49,7 @@
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
+FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8
 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
 FourPeople_1280x720_60.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 FourPeople_1280x720_60.y4m,--preset veryslow --numa-pools "none"
@@ -90,7 +91,7 @@
 big_buck_bunny_360p24.y4m,--preset veryfast --no-deblock
 big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
 big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
-big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra --aq-mode 3
+big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra --aq-mode 3 --qg-size 8
 city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2
 city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
 city_4cif_60fps.y4m,--preset slower --scaling-list default
@@ -128,7 +129,7 @@
 washdc_422_ntsc.y4m,--preset veryfast --tu-inter-depth 4
 washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
 vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5
-washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1 --aq-mode 2
+washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1 --aq-mode 2 --qg-size 8
 vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
 washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
 washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3 --limit-modes

x265_2.0.tar.gz/source/x265-extras.cpp -> x265_2.1.tar.gz/source/x265-extras.cpp Changed

x265_2.0.tar.gz/source/x265.h -> x265_2.1.tar.gz/source/x265.h Changed

@@ -150,6 +150,55 @@
     x265_cu_stats    cuStats;
 } x265_frame_stats;
 
+/* Arbitrary User SEI
+ * Payload size is in bytes and the payload pointer must be non-NULL. 
+ * Payload types and syntax can be found in Annex D of the H.265 Specification.
+ * SEI Payload Alignment bits as described in Annex D must be included at the 
+ * end of the payload if needed. The payload should not be NAL-encapsulated.
+ * Payloads are written in the order of input */
+
+typedef enum
+{
+    BUFFERING_PERIOD                     = 0,
+    PICTURE_TIMING                       = 1,
+    PAN_SCAN_RECT                        = 2,
+    FILLER_PAYLOAD                       = 3,
+    USER_DATA_REGISTERED_ITU_T_T35       = 4,
+    USER_DATA_UNREGISTERED               = 5,
+    RECOVERY_POINT                       = 6,
+    SCENE_INFO                           = 9,
+    FULL_FRAME_SNAPSHOT                  = 15,
+    PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
+    PROGRESSIVE_REFINEMENT_SEGMENT_END   = 17,
+    FILM_GRAIN_CHARACTERISTICS           = 19,
+    POST_FILTER_HINT                     = 22,
+    TONE_MAPPING_INFO                    = 23,
+    FRAME_PACKING                        = 45,
+    DISPLAY_ORIENTATION                  = 47,
+    SOP_DESCRIPTION                      = 128,
+    ACTIVE_PARAMETER_SETS                = 129,
+    DECODING_UNIT_INFO                   = 130,
+    TEMPORAL_LEVEL0_INDEX                = 131,
+    DECODED_PICTURE_HASH                 = 132,
+    SCALABLE_NESTING                     = 133,
+    REGION_REFRESH_INFO                  = 134,
+    MASTERING_DISPLAY_INFO               = 137,
+    CONTENT_LIGHT_LEVEL_INFO             = 144,
+} SEIPayloadType;
+
+typedef struct x265_sei_payload
+{
+    int payloadSize;
+    SEIPayloadType payloadType;
+    uint8_t* payload;
+} x265_sei_payload;
+
+typedef struct x265_sei
+{
+    int numPayloads;
+    x265_sei_payload *payloads;
+} x265_sei;
+
 /* Used to pass pictures into the encoder, and to get picture data back out of
  * the encoder.  The input and output semantics are different */
 typedef struct x265_picture
@@ -214,13 +263,16 @@
     /* An array of quantizer offsets to be applied to this image during encoding.
      * These are added on top of the decisions made by rateControl.
      * Adaptive quantization must be enabled to use this feature. These quantizer
-     * offsets should be given for each 16x16 block. Behavior if quant
-     * offsets differ between encoding passes is undefined. */
+     * offsets should be given for each 16x16 block (8x8 block, when qg-size is 8).
+     * Behavior if quant offsets differ between encoding passes is undefined. */
     float            *quantOffsets;
 
     /* Frame level statistics */
     x265_frame_stats frameData;
 
+    /* User defined SEI */
+    x265_sei         userSEI;
+
     /* Ratecontrol statistics for collecting the ratecontrol information.
      * It is not used for collecting the last pass ratecontrol data in 
      * multi pass ratecontrol mode. */
@@ -883,6 +935,9 @@
     /* Enable weighted prediction in B slices. Default is disabled */
     int       bEnableWeightedBiPred;
 
+    /* Enable source pixels in motion estimation. Default is disabled */
+    int      bSourceReferenceEstimation;
+
     /*== Loop Filters ==*/
 
     /* Enable the deblocking loop filter, which improves visual quality by
@@ -1103,12 +1158,18 @@
 
         /* Enable adaptive quantization at CU granularity. This parameter specifies
          * the minimum CU size at which QP can be adjusted, i.e. Quantization Group
-         * (QG) size. Allowed values are 64, 32, 16 provided it falls within the
+         * (QG) size. Allowed values are 64, 32, 16, 8 provided it falls within the
          * inclusuve range [maxCUSize, minCUSize]. Experimental, default: maxCUSize */
         uint32_t qgSize;
 
         /* internally enable if tune grain is set */
         int      bEnableGrain;
+
+        /* sets a hard upper limit on QP */
+        int      qpMax;
+
+        /* sets a hard lower limit on QP */
+        int      qpMin;
     } rc;
 
     /*== Video Usability Information ==*/
@@ -1236,6 +1297,18 @@
      * value to that value. */
     uint16_t maxLuma;
 
+    /* Maximum of the picture order count */
+    int log2MaxPocLsb;
+
+    /* Discard SEI messages when printing */
+    int bDiscardSEI;
+    
+    /* Control removing optional vui information (timing, HRD info) to get low bitrate */
+    int       bDiscardOptionalVUI;
+
+    /* Maximum count of Slices of picture, the value range is [1, maximum rows] */
+    unsigned int maxSlices;
+
 } x265_param;
 
 /* x265_param_alloc:

x265_2.0.tar.gz/source/x265cli.h -> x265_2.1.tar.gz/source/x265cli.h Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -152,6 +153,8 @@
     { "pbratio",        required_argument, NULL, 0 },
     { "qcomp",          required_argument, NULL, 0 },
     { "qpstep",         required_argument, NULL, 0 },
+    { "qpmin",          required_argument, NULL, 0 },
+    { "qpmax",          required_argument, NULL, 0 },
     { "ratetol",        required_argument, NULL, 0 },
     { "cplxblur",       required_argument, NULL, 0 },
     { "qblur",          required_argument, NULL, 0 },
@@ -204,6 +207,9 @@
     { "max-cll",        required_argument, NULL, 0 },
     { "min-luma",       required_argument, NULL, 0 },
     { "max-luma",       required_argument, NULL, 0 },
+    { "log2-max-poc-lsb", required_argument, NULL, 8 },
+    { "discard-sei",          no_argument, NULL, 0 },
+    { "discard-vui",          no_argument, NULL, 0 },
     { "no-dither",            no_argument, NULL, 0 },
     { "dither",               no_argument, NULL, 0 },
     { "no-repeat-headers",    no_argument, NULL, 0 },
@@ -230,6 +236,9 @@
     { "no-temporal-layers",   no_argument, NULL, 0 },
     { "qg-size",        required_argument, NULL, 0 },
     { "recon-y4m-exec", required_argument, NULL, 0 },
+    { "analyze-src-pics", no_argument, NULL, 0 },
+    { "no-analyze-src-pics", no_argument, NULL, 0 },
+    { "slices",         required_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -293,6 +302,7 @@
     H0("                                 '-' implies no threads on node, '+' implies one thread per core on node\n");
     H0("-F/--frame-threads <integer>     Number of concurrently encoded frames. 0: auto-determined by core count\n");
     H0("   --[no-]wpp                    Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
+    H0("   --[no-]slices <integer>       Enable Multiple Slices feature. Default %d\n", param->maxSlices);
     H0("   --[no-]pmode                  Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));
     H0("   --[no-]pme                    Parallel motion estimation. Default %s\n", OPT(param->bDistributeMotionEstimation));
     H0("   --[no-]asm <bool|int|string>  Override CPU detection. Default: auto\n");
@@ -375,19 +385,22 @@
        "                                   - 2 : Last pass, does not overwrite stats file\n"
        "                                   - 3 : Nth pass, overwrites stats file\n");
     H0("   --stats                       Filename for stats file in multipass pass rate control. Default x265_2pass.log\n");
+    H0("   --[no-]analyze-src-pics       Motion estimation uses source frame planes. Default disable\n");
     H0("   --[no-]slow-firstpass         Enable a slow first pass in a multipass rate control mode. Default %s\n", OPT(param->rc.bEnableSlowFirstPass));
     H0("   --[no-]strict-cbr             Enable stricter conditions and tolerance for bitrate deviations in CBR mode. Default %s\n", OPT(param->rc.bStrictCbr));
     H0("   --analysis-mode <string|int>  save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisMode);
     H0("   --analysis-file <filename>    Specify file name used for either dumping or reading analysis data.\n");
     H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes. Default %d\n", param->rc.aqMode);
     H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
-    H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16). Default %d\n", param->rc.qgSize);
+    H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
     H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
     H0("   --[no-]rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
     H1("   --ipratio <float>             QP factor between I and P. Default %.2f\n", param->rc.ipFactor);
     H1("   --pbratio <float>             QP factor between P and B. Default %.2f\n", param->rc.pbFactor);
     H1("   --qcomp <float>               Weight given to predicted complexity. Default %.2f\n", param->rc.qCompress);
     H1("   --qpstep <integer>            The maximum single adjustment in QP allowed to rate control. Default %d\n", param->rc.qpStep);
+    H1("   --qpmin <integer>             sets a hard lower limit on QP allowed to ratecontrol. Default %d\n", param->rc.qpMin);
+    H1("   --qpmax <integer>             sets a hard upper limit on QP allowed to ratecontrol. Default %d\n", param->rc.qpMax);
     H1("   --cbqpoffs <integer>          Chroma Cb QP Offset [-12..12]. Default %d\n", param->cbQpOffset);
     H1("   --crqpoffs <integer>          Chroma Cr QP Offset [-12..12]. Default %d\n", param->crQpOffset);
     H1("   --scaling-list <string>       Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n");
@@ -434,6 +447,9 @@
     H0("   --[no-]temporal-layers        Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
     H0("   --[no-]aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
     H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
+    H0("   --log2-max-poc-lsb <integer>  Maximum of the picture order count\n");
+    H0("   --discard-sei                 Discard SEI packets in bitstream. Default %s\n", OPT(param->bDiscardSEI));
+    H0("   --discard-vui                 Discard optional VUI information from the bistream. Default %s\n", OPT(param->bDiscardOptionalVUI));
     H1("\nReconstructed video options (debugging):\n");
     H1("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
     H1("   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");

Changes of Revision 14