Packman Build Service PMBS

x265.changes Changed

@@ -1,4 +1,30 @@
 -------------------------------------------------------------------
+Wed Feb  3 13:22:42 UTC 2016 - idonmez@suse.com
+
+- Update to version 1.9
+  API Changes:
+  * x265_frame_stats returns many additional fields: maxCLL, maxFALL,
+    residual energy, scenecut and latency logging
+  * --qpfile now supports frametype 'K"
+  * x265 now allows CRF ratecontrol in pass N (N greater than or equal to 2)
+  * Chroma subsampling format YUV 4:0:0 is now fully supported and tested
+  New Features:
+  * Quant offsets: This feature allows block level quantization offsets
+    to be specified for every frame. An API-only feature.
+  * --intra-refresh: Keyframes can be replaced by a moving column
+    of intra blocks in non-keyframes.
+  * --limit-modes: Intelligently restricts mode analysis.
+  * --max-luma and --min-luma for luma clipping, optional for HDR use-cases
+  * Emergency denoising is now enabled by default in very low bitrate, 
+    VBV encodes
+  Presets and Performance:
+  * Recently added features lookahead-slices, limit-modes, limit-refs
+    have been enabled by default for applicable presets.
+  * The default psy-rd strength has been increased to 2.0
+  * Multi-socket machines now use a single pool of threads that can
+    work cross-socket.
+
+-------------------------------------------------------------------
 Fri Nov 27 18:21:04 UTC 2015 - aloisio@gmx.com
 
 - Update to version 1.8:

x265.spec Changed

@@ -1,10 +1,10 @@
 # based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
 
 Name:           x265
-%define soname  68
+%define soname  79
 %define libname lib%{name}
 %define libsoname %{libname}-%{soname}
-Version:        1.8
+Version:        1.9
 Release:        0
 License:        GPL-2.0+
 Summary:        A free h265/HEVC encoder - encoder binary
@@ -43,35 +43,34 @@
 streams. 
 
 %prep
-%setup -q -n "%{name}_11047/build/linux"
-cd ../..
+%setup -q -n x265_%{version}
 %patch0 -p1
-cd -
+
 %define FAKE_BUILDDATE %(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y')
-sed -i -e "s/0.0/%{soname}.0/g" ../../source/cmake/version.cmake
+sed -i -e "s/0.0/%{soname}.0/g" source/cmake/version.cmake
 
 
 %build
-export CXXFLAGS="%optflags"
-export CFLAGS="%optflags"
-cmake  -DCMAKE_INSTALL_PREFIX=/usr -DENABLE_TESTS=ON -G "Unix Makefiles" ../../source
-cmake -DCMAKE_INSTALL_PREFIX=/usr ../../source
-#./make-Makefiles.bash
+export CXXFLAGS="%{optflags}"
+export CFLAGS="%{optflags}"
+
+cd build/linux
+cmake  -DCMAKE_INSTALL_PREFIX=%{_prefix} \
+       -DLIB_INSTALL_DIR=%{_lib} \
+       -DENABLE_TESTS=ON \
+       -G "Unix Makefiles" \
+       ../../source
+
 make %{?_smp_mflags} VERBOSE=1
 
 %install
+cd build/linux
 %makeinstall
-%ifarch x86_64
-  mv "%{buildroot}/usr/lib" "%{buildroot}%{_libdir}"
-%endif
 
 rm -f %{buildroot}%{_libdir}/%{libname}.a
 
 echo "%{libname}-%{soname}" > %{_sourcedir}/baselibs.conf
 
-%clean
-%{?buildroot:%__rm -rf "%{buildroot}"}
-
 %post -n %{libsoname} -p /sbin/ldconfig
 %postun -n %{libsoname} -p /sbin/ldconfig

x265_1.8.tar.gz/.hg_archival.txt -> x265_1.9.tar.gz/.hg_archival.txt Changed

x265_1.8.tar.gz/doc/reST/cli.rst -> x265_1.9.tar.gz/doc/reST/cli.rst Changed

@@ -84,8 +84,8 @@
 	it adds one line per run. If :option:`--csv-log-level` is greater than
 	0, it writes one line per frame. Default none
 
-	When frame level logging is enabled, several frame performance
-	statistics are listed:
+	Several frame performance statistics are available when 
+	:option:`--csv-log-level` is greater than or equal to 2:
 
 	**DecideWait ms** number of milliseconds the frame encoder had to
 	wait, since the previous frame was retrieved by the API thread,
@@ -202,15 +202,29 @@
 	"-"       - same as "none"
 	"10"      - allocate one pool, using up to 10 cores on node 0
 	"-,+"     - allocate one pool, using all cores on node 1
-	"+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
-	"+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
-	"-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
+	"+,-,+"   - allocate one pool, using only cores on nodes 0 and 2
+	"+,-,+,-" - allocate one pool, using only cores on nodes 0 and 2
+	"-,*"     - allocate one pool, using all cores on nodes 1, 2 and 3
 	"8,8,8,8" - allocate four pools with up to 8 threads in each pool
-
-	The total number of threads will be determined by the number of threads
-	assigned to all nodes. The worker threads will each be given affinity for
-	their node, they will not be allowed to migrate between nodes, but they
-	will be allowed to move between CPU cores within their node.
+	"8,+,+,+" - allocate two pools, the first with 8 threads on node 0, and the second with all cores on node 1,2,3
+
+	A thread pool dedicated to a given NUMA node is enabled only when the
+	number of threads to be created on that NUMA node is explicitly mentioned
+	in that corresponding position with the --pools option. Else, all threads
+	are spawned from a single pool. The total number of threads will be
+	determined by the number of threads assigned to the enabled NUMA nodes for
+	that pool. The worker threads are be given affinity to all the enabled
+	NUMA nodes for that pool and may migrate between them, unless explicitly
+	specified as described above.
+
+	In the case that any threadpool has more than 64 threads, the threadpool
+	may be broken down into multiple pools of 64 threads each; on 32-bit
+	machines, this number is 32. All pools are given affinity to the NUMA
+	nodes on which the original pool had affinity. For performance reasons,
+	the last thread pool is spawned only if it has more than 32 threads for
+	64-bit machines, or 16 for 32-bit machines. If the total number of threads
+	in the system doesn't obey this constraint, we may spawn fewer threads
+	than cores which has been emperically shown to be better for performance. 
 
 	If the four pool features: :option:`--wpp`, :option:`--pmode`,
 	:option:`--pme` and :option:`--lookahead-slices` are all disabled,
@@ -219,10 +233,6 @@
 	If "none" is specified, then all four of the thread pool features are
 	implicitly disabled.
 
-	Multiple thread pools will be allocated for any NUMA node with more than
-	64 logical CPU cores. But any given thread pool will always use at most
-	one NUMA node.
-
 	Frame encoders are distributed between the available thread pools,
 	and the encoder will never generate more thread pools than
 	:option:`--frame-threads`.  The pools are used for WPP and for
@@ -238,8 +248,12 @@
 	system, a POSIX build of libx265 without libnuma will be less work
 	efficient. See :ref:`thread pools <pools>` for more detail.
 
-	Default "", one thread is allocated per detected hardware thread
-	(logical CPU cores) and one thread pool per NUMA node.
+	Default "", one pool is created across all available NUMA nodes, with
+	one thread allocated per detected hardware thread
+	(logical CPU cores). In the case that the total number of threads is more
+	than the maximum size that ATOMIC operations can handle (32 for 32-bit
+	compiles, and 64 for 64-bit compiles), multiple thread pools may be
+	spawned subject to the performance constraint described above.
 
 	Note that the string value will need to be escaped or quoted to
 	protect against shell expansion on many platforms
@@ -353,7 +367,7 @@
 
 	**CLI ONLY**
 
-.. option:: --total-frames <integer>
+.. option:: --frames <integer>
 
 	The number of frames intended to be encoded.  It may be left
 	unspecified, but when it is specified rate control can make use of
@@ -377,15 +391,15 @@
 
 .. option:: --input-csp <integer|string>
 
-	YUV only: Source color space. Only i420, i422, and i444 are
-	supported at this time. The internal color space is always the
-	same as the source color space (libx265 does not support any color
-	space conversions).
+	Chroma Subsampling (YUV only):  Only 4:0:0(monochrome), 4:2:0, 4:2:2, and 4:4:4 are supported at this time. 
+	The chroma subsampling format of your input must match your desired output chroma subsampling format 
+	(libx265 will not perform any chroma subsampling conversion), and it must be supported by the 
+	HEVC profile you have specified.
 
-	0. i400
-	1. i420 **(default)**
-	2. i422
-	3. i444
+	0. i400 (4:0:0 monochrome) - Not supported by Main or Main10 profiles
+	1. i420 (4:2:0 default)    - Supported by all HEVC profiles
+	2. i422 (4:2:2)            - Not supported by Main, Main10 and Main12 profiles
+	3. i444 (4:4:4)            - Supported by Main 4:4:4, Main 4:4:4 10, Main 4:4:4 12, Main 4:4:4 16 Intra profiles
 	4. nv12
 	5. nv16
 
@@ -436,8 +450,8 @@
 	depth of the encoder. If the requested bit depth is not the bit
 	depth of the linked libx265, it will attempt to bind libx265_main
 	for an 8bit encoder, libx265_main10 for a 10bit encoder, or
-	libx265_main12 for a 12bit encoder (EXPERIMENTAL), with the
-	same API version as the linked libx265.
+	libx265_main12 for a 12bit encoder, with the same API version as the
+	linked libx265.
 
 	If the output depth is not specified but :option:`--profile` is
 	specified, the output depth will be derived from the profile name.
@@ -486,13 +500,6 @@
 	The CLI application will derive the output bit depth from the
 	profile name if :option:`--output-depth` is not specified.
 
-.. note::
-
-	All 12bit presets are extremely unstable, do not use them yet.
-	16bit is not supported at all, but those profiles are included
-	because it is possible for libx265 to make bitstreams compatible
-	with them.
-
 .. option:: --level-idc <integer|float>
 
 	Minimum decoder requirement level. Defaults to 0, which implies
@@ -606,7 +613,8 @@
 	+-------+---------------------------------------------------------------+
 	| Level | Description                                                   |
 	+=======+===============================================================+
-	| 0     | sa8d mode and split decisions, intra w/ source pixels         |
+	| 0     | sa8d mode and split decisions, intra w/ source pixels,        |
+	|       | currently not supported                                       |
 	+-------+---------------------------------------------------------------+
 	| 1     | recon generated (better intra), RDO merge/skip selection      |
 	+-------+---------------------------------------------------------------+
@@ -677,7 +685,16 @@
 	(within your decoder level limits) if you enable one or
 	both of these flags.
 
-	This feature is EXPERIMENTAL and functional at all RD levels.
+	Default 3.
+
+.. option:: --limit-modes, --no-limit-modes
+    
+	When enabled, limit-modes will limit modes analyzed for each CU	using cost 
+	metrics from the 4 sub-CUs. When multiple inter modes like :option:`--rect`
+	and/or :option:`--amp` are enabled, this feature will use motion cost 
+	heuristics from the 4 sub-CUs to bypass modes that are unlikely to be the 
+	best choice. This can significantly improve performance when :option:`rect`
+	and/or :option:`--amp` are enabled at minimal compression efficiency loss.
 
 .. option:: --rect, --no-rect
 
@@ -1049,9 +1066,9 @@
 	energy of the source image in the encoded image at the expense of
 	compression efficiency. It only has effect on presets which use
 	RDO-based mode decisions (:option:`--rd` 3 and above). 1.0 is a
-	typical value. Default 0.3
+	typical value. Default 2.0
 
-	**Range of values:** 0 .. 2.0
+	**Range of values:** 0 .. 5.0
 
 .. option:: --psy-rdoq <float>
 
@@ -1076,7 +1093,8 @@
 
 	Max intra period in frames. A special case of infinite-gop (single
 	keyframe at the beginning of the stream) can be triggered with
-	argument -1. Use 1 to force all-intra. Default 250
+	argument -1. Use 1 to force all-intra. When intra-refresh is enabled
+	it specifies the interval between which refresh sweeps happen. Default 250
 
 .. option:: --min-keyint, -i <integer>
 
@@ -1095,6 +1113,14 @@
 	:option:`--scenecut` 0 or :option:`--no-scenecut` disables adaptive
 	I frame placement. Default 40
 
+.. option:: --intra-refresh
+
+	Enables Periodic Intra Refresh(PIR) instead of keyframe insertion.
+	PIR can replace keyframes by inserting a column of intra blocks in 
+	non-keyframes, that move across the video from one side to the other
+	and thereby refresh the image but over a period of multiple 
+	frames instead of a single keyframe.
+
 .. option:: --rc-lookahead <integer>
 
 	Number of frames for slice-type decision lookahead (a key
@@ -1108,21 +1134,31 @@

x265_1.8.tar.gz/doc/reST/presets.rst -> x265_1.9.tar.gz/doc/reST/presets.rst Changed

@@ -6,76 +6,83 @@
 Presets
 =======
 
-x265 has a number of predefined :option:`--preset` options that make
-trade-offs between encode speed (encoded frames per second) and
+x265 has ten predefined :option:`--preset` options that optimize the
+trade-off between encoding speed (encoded frames per second) and
 compression efficiency (quality per bit in the bitstream).  The default
-preset is medium, it does a reasonably good job of finding the best
-possible quality without spending enormous CPU cycles looking for the
-absolute most efficient way to achieve that quality.  As you go higher
-than medium, the encoder takes shortcuts to improve performance at the
-expense of quality and compression efficiency.  As you go lower than
-medium, the encoder tries harder and harder to achieve the best quailty
-per bit compression ratio.
-
-The presets adjust encoder parameters to affect these trade-offs.
-
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-|              | ultrafast | superfast | veryfast | faster | fast | medium | slow | slower | veryslow | placebo |
-+==============+===========+===========+==========+========+======+========+======+========+==========+=========+
-| ctu          |   32      |    32     |   32     |  64    |  64  |   64   |  64  |  64    |   64     |   64    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| min-cu-size  |   16      |     8     |    8     |   8    |   8  |    8   |   8  |   8    |    8     |    8    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| bframes      |    3      |     3     |    4     |   4    |  4   |    4   |  4   |   8    |    8     |    8    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| b-adapt      |    0      |     0     |    0     |   0    |  0   |    2   |  2   |   2    |    2     |    2    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rc-lookahead |    5      |    10     |   15     |  15    |  15  |   20   |  25  |   30   |   40     |   60    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| scenecut     |    0      |    40     |   40     |  40    |  40  |   40   |  40  |   40   |   40     |   40    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| refs         |    1      |     1     |    1     |   1    |  2   |    3   |  3   |   3    |    5     |    5    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| me           |   dia     |   hex     |   hex    |  hex   | hex  |   hex  | star |  star  |   star   |   star  |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| merange      |   57      |    57     |   57     |  57    |  57  |   57   | 57   |  57    |   57     |   92    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| subme        |    0      |     1     |    1     |   2    |  2   |    2   |  3   |   3    |    4     |    5    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rect         |    0      |     0     |    0     |   0    |  0   |    0   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| amp          |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| max-merge    |    2      |     2     |    2     |   2    |  2   |    2   |  3   |   3    |    4     |    5    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| early-skip   |    1      |     1     |    1     |   1    |  0   |    0   |  0   |   0    |    0     |    0    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| fast-intra   |    1      |     1     |    1     |   1    |  1   |    0   |  0   |   0    |    0     |    0    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| b-intra      |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| sao          |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| signhide     |    0      |     1     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| weightp      |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| weightb      |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| aq-mode      |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| cuTree       |    0      |     0     |    0     |   0    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rdLevel      |    2      |     2     |    2     |   2    |  2   |    3   |  4   |   6    |    6     |    6    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rdoq-level   |    0      |     0     |    0     |   0    |  0   |    0   |  2   |   2    |    2     |    2    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| tu-intra     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| tu-inter     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-
-Placebo mode enables transform-skip prediction evaluation.
+preset is medium.  It does a reasonably good job of finding the best
+possible quality without spending excessive CPU cycles looking for the
+absolute most efficient way to achieve that quality.  When you use 
+faster presets, the encoder takes shortcuts to improve performance at 
+the expense of quality and compression efficiency.  When you use slower
+presets, x265 tests more encoding options, using more computations to  
+achieve the best quality at your selected bit rate (or in the case of
+--crf rate control, the lowest bit rate at the selected quality).
+
+The presets adjust encoder parameters as shown in the following table.
+Any parameters below that are specified in your command-line will be 
+changed from the value specified by the preset.
+
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+|                 |ultrafast |superfast |veryfast |faster |fast |medium |slow |slower |veryslow |placebo |
++=================+==========+==========+=========+=======+=====+=======+=====+=======+=========+========+
+| ctu             |    32    |    32    |   64    |  64   | 64  |  64   | 64  |  64   |   64    |  64    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| min-cu-size     |    16    |     8    |    8    |   8   |  8  |   8   |  8  |   8   |    8    |   8    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| bframes         |     3    |     3    |    4    |   4   |  4  |   4   |  4  |   8   |    8    |   8    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| b-adapt         |     0    |     0    |    0    |   0   |  0  |   2   |  2  |   2   |    2    |   2    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rc-lookahead    |     5    |    10    |   15    |  15   | 15  |  20   | 25  |  30   |   40    |  60    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| lookahead-slices|     8    |     8    |    8    |   8   |  8  |   8   |  4  |   4   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| scenecut        |     0    |    40    |   40    |  40   | 40  |  40   | 40  |  40   |   40    |  40    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| ref             |     1    |     1    |    2    |   2   |  3  |   3   |  4  |   4   |    5    |   5    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| limit-refs      |     0    |     0    |    3    |   3   |  3  |   3   |  3  |   2   |    1    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| me              |    dia   |   hex    |   hex   |  hex  |hex  |  hex  |star | star  |   star  |  star  |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| merange         |    57    |    57    |   57    |  57   | 57  |  57   | 57  |  57   |   57    |  92    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| subme           |     0    |     1    |    1    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rect            |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| amp             |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| limit-modes     |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| max-merge       |     2    |     2    |    2    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| early-skip      |     1    |     1    |    1    |   1   |  0  |   0   |  0  |   0   |    0    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| fast-intra      |     1    |     1    |    1    |   1   |  1  |   0   |  0  |   0   |    0    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| b-intra         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| sao             |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| signhide        |     0    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| weightp         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| weightb         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| aq-mode         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| cuTree          |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rdLevel         |     2    |     2    |    2    |   2   |  2  |   3   |  4  |   6   |    6    |   6    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rdoq-level      |     0    |     0    |    0    |   0   |  0  |   0   |  2  |   2   |    2    |   2    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| tu-intra        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| tu-inter        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
 
 .. _tunings:

x265_1.8.tar.gz/source/CMakeLists.txt -> x265_1.9.tar.gz/source/CMakeLists.txt Changed

@@ -30,7 +30,7 @@
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 68)
+set(X265_BUILD 79)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -45,12 +45,14 @@
 set(POWER_ALIASES ppc64 ppc64le)
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
-    message(STATUS "Detected x86 target processor")
     set(X86 1)
     add_definitions(-DX265_ARCH_X86=1)
     if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
         set(X64 1)
         add_definitions(-DX86_64=1)
+        message(STATUS "Detected x86_64 target processor")
+    else()
+        message(STATUS "Detected x86 target processor")
     endif()
 elseif(POWERMATCH GREATER "-1")
     message(STATUS "Detected POWER target processor")
@@ -71,23 +73,27 @@
     if(LIBRT)
         list(APPEND PLATFORM_LIBS rt)
     endif()
+    mark_as_advanced(LIBRT)
     find_library(LIBDL dl)
     if(LIBDL)
         list(APPEND PLATFORM_LIBS dl)
     endif()
-    find_package(Numa)
-    if(NUMA_FOUND)
-        link_directories(${NUMA_LIBRARY_DIR})
-        list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
-        check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
-        if(NUMA_V2)
-            add_definitions(-DHAVE_LIBNUMA)
-            message(STATUS "libnuma found, building with support for NUMA nodes")
-            list(APPEND PLATFORM_LIBS numa)
-            include_directories(${NUMA_INCLUDE_DIR})
+    option(ENABLE_LIBNUMA "Enable libnuma usage (Linux only)" ON)
+    if(ENABLE_LIBNUMA)
+        find_package(Numa)
+        if(NUMA_FOUND)
+            link_directories(${NUMA_LIBRARY_DIR})
+            list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
+            check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
+            if(NUMA_V2)
+                add_definitions(-DHAVE_LIBNUMA)
+                message(STATUS "libnuma found, building with support for NUMA nodes")
+                list(APPEND PLATFORM_LIBS numa)
+                include_directories(${NUMA_INCLUDE_DIR})
+            endif()
         endif()
-    endif()
-    mark_as_advanced(LIBRT NUMA_FOUND)
+        mark_as_advanced(NUMA_FOUND)
+    endif(ENABLE_LIBNUMA)
     option(NO_ATOMICS "Use a slow mutex to replace atomics" OFF)
     if(NO_ATOMICS)
         add_definitions(-DNO_ATOMICS=1)
@@ -157,6 +163,7 @@
 if(GCC)
     add_definitions(-Wall -Wextra -Wshadow)
     add_definitions(-D__STDC_LIMIT_MACROS=1)
+    add_definitions(-std=gnu++98)
     if(ENABLE_PIC)
          add_definitions(-fPIC)
     endif(ENABLE_PIC)
@@ -379,16 +386,19 @@
 
 option(ENABLE_VTUNE "Enable Vtune profiling instrumentation" OFF)
 if(ENABLE_VTUNE)
-    add_definitions(-DENABLE_VTUNE)
-    include_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/include)
-    list(APPEND PLATFORM_LIBS vtune)
-    link_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/lib64)
-    if(WIN32)
-        list(APPEND PLATFORM_LIBS libittnotify.lib)
-    else()
-        list(APPEND PLATFORM_LIBS libittnotify.a dl)
-    endif()
-    add_subdirectory(profile/vtune)
+    find_package(Vtune)
+    if(VTUNE_FOUND)
+        add_definitions(-DENABLE_VTUNE)
+        include_directories(${VTUNE_INCLUDE_DIR})
+        list(APPEND PLATFORM_LIBS vtune)
+        link_directories(${VTUNE_LIBRARY_DIR})
+        if(WIN32)
+            list(APPEND PLATFORM_LIBS libittnotify.lib)
+        else()
+            list(APPEND PLATFORM_LIBS libittnotify.a dl)
+        endif()
+        add_subdirectory(profile/vtune)
+    endif(VTUNE_FOUND)
 endif(ENABLE_VTUNE)
 
 option(DETAILED_CU_STATS "Enable internal profiling of encoder work" OFF)
@@ -455,6 +465,9 @@
 if(ENABLE_SHARED)
     add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${YASM_OBJS}
                 ${X265_RC_FILE} $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>)
+    if(EXTRA_LIB)
+        target_link_libraries(x265-shared ${EXTRA_LIB})
+    endif()
     target_link_libraries(x265-shared ${PLATFORM_LIBS})
     if(MSVC)
         set_target_properties(x265-shared PROPERTIES OUTPUT_NAME libx265)
@@ -465,6 +478,8 @@
         set_target_properties(x265-shared PROPERTIES VERSION ${X265_BUILD})
         if(APPLE)
             set_target_properties(x265-shared PROPERTIES MACOSX_RPATH 1)
+        elseif(CYGWIN)
+            # Cygwin is not officially supported or tested. MinGW with msys is recommended.
         else()
             list(APPEND LINKER_OPTIONS "-Wl,-Bsymbolic,-znoexecstack")
         endif()
@@ -480,9 +495,6 @@
                 ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
                 RUNTIME DESTINATION ${BIN_INSTALL_DIR})
     endif()
-    if(EXTRA_LIB)
-        target_link_libraries(x265-shared ${EXTRA_LIB})
-    endif()
     if(LINKER_OPTIONS)
         # set_target_properties can't do list expansion
         string(REPLACE ";" " " LINKER_OPTION_STR "${LINKER_OPTIONS}")

x265_1.9.tar.gz/source/cmake/FindVtune.cmake Added

x265_1.8.tar.gz/source/common/bitstream.cpp -> x265_1.9.tar.gz/source/common/bitstream.cpp Changed

x265_1.8.tar.gz/source/common/bitstream.h -> x265_1.9.tar.gz/source/common/bitstream.h Changed

x265_1.8.tar.gz/source/common/common.h -> x265_1.9.tar.gz/source/common/common.h Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -134,10 +135,10 @@
 typedef int32_t  ssum2_t; // Signed sum
 #endif // if HIGH_BIT_DEPTH
 
-#if X265_DEPTH <= 10
-typedef uint32_t sse_ret_t;
+#if X265_DEPTH < 10
+typedef uint32_t sse_t;
 #else
-typedef uint64_t sse_ret_t;
+typedef uint64_t sse_t;
 #endif
 
 #ifndef NULL
@@ -214,6 +215,7 @@
 
 #define X265_MALLOC(type, count)    (type*)x265_malloc(sizeof(type) * (count))
 #define X265_FREE(ptr)              x265_free(ptr)
+#define X265_FREE_ZERO(ptr)         x265_free(ptr); (ptr) = NULL
 #define CHECKED_MALLOC(var, type, count) \
     { \
         var = (type*)x265_malloc(sizeof(type) * (count)); \
@@ -317,6 +319,9 @@
 #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
 #define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8
 
+#define MAX_NUM_TR_COEFFS           MAX_TR_SIZE * MAX_TR_SIZE // Maximum number of transform coefficients, for a 32x32 transform
+#define MAX_NUM_TR_CATEGORIES       16                        // 32, 16, 8, 4 transform categories each for luma and chroma
+
 namespace X265_NS {
 
 enum { SAO_NUM_OFFSET = 4 };
@@ -366,25 +371,6 @@
         delete[] ctuParam[2];
     }
 };
-
-/* Stores inter analysis data for a single frame */
-struct analysis_inter_data
-{
-    int32_t*    ref;
-    uint8_t*    depth;
-    uint8_t*    modes;
-    uint32_t*   bestMergeCand;
-};
-
-/* Stores intra analysis data for a single frame. This struct needs better packing */
-struct analysis_intra_data
-{
-    uint8_t*  depth;
-    uint8_t*  modes;
-    char*     partSizes;
-    uint8_t*  chromaModes;
-};
-
 enum TextType
 {
     TEXT_LUMA     = 0,  // luma

x265_1.8.tar.gz/source/common/constants.cpp -> x265_1.9.tar.gz/source/common/constants.cpp Changed

x265_1.8.tar.gz/source/common/constants.h -> x265_1.9.tar.gz/source/common/constants.h Changed

x265_1.8.tar.gz/source/common/contexts.h -> x265_1.9.tar.gz/source/common/contexts.h Changed

x265_1.8.tar.gz/source/common/cudata.cpp -> x265_1.9.tar.gz/source/common/cudata.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -192,44 +193,82 @@
         break;
     }
 
-    /* Each CU's data is layed out sequentially within the charMemBlock */
-    uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
-
-    m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
-    m_log2CUSize         = charBuf; charBuf += m_numPartitions;
-    m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
-    m_tqBypass           = charBuf; charBuf += m_numPartitions;
-    m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
-    m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
-    m_cuDepth            = charBuf; charBuf += m_numPartitions;
-    m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
-    m_partSize           = charBuf; charBuf += m_numPartitions;
-    m_mergeFlag          = charBuf; charBuf += m_numPartitions;
-    m_interDir           = charBuf; charBuf += m_numPartitions;
-    m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
-    m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
-    m_tuDepth            = charBuf; charBuf += m_numPartitions;
-    m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
-    m_transformSkip[1]   = charBuf; charBuf += m_numPartitions;
-    m_transformSkip[2]   = charBuf; charBuf += m_numPartitions;
-    m_cbf[0]             = charBuf; charBuf += m_numPartitions;
-    m_cbf[1]             = charBuf; charBuf += m_numPartitions;
-    m_cbf[2]             = charBuf; charBuf += m_numPartitions;
-    m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
-
-    X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * (instance + 1), "CU data layout is broken\n");
-
-    m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
-    m_mv[1]  = m_mv[0] +  m_numPartitions;
-    m_mvd[0] = m_mv[1] +  m_numPartitions;
-    m_mvd[1] = m_mvd[0] + m_numPartitions;
-
-    uint32_t cuSize = g_maxCUSize >> depth;
-    uint32_t sizeL = cuSize * cuSize;
-    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
-    m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
-    m_trCoeff[1] = m_trCoeff[0] + sizeL;
-    m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
+    if (csp == X265_CSP_I400)
+    {
+        /* Each CU's data is layed out sequentially within the charMemBlock */
+        uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * (BytesPerPartition - 4)) * instance;
+
+        m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_log2CUSize         = charBuf; charBuf += m_numPartitions;
+        m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
+        m_tqBypass           = charBuf; charBuf += m_numPartitions;
+        m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_cuDepth            = charBuf; charBuf += m_numPartitions;
+        m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+        m_partSize           = charBuf; charBuf += m_numPartitions;
+        m_mergeFlag          = charBuf; charBuf += m_numPartitions;
+        m_interDir           = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
+        m_tuDepth            = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
+        m_cbf[0]             = charBuf; charBuf += m_numPartitions;
+        m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
+
+        X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * (BytesPerPartition - 4)) * (instance + 1), "CU data layout is broken\n"); //BytesPerPartition
+
+        m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
+        m_mv[1]  = m_mv[0] +  m_numPartitions;
+        m_mvd[0] = m_mv[1] +  m_numPartitions;
+        m_mvd[1] = m_mvd[0] + m_numPartitions;
+
+        uint32_t cuSize = g_maxCUSize >> depth;
+        m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (cuSize * cuSize);
+        m_trCoeff[1] = m_trCoeff[2] = 0;
+        m_transformSkip[1] = m_transformSkip[2] = m_cbf[1] = m_cbf[2] = 0;
+    }
+    else
+    {
+        /* Each CU's data is layed out sequentially within the charMemBlock */
+        uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
+
+        m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_log2CUSize         = charBuf; charBuf += m_numPartitions;
+        m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
+        m_tqBypass           = charBuf; charBuf += m_numPartitions;
+        m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_cuDepth            = charBuf; charBuf += m_numPartitions;
+        m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+        m_partSize           = charBuf; charBuf += m_numPartitions;
+        m_mergeFlag          = charBuf; charBuf += m_numPartitions;
+        m_interDir           = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
+        m_tuDepth            = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[1]   = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[2]   = charBuf; charBuf += m_numPartitions;
+        m_cbf[0]             = charBuf; charBuf += m_numPartitions;
+        m_cbf[1]             = charBuf; charBuf += m_numPartitions;
+        m_cbf[2]             = charBuf; charBuf += m_numPartitions;
+        m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
+
+        X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * (instance + 1), "CU data layout is broken\n");
+
+        m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
+        m_mv[1]  = m_mv[0] +  m_numPartitions;
+        m_mvd[0] = m_mv[1] +  m_numPartitions;
+        m_mvd[1] = m_mvd[0] + m_numPartitions;
+
+        uint32_t cuSize = g_maxCUSize >> depth;
+        uint32_t sizeL = cuSize * cuSize;
+        uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift); // block chroma part
+        m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
+        m_trCoeff[1] = m_trCoeff[0] + sizeL;
+        m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
+    }
 }
 
 void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp)
@@ -245,7 +284,8 @@
     /* sequential memsets */
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
     m_partSet(m_log2CUSize,   (uint8_t)g_maxLog2CUSize);
-    m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
+    m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX);
+    m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
     m_partSet(m_tqBypass,     (uint8_t)frame.m_encData->m_param->bLossless);
     if (m_slice->m_sliceType != I_SLICE)
     {
@@ -256,7 +296,7 @@
     X265_CHECK(!(frame.m_encData->m_param->bLossless && !m_slice->m_pps->bTransquantBypassEnabled), "lossless enabled without TQbypass in PPS\n");
 
     /* initialize the remaining CU data in one memset */
-    memset(m_cuDepth, 0, (BytesPerPartition - 6) * m_numPartitions);
+    memset(m_cuDepth, 0, (frame.m_param->internalCsp == X265_CSP_I400 ? BytesPerPartition - 11 : BytesPerPartition - 7) * m_numPartitions);
 
     uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
     m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : NULL;
@@ -283,14 +323,15 @@
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
 
     m_partSet(m_log2CUSize,   (uint8_t)cuGeom.log2CUSize);
-    m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
+    m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX);
+    m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
     m_partSet(m_tqBypass,     (uint8_t)m_encData->m_param->bLossless);
     m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID);
     m_partSet((uint8_t*)m_refIdx[1], (uint8_t)REF_NOT_VALID);
     m_partSet(m_cuDepth,      (uint8_t)cuGeom.depth);
 
     /* initialize the remaining CU data in one memset */
-    memset(m_predMode, 0, (BytesPerPartition - 7) * m_numPartitions);
+    memset(m_predMode, 0, (ctu.m_chromaFormat == X265_CSP_I400 ? BytesPerPartition - 12 : BytesPerPartition - 8) * m_numPartitions);
 }
 
 /* Copy the results of a sub-part (split) CU to the parent CU */
@@ -314,13 +355,9 @@
     m_subPartCopy(m_mvpIdx[0] + offset, subCU.m_mvpIdx[0]);
     m_subPartCopy(m_mvpIdx[1] + offset, subCU.m_mvpIdx[1]);
     m_subPartCopy(m_tuDepth + offset, subCU.m_tuDepth);
+
     m_subPartCopy(m_transformSkip[0] + offset, subCU.m_transformSkip[0]);
-    m_subPartCopy(m_transformSkip[1] + offset, subCU.m_transformSkip[1]);
-    m_subPartCopy(m_transformSkip[2] + offset, subCU.m_transformSkip[2]);
     m_subPartCopy(m_cbf[0] + offset, subCU.m_cbf[0]);
-    m_subPartCopy(m_cbf[1] + offset, subCU.m_cbf[1]);
-    m_subPartCopy(m_cbf[2] + offset, subCU.m_cbf[2]);
-    m_subPartCopy(m_chromaIntraDir + offset, subCU.m_chromaIntraDir);
 
     memcpy(m_mv[0] + offset, subCU.m_mv[0], childGeom.numPartitions * sizeof(MV));
     memcpy(m_mv[1] + offset, subCU.m_mv[1], childGeom.numPartitions * sizeof(MV));
@@ -329,12 +366,21 @@
 
     uint32_t tmp = 1 << ((g_maxLog2CUSize - childGeom.depth) * 2);
     uint32_t tmp2 = subPartIdx * tmp;
-    memcpy(m_trCoeff[0] + tmp2, subCU.m_trCoeff[0], sizeof(coeff_t) * tmp);
+    memcpy(m_trCoeff[0] + tmp2, subCU.m_trCoeff[0], sizeof(coeff_t)* tmp);
 
-    uint32_t tmpC = tmp >> (m_hChromaShift + m_vChromaShift);
-    uint32_t tmpC2 = tmp2 >> (m_hChromaShift + m_vChromaShift);
-    memcpy(m_trCoeff[1] + tmpC2, subCU.m_trCoeff[1], sizeof(coeff_t) * tmpC);
-    memcpy(m_trCoeff[2] + tmpC2, subCU.m_trCoeff[2], sizeof(coeff_t) * tmpC);
+    if (subCU.m_chromaFormat != X265_CSP_I400)
+    {
+        m_subPartCopy(m_transformSkip[1] + offset, subCU.m_transformSkip[1]);
+        m_subPartCopy(m_transformSkip[2] + offset, subCU.m_transformSkip[2]);
+        m_subPartCopy(m_cbf[1] + offset, subCU.m_cbf[1]);
+        m_subPartCopy(m_cbf[2] + offset, subCU.m_cbf[2]);
+        m_subPartCopy(m_chromaIntraDir + offset, subCU.m_chromaIntraDir);
+

x265_1.8.tar.gz/source/common/cudata.h -> x265_1.9.tar.gz/source/common/cudata.h Changed

@@ -222,12 +222,12 @@
     void     copyToPic(uint32_t depth) const;
 
     /* RD-0 methods called only from encodeResidue */
-    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom);
+    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp);
     void     updatePic(uint32_t depth) const;
 
     void     setPartSizeSubParts(PartSize size)    { m_partSet(m_partSize, (uint8_t)size); }
     void     setPredModeSubParts(PredMode mode)    { m_partSet(m_predMode, (uint8_t)mode); }
-    void     clearCbf()                            { m_partSet(m_cbf[0], 0); m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0); }
+    void     clearCbf()                            { m_partSet(m_cbf[0], 0); if (m_chromaFormat != X265_CSP_I400) { m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0);} }
 
     /* these functions all take depth as an absolute depth from CTU, it is used to calculate the number of parts to copy */
     void     setQPSubParts(int8_t qp, uint32_t absPartIdx, uint32_t depth)                    { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); }
@@ -246,7 +246,7 @@
     void     setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx);
 
     uint8_t  getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
-    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; }
+    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { if (m_chromaFormat == X265_CSP_I400) return m_cbf[0][absPartIdx] || false; else { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx];} }
     int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
     uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)[2], uint8_t* candDir) const;
     void     clipMv(MV& outMV) const;
@@ -323,7 +323,6 @@
     const uint16_t *scan;
     const uint16_t *scanCG;
     ScanType        scanType;
-    uint32_t        log2TrSizeCG;
     uint32_t        firstSignificanceMapContext;
 };
 
@@ -340,8 +339,15 @@
         uint32_t numPartition = NUM_4x4_PARTITIONS >> (depth * 2);
         uint32_t cuSize = g_maxCUSize >> depth;
         uint32_t sizeL = cuSize * cuSize;
-        uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp));
-        CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
+        if (csp == X265_CSP_I400)
+        {
+            CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL) * numInstances);
+        }
+        else
+        {            
+            uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp));
+            CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
+        }
         CHECKED_MALLOC(charMemBlock, uint8_t, numPartition * numInstances * CUData::BytesPerPartition);
         CHECKED_MALLOC(mvMemBlock, MV, numPartition * 4 * numInstances);
         return true;

x265_1.8.tar.gz/source/common/dct.cpp -> x265_1.9.tar.gz/source/common/dct.cpp Changed

@@ -703,7 +703,10 @@
         if (level)
             ++numSig;
         level *= sign;
-        qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
+
+        // TODO: when we limit range to [-32767, 32767], we can get more performance with output change
+        //       But nquant is a little percent in rdoQuant, so I keep old dynamic range for compatible
+        qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
     }
 
     return numSig;
@@ -784,11 +787,12 @@
     return scanPosLast - 1;
 }
 
+// NOTE: no defined value on lastNZPosInCG & absSumSign when ALL ZEROS block as input
 static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
 {
     int n;
 
-    for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
+    for (n = SCAN_SET_SIZE - 1; n >= 0; n--)
     {
         const uint32_t idx = scanTbl[n];
         const uint32_t idxY = idx / MLS_CG_SIZE;
@@ -812,8 +816,17 @@
 
     uint32_t firstNZPosInCG = (uint32_t)n;
 
+    uint32_t absSumSign = 0;
+    for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++)
+    {
+        const uint32_t idx = scanTbl[n];
+        const uint32_t idxY = idx / MLS_CG_SIZE;
+        const uint32_t idxX = idx % MLS_CG_SIZE;
+        absSumSign += dstCoeff[idxY * trSize + idxX];
+    }
+
     // NOTE: when coeff block all ZERO, the lastNZPosInCG is undefined and firstNZPosInCG is 16
-    return ((lastNZPosInCG << 16) | firstNZPosInCG);
+    return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG);
 }

x265_1.8.tar.gz/source/common/deblock.cpp -> x265_1.9.tar.gz/source/common/deblock.cpp Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Author: Gopu Govindaswamy <gopu@multicorewareinc.com>
+*         Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -108,7 +109,7 @@
     for (uint32_t e = 0; e < numUnits; e += partIdxIncr)
     {
         edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockStrength);
-        if (!((e0 + e) & chromaMask))
+        if (!((e0 + e) & chromaMask) && cu->m_chromaFormat != X265_CSP_I400)
             edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockStrength);
     }
 }
@@ -209,8 +210,8 @@
     const Slice* const sliceQ = cuQ->m_slice;
     const Slice* const sliceP = cuP->m_slice;
 
-    const Frame* refP0 = sliceP->getRefPic(0, cuP->m_refIdx[0][partP]);
-    const Frame* refQ0 = sliceQ->getRefPic(0, cuQ->m_refIdx[0][partQ]);
+    const Frame* refP0 = sliceP->m_refFrameList[0][cuP->m_refIdx[0][partP]];
+    const Frame* refQ0 = sliceQ->m_refFrameList[0][cuQ->m_refIdx[0][partQ]];
     const MV& mvP0 = refP0 ? cuP->m_mv[0][partP] : zeroMv;
     const MV& mvQ0 = refQ0 ? cuQ->m_mv[0][partQ] : zeroMv;
 
@@ -221,8 +222,8 @@
     }
 
     // (sliceQ->isInterB() || sliceP->isInterB())
-    const Frame* refP1 = sliceP->getRefPic(1, cuP->m_refIdx[1][partP]);
-    const Frame* refQ1 = sliceQ->getRefPic(1, cuQ->m_refIdx[1][partQ]);
+    const Frame* refP1 = sliceP->m_refFrameList[1][cuP->m_refIdx[1][partP]];
+    const Frame* refQ1 = sliceQ->m_refFrameList[1][cuQ->m_refIdx[1][partQ]];
     const MV& mvP1 = refP1 ? cuP->m_mv[1][partP] : zeroMv;
     const MV& mvQ1 = refQ1 ? cuQ->m_mv[1][partQ] : zeroMv;
 
@@ -279,31 +280,6 @@
  * \param maskQ   indicator to enable filtering on partQ
  * \param maskP1  decision weak filter/no filter for partP
  * \param maskQ1  decision weak filter/no filter for partQ */
-static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
-    int32_t tc2 = 2 * tc;
-    int32_t tcP = (tc2 & maskP);
-    int32_t tcQ = (tc2 & maskQ);
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
-    {
-        int16_t m4  = (int16_t)src[0];
-        int16_t m3  = (int16_t)src[-offset];
-        int16_t m5  = (int16_t)src[offset];
-        int16_t m2  = (int16_t)src[-offset * 2];
-        int16_t m6  = (int16_t)src[offset * 2];
-        int16_t m1  = (int16_t)src[-offset * 3];
-        int16_t m7  = (int16_t)src[offset * 3];
-        int16_t m0  = (int16_t)src[-offset * 4];
-        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
-        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
-        src[-offset]     = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
-        src[0]           = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
-        src[offset]      = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
-        src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
-    }
-}
-
-/* Weak filter */
 static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ,
                                  int32_t maskP1, int32_t maskQ1)
 {
@@ -445,7 +421,12 @@
                    useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3));
 
         if (sw)
-            pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ);
+        {
+            int32_t tc2 = 2 * tc;
+            int32_t tcP = (tc2 & maskP);
+            int32_t tcQ = (tc2 & maskQ);
+            primitives.pelFilterLumaStrong[dir](src + unitOffset, srcStep, offset, tcP, tcQ);
+        }
         else
         {
             int32_t sideThreshold = (beta + (beta >> 1)) >> 3;

x265_1.8.tar.gz/source/common/deblock.h -> x265_1.9.tar.gz/source/common/deblock.h Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Author: Gopu Govindaswamy <gopu@multicorewareinc.com>
+*         Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -37,24 +38,24 @@
 public:
     enum { EDGE_VER, EDGE_HOR };
 
-    void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir);
+    static void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir);
 
 protected:
 
     // CU-level deblocking function
-    void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]);
+    static void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]);
 
     // set filtering functions
-    void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]);
-    void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
-    void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
+    static void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]);
+    static void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
+    static void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
 
     // get filtering functions
-    uint8_t getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]);
+    static uint8_t getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]);
 
     // filter luma/chroma functions
-    void edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
-    void edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
+    static void edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
+    static void edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
 
     static const uint8_t s_tcTable[54];
     static const uint8_t s_betaTable[52];

x265_1.8.tar.gz/source/common/frame.cpp -> x265_1.9.tar.gz/source/common/frame.cpp Changed

@@ -33,22 +33,37 @@
     m_bChromaExtended = false;
     m_lowresInit = false;
     m_reconRowCount.set(0);
+    m_reconColCount = NULL;
     m_countRefEncoders = 0;
     m_encData = NULL;
     m_reconPic = NULL;
+    m_quantOffsets = NULL;
     m_next = NULL;
     m_prev = NULL;
     m_param = NULL;
     memset(&m_lowres, 0, sizeof(m_lowres));
 }
 
-bool Frame::create(x265_param *param)
+bool Frame::create(x265_param *param, float* quantOffsets)
 {
     m_fencPic = new PicYuv;
     m_param = param;
 
-    return m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
-           m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode);
+    if (m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
+        m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode))
+    {
+        X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
+        m_numRows = (m_fencPic->m_picHeight + g_maxCUSize - 1)  / g_maxCUSize;
+        m_reconColCount = new ThreadSafeInteger[m_numRows];
+
+        if (quantOffsets)
+        {
+            int32_t cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
+            m_quantOffsets = new float[cuCount];
+        }
+        return true;
+    }
+    return false;
 }
 
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
@@ -56,15 +71,27 @@
     m_encData = new FrameData;
     m_reconPic = new PicYuv;
     m_encData->m_reconPic = m_reconPic;
-    bool ok = m_encData->create(param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
+    bool ok = m_encData->create(*param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
     if (ok)
     {
         /* initialize right border of m_reconpicYuv as SAO may read beyond the
          * end of the picture accessing uninitialized pixels */
         int maxHeight = sps.numCuInHeight * g_maxCUSize;
-        memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel) * m_reconPic->m_stride * maxHeight);
-        memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
-        memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
+        memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel)* m_reconPic->m_stride * maxHeight);
+
+        /* use pre-calculated cu/pu offsets cached in the SPS structure */
+        m_reconPic->m_cuOffsetY = sps.cuOffsetY;
+        m_reconPic->m_buOffsetY = sps.buOffsetY;
+
+        if (param->internalCsp != X265_CSP_I400)
+        {
+            memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
+            memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
+
+            /* use pre-calculated cu/pu offsets cached in the SPS structure */
+            m_reconPic->m_cuOffsetC = sps.cuOffsetC;
+            m_reconPic->m_buOffsetC = sps.buOffsetC;
+        }
     }
     return ok;
 }
@@ -100,5 +127,16 @@
         m_reconPic = NULL;
     }
 
+    if (m_reconColCount)
+    {
+        delete[] m_reconColCount;
+        m_reconColCount = NULL;
+    }
+
+    if (m_quantOffsets)
+    {
+        delete[] m_quantOffsets;
+    }
+
     m_lowres.destroy();
 }

x265_1.8.tar.gz/source/common/frame.h -> x265_1.9.tar.gz/source/common/frame.h Changed

@@ -35,7 +35,7 @@
 class PicYuv;
 struct SPS;
 
-#define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B) 
+#define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B)
 
 class Frame
 {
@@ -59,8 +59,12 @@
     bool                   m_lowresInit;         // lowres init complete (pre-analysis)
     bool                   m_bChromaExtended;    // orig chroma planes motion extended for weight analysis
 
+    float*                 m_quantOffsets;       // points to quantOffsets in x265_picture
+
     /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
     ThreadSafeInteger      m_reconRowCount;      // count of CTU rows completely reconstructed and extended for motion reference
+    ThreadSafeInteger*     m_reconColCount;      // count of CTU cols completely reconstructed and extended for motion reference
+    int32_t                m_numRows;
     volatile uint32_t      m_countRefEncoders;   // count of FrameEncoder threads monitoring m_reconRowCount
 
     Frame*                 m_next;               // PicList doubly linked list pointers
@@ -69,7 +73,7 @@
     x265_analysis_data     m_analysisData;
     Frame();
 
-    bool create(x265_param *param);
+    bool create(x265_param *param, float* quantOffsets);
     bool allocEncodeData(x265_param *param, const SPS& sps);
     void reinit(const SPS& sps);
     void destroy();

x265_1.8.tar.gz/source/common/framedata.cpp -> x265_1.9.tar.gz/source/common/framedata.cpp Changed

x265_1.8.tar.gz/source/common/framedata.h -> x265_1.9.tar.gz/source/common/framedata.h Changed

@@ -55,8 +55,7 @@
     double      avgLumaDistortion;
     double      avgChromaDistortion;
     double      avgPsyEnergy;
-    double      avgLumaLevel;
-    double      lumaLevel;
+    double      avgResEnergy;
     double      percentIntraNxN;
     double      percentSkipCu[NUM_CU_DEPTH];
     double      percentMergeCu[NUM_CU_DEPTH];
@@ -69,13 +68,13 @@
     uint64_t    lumaDistortion;
     uint64_t    chromaDistortion;
     uint64_t    psyEnergy;
+    uint64_t    resEnergy;
     uint64_t    cntSkipCu[NUM_CU_DEPTH];
     uint64_t    cntMergeCu[NUM_CU_DEPTH];
     uint64_t    cntInter[NUM_CU_DEPTH];
     uint64_t    cntIntra[NUM_CU_DEPTH];
     uint64_t    cuInterDistribution[NUM_CU_DEPTH][INTER_MODES];
     uint64_t    cuIntraDistribution[NUM_CU_DEPTH][INTRA_MODES];
-    uint16_t    maxLumaLevel;
 
     FrameStats()
     {
@@ -96,7 +95,7 @@
 
     Slice*         m_slice;
     SAOParam*      m_saoParam;
-    x265_param*    m_param;
+    const x265_param* m_param;
 
     FrameData*     m_freeListNext;
     PicYuv*        m_reconPic;
@@ -135,19 +134,44 @@
     RCStatCU*      m_cuStat;
     RCStatRow*     m_rowStat;
     FrameStats     m_frameStats; // stats of current frame for multi-pass encodes
+    /* data needed for periodic intra refresh */
+    struct PeriodicIR
+    {
+        uint32_t   pirStartCol;
+        uint32_t   pirEndCol;
+        int        framesSinceLastPir;
+    };
 
+    PeriodicIR     m_pir;
     double         m_avgQpRc;    /* avg QP as decided by rate-control */
     double         m_avgQpAq;    /* avg QP as decided by AQ in addition to rate-control */
     double         m_rateFactor; /* calculated based on the Frame QP */
 
     FrameData();
 
-    bool create(x265_param *param, const SPS& sps);
+    bool create(const x265_param& param, const SPS& sps);
     void reinit(const SPS& sps);
     void destroy();
+    inline CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
+};
+
+/* Stores intra analysis data for a single frame. This struct needs better packing */
+struct analysis_intra_data
+{
+    uint8_t*  depth;
+    uint8_t*  modes;
+    char*     partSizes;
+    uint8_t*  chromaModes;
+};
 
-    CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
+/* Stores inter analysis data for a single frame */
+struct analysis_inter_data
+{
+    MV*         mv;
+    int32_t*    ref;
+    uint8_t*    depth;
+    uint8_t*    modes;
+    uint32_t*   bestMergeCand;
 };
 }
-
 #endif // ifndef X265_FRAMEDATA_H

x265_1.8.tar.gz/source/common/ipfilter.cpp -> x265_1.9.tar.gz/source/common/ipfilter.cpp Changed

x265_1.8.tar.gz/source/common/loopfilter.cpp -> x265_1.9.tar.gz/source/common/loopfilter.cpp Changed

@@ -3,6 +3,7 @@
 *
 * Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
 *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -136,6 +137,27 @@
         rec += stride;
     }
 }
+
+static void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ)
+{
+    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
+    {
+        int16_t m4  = (int16_t)src[0];
+        int16_t m3  = (int16_t)src[-offset];
+        int16_t m5  = (int16_t)src[offset];
+        int16_t m2  = (int16_t)src[-offset * 2];
+        int16_t m6  = (int16_t)src[offset * 2];
+        int16_t m1  = (int16_t)src[-offset * 3];
+        int16_t m7  = (int16_t)src[offset * 3];
+        int16_t m0  = (int16_t)src[-offset * 4];
+        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
+        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
+        src[-offset]     = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
+        src[0]           = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
+        src[offset]      = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
+        src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
+    }
+}
 }
 
 namespace X265_NS {
@@ -150,5 +172,9 @@
     p.saoCuOrgE3[1] = processSaoCUE3;
     p.saoCuOrgB0 = processSaoCUB0;
     p.sign = calSign;
+
+    // C code is same for EDGE_VER and EDGE_HOR only asm code is different
+    p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
+    p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
 }
 }

x265_1.8.tar.gz/source/common/lowres.cpp -> x265_1.9.tar.gz/source/common/lowres.cpp Changed

x265_1.8.tar.gz/source/common/lowres.h -> x265_1.9.tar.gz/source/common/lowres.h Changed

x265_1.8.tar.gz/source/common/param.cpp -> x265_1.9.tar.gz/source/common/param.cpp Changed

@@ -147,7 +147,7 @@
     param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
     param->bBPyramid = 1;
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
-    param->lookaheadSlices = 0;
+    param->lookaheadSlices = 8;
 
     /* Intra Coding Tools */
     param->bEnableConstrainedIntra = 0;
@@ -159,7 +159,8 @@
     param->subpelRefine = 2;
     param->searchRange = 57;
     param->maxNumMergeCand = 2;
-    param->limitReferences = 0;
+    param->limitReferences = 3;
+    param->limitModes = 0;
     param->bEnableWeightedPred = 1;
     param->bEnableWeightedBiPred = 0;
     param->bEnableEarlySkip = 0;
@@ -184,7 +185,7 @@
     param->cbQpOffset = 0;
     param->crQpOffset = 0;
     param->rdPenalty = 0;
-    param->psyRd = 0.3;
+    param->psyRd = 2.0;
     param->psyRdoq = 0.0;
     param->analysisMode = 0;
     param->analysisFileName = NULL;
@@ -241,6 +242,10 @@
     param->vui.defDispWinRightOffset = 0;
     param->vui.defDispWinTopOffset = 0;
     param->vui.defDispWinBottomOffset = 0;
+    param->maxCLL = 0;
+    param->maxFALL = 0;
+    param->minLuma = 0;
+    param->maxLuma = (1 << X265_DEPTH) - 1;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -274,9 +279,9 @@
             param->bEnableWeightedPred = 0;
             param->rdLevel = 2;
             param->maxNumReferences = 1;
+            param->limitReferences = 0;
             param->rc.aqStrength = 0.0;
             param->rc.aqMode = X265_AQ_NONE;
-            param->rc.cuTree = 0;
             param->rc.qgSize = 32;
             param->bEnableFastIntra = 1;
         }
@@ -291,9 +296,9 @@
             param->bEnableWeightedPred = 0;
             param->rdLevel = 2;
             param->maxNumReferences = 1;
+            param->limitReferences = 0;
             param->rc.aqStrength = 0.0;
             param->rc.aqMode = X265_AQ_NONE;
-            param->rc.cuTree = 0;
             param->rc.qgSize = 32;
             param->bEnableSAO = 0;
             param->bEnableFastIntra = 1;
@@ -301,13 +306,11 @@
         else if (!strcmp(preset, "veryfast"))
         {
             param->lookaheadDepth = 15;
-            param->maxCUSize = 32;
             param->bFrameAdaptive = 0;
             param->subpelRefine = 1;
             param->bEnableEarlySkip = 1;
             param->rdLevel = 2;
-            param->maxNumReferences = 1;
-            param->rc.cuTree = 0;
+            param->maxNumReferences = 2;
             param->rc.qgSize = 32;
             param->bEnableFastIntra = 1;
         }
@@ -317,8 +320,7 @@
             param->bFrameAdaptive = 0;
             param->bEnableEarlySkip = 1;
             param->rdLevel = 2;
-            param->maxNumReferences = 1;
-            param->rc.cuTree = 0;
+            param->maxNumReferences = 2;
             param->bEnableFastIntra = 1;
         }
         else if (!strcmp(preset, "fast"))
@@ -326,7 +328,7 @@
             param->lookaheadDepth = 15;
             param->bFrameAdaptive = 0;
             param->rdLevel = 2;
-            param->maxNumReferences = 2;
+            param->maxNumReferences = 3;
             param->bEnableFastIntra = 1;
         }
         else if (!strcmp(preset, "medium"))
@@ -343,6 +345,9 @@
             param->subpelRefine = 3;
             param->maxNumMergeCand = 3;
             param->searchMethod = X265_STAR_SEARCH;
+            param->maxNumReferences = 4;
+            param->limitModes = 1;
+            param->lookaheadSlices = 4; // limit parallelism as already enough work exists
         }
         else if (!strcmp(preset, "slower"))
         {
@@ -359,7 +364,11 @@
             param->subpelRefine = 3;
             param->maxNumMergeCand = 3;
             param->searchMethod = X265_STAR_SEARCH;
+            param->maxNumReferences = 4;
+            param->limitReferences = 2;
+            param->limitModes = 1;
             param->bIntraInBFrames = 1;
+            param->lookaheadSlices = 4; // limit parallelism as already enough work exists
         }
         else if (!strcmp(preset, "veryslow"))
         {
@@ -377,7 +386,10 @@
             param->maxNumMergeCand = 4;
             param->searchMethod = X265_STAR_SEARCH;
             param->maxNumReferences = 5;
+            param->limitReferences = 1;
+            param->limitModes = 1;
             param->bIntraInBFrames = 1;
+            param->lookaheadSlices = 0; // disabled for best quality
         }
         else if (!strcmp(preset, "placebo"))
         {
@@ -397,8 +409,10 @@
             param->searchMethod = X265_STAR_SEARCH;
             param->bEnableTransformSkip = 1;
             param->maxNumReferences = 5;
+            param->limitReferences = 0;
             param->rc.bEnableSlowFirstPass = 1;
             param->bIntraInBFrames = 1;
+            param->lookaheadSlices = 0; // disabled for best quality
             // TODO: optimized esa
         }
         else
@@ -565,10 +579,14 @@
     OPT2("level-idc", "level")
     {
         /* allow "5.1" or "51", both converted to integer 51 */
-        if (atof(value) < 7)
+        /* if level-idc specifies an obviously wrong value in either float or int, 
+        throw error consistently. Stronger level checking will be done in encoder_open() */
+        if (atof(value) < 10)
             p->levelIdc = (int)(10 * atof(value) + .5);
-        else
+        else if (atoi(value) < 100)
             p->levelIdc = atoi(value);
+        else 
+            bError = true;
     }
     OPT("high-tier") p->bHighTier = atobool(value);
     OPT("allow-non-conformance") p->bAllowNonConformance = atobool(value);
@@ -608,6 +626,7 @@
     OPT2("constrained-intra", "cip") p->bEnableConstrainedIntra = atobool(value);
     OPT("fast-intra") p->bEnableFastIntra = atobool(value);
     OPT("open-gop") p->bOpenGOP = atobool(value);
+    OPT("intra-refresh") p->bIntraRefresh = atobool(value);
     OPT("lookahead-slices") p->lookaheadSlices = atoi(value);
     OPT("scenecut")
     {
@@ -644,6 +663,7 @@
     }
     OPT("ref") p->maxNumReferences = atoi(value);
     OPT("limit-refs") p->limitReferences = atoi(value);
+    OPT("limit-modes") p->limitModes = atobool(value);
     OPT("weightp") p->bEnableWeightedPred = atobool(value);
     OPT("weightb") p->bEnableWeightedBiPred = atobool(value);
     OPT("cbqpoffs") p->cbQpOffset = atoi(value);
@@ -854,7 +874,9 @@
     OPT("analysis-file") p->analysisFileName = strdup(value);
     OPT("qg-size") p->rc.qgSize = atoi(value);
     OPT("master-display") p->masteringDisplayColorVolume = strdup(value);
-    OPT("max-cll") p->contentLightLevelInfo = strdup(value);
+    OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
+    OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
+    OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
     else
         return X265_PARAM_BAD_NAME;
 #undef OPT
@@ -1035,6 +1057,8 @@
           "subme must be greater than or equal to 0");
     CHECK(param->limitReferences > 3,
           "limitReferences must be 0, 1, 2 or 3");
+    CHECK(param->limitModes > 1,
+          "limitRectAmp must be 0, 1");
     CHECK(param->frameNumThreads < 0 || param->frameNumThreads > X265_MAX_FRAME_THREADS,
           "frameNumThreads (--frame-threads) must be [0 .. X265_MAX_FRAME_THREADS)");
     CHECK(param->cbQpOffset < -12, "Min. Chroma Cb QP Offset is -12");
@@ -1063,8 +1087,8 @@
 
     CHECK(param->sourceWidth < (int)param->maxCUSize || param->sourceHeight < (int)param->maxCUSize,
           "Picture size must be at least one CTU");
-    CHECK(param->internalCsp < X265_CSP_I420 || X265_CSP_I444 < param->internalCsp,
-          "Color space must be i420, i422, or i444");
+    CHECK(param->internalCsp < X265_CSP_I400 || X265_CSP_I444 < param->internalCsp,
+          "chroma subsampling must be i400 (4:0:0 monochrome), i420 (4:2:0 default), i422 (4:2:0), i444 (4:4:4)");

x265_1.8.tar.gz/source/common/picyuv.cpp -> x265_1.9.tar.gz/source/common/picyuv.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -42,6 +43,9 @@
     m_cuOffsetC = NULL;
     m_buOffsetY = NULL;
     m_buOffsetC = NULL;
+
+    m_maxLumaLevel = 0;
+    m_avgLumaLevel = 0;
 }
 
 bool PicYuv::create(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
@@ -59,20 +63,27 @@
     m_lumaMarginY = g_maxCUSize + 16; // margin for 8-tap filter and infinite padding
     m_stride = (numCuInWidth * g_maxCUSize) + (m_lumaMarginX << 1);
 
-    m_chromaMarginX = m_lumaMarginX;  // keep 16-byte alignment for chroma CTUs
-    m_chromaMarginY = m_lumaMarginY >> m_vChromaShift;
-
-    m_strideC = ((numCuInWidth * g_maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2);
     int maxHeight = numCuInHeight * g_maxCUSize;
-
     CHECKED_MALLOC(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
-    CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
-    CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
+
+    if (picCsp != X265_CSP_I400)
+    {
+        m_chromaMarginX = m_lumaMarginX;  // keep 16-byte alignment for chroma CTUs
+        m_chromaMarginY = m_lumaMarginY >> m_vChromaShift;
+        m_strideC = ((numCuInWidth * g_maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2);
 
-    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY   * m_stride  + m_lumaMarginX;
-    m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
-    m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+        CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
+        CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
 
+        m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+        m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+    }
+    else
+    {
+        m_picBuf[1] = m_picBuf[2] = NULL;
+        m_picOrg[1] = m_picOrg[2] = NULL;
+    }
     return true;
 
 fail:
@@ -85,27 +96,45 @@
 bool PicYuv::createOffsets(const SPS& sps)
 {
     uint32_t numPartitions = 1 << (g_unitSizeDepth * 2);
-    CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
-    CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
-    for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
+
+    if (m_picCsp != X265_CSP_I400)
     {
-        for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
+        CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
+        CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
+        for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
         {
-            m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
-            m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (g_maxCUSize >> m_vChromaShift) + cuCol * (g_maxCUSize >> m_hChromaShift);
+            for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
+            {
+                m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
+                m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (g_maxCUSize >> m_vChromaShift) + cuCol * (g_maxCUSize >> m_hChromaShift);
+            }
         }
-    }
 
-    CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
-    CHECKED_MALLOC(m_buOffsetC, intptr_t, (size_t)numPartitions);
-    for (uint32_t idx = 0; idx < numPartitions; ++idx)
-    {
-        intptr_t x = g_zscanToPelX[idx];
-        intptr_t y = g_zscanToPelY[idx];
-        m_buOffsetY[idx] = m_stride * y + x;
-        m_buOffsetC[idx] = m_strideC * (y >> m_vChromaShift) + (x >> m_hChromaShift);
+        CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
+        CHECKED_MALLOC(m_buOffsetC, intptr_t, (size_t)numPartitions);
+        for (uint32_t idx = 0; idx < numPartitions; ++idx)
+        {
+            intptr_t x = g_zscanToPelX[idx];
+            intptr_t y = g_zscanToPelY[idx];
+            m_buOffsetY[idx] = m_stride * y + x;
+            m_buOffsetC[idx] = m_strideC * (y >> m_vChromaShift) + (x >> m_hChromaShift);
+        }
     }
+    else
+    {
+        CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
+        for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
+        for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
+            m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
 
+        CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
+        for (uint32_t idx = 0; idx < numPartitions; ++idx)
+        {
+            intptr_t x = g_zscanToPelX[idx];
+            intptr_t y = g_zscanToPelY[idx];
+            m_buOffsetY[idx] = m_stride * y + x;
+        }
+    }
     return true;
 
 fail:
@@ -121,7 +150,7 @@
 
 /* Copy pixels from an x265_picture into internal PicYuv instance.
  * Shift pixels as necessary, mask off bits above X265_DEPTH for safety. */
-void PicYuv::copyFromPicture(const x265_picture& pic, int padx, int pady)
+void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady)
 {
     /* m_picWidth is the width that is being encoded, padx indicates how many
      * of those pixels are padding to reach multiple of MinCU(4) size.
@@ -155,28 +184,29 @@
 #if (X265_DEPTH > 8)
         {
             pixel *yPixel = m_picOrg[0];
-            pixel *uPixel = m_picOrg[1];
-            pixel *vPixel = m_picOrg[2];
 
             uint8_t *yChar = (uint8_t*)pic.planes[0];
-            uint8_t *uChar = (uint8_t*)pic.planes[1];
-            uint8_t *vChar = (uint8_t*)pic.planes[2];
             int shift = (X265_DEPTH - 8);
 
             primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
-            primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
-            primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+
+            if (pic.colorSpace != X265_CSP_I400)
+            {
+                pixel *uPixel = m_picOrg[1];
+                pixel *vPixel = m_picOrg[2];
+
+                uint8_t *uChar = (uint8_t*)pic.planes[1];
+                uint8_t *vChar = (uint8_t*)pic.planes[2];
+
+                primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+                primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+            }
         }
 #else /* Case for (X265_DEPTH == 8) */
         // TODO: Does we need this path? may merge into above in future
         {
             pixel *yPixel = m_picOrg[0];
-            pixel *uPixel = m_picOrg[1];
-            pixel *vPixel = m_picOrg[2];
-
             uint8_t *yChar = (uint8_t*)pic.planes[0];
-            uint8_t *uChar = (uint8_t*)pic.planes[1];
-            uint8_t *vChar = (uint8_t*)pic.planes[2];
 
             for (int r = 0; r < height; r++)
             {
@@ -186,15 +216,24 @@
                 yChar += pic.stride[0] / sizeof(*yChar);
             }
 
-            for (int r = 0; r < height >> m_vChromaShift; r++)
+            if (pic.colorSpace != X265_CSP_I400)
             {
-                memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
-                memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
+                pixel *uPixel = m_picOrg[1];
+                pixel *vPixel = m_picOrg[2];
+
+                uint8_t *uChar = (uint8_t*)pic.planes[1];
+                uint8_t *vChar = (uint8_t*)pic.planes[2];
+
+                for (int r = 0; r < height >> m_vChromaShift; r++)
+                {
+                    memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
+                    memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
 
-                uPixel += m_strideC;
-                vPixel += m_strideC;
-                uChar += pic.stride[1] / sizeof(*uChar);
-                vChar += pic.stride[2] / sizeof(*vChar);
+                    uPixel += m_strideC;
+                    vPixel += m_strideC;
+                    uChar += pic.stride[1] / sizeof(*uChar);
+                    vChar += pic.stride[2] / sizeof(*vChar);
+                }
             }
         }
 #endif /* (X265_DEPTH > 8) */
@@ -205,43 +244,63 @@

x265_1.8.tar.gz/source/common/picyuv.h -> x265_1.9.tar.gz/source/common/picyuv.h Changed

x265_1.8.tar.gz/source/common/pixel.cpp -> x265_1.9.tar.gz/source/common/pixel.cpp Changed

@@ -25,6 +25,7 @@
  *****************************************************************************/
 
 #include "common.h"
+#include "slicetype.h"      // LOWRES_COST_MASK
 #include "primitives.h"
 #include "x265.h"
 
@@ -117,9 +118,9 @@
 }
 
 template<int lx, int ly, class T1, class T2>
-sse_ret_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
+sse_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
 {
-    sse_ret_t sum = 0;
+    sse_t sum = 0;
     int tmp;
 
     for (int y = 0; y < ly; y++)
@@ -187,37 +188,6 @@
     return (int)(sum >> 1);
 }
 
-static int satd_4x4(const int16_t* pix1, intptr_t stride_pix1)
-{
-    int32_t tmp[4][4];
-    int32_t s01, s23, d01, d23;
-    int32_t satd = 0;
-    int d;
-
-    for (d = 0; d < 4; d++, pix1 += stride_pix1)
-    {
-        s01 = pix1[0] + pix1[1];
-        s23 = pix1[2] + pix1[3];
-        d01 = pix1[0] - pix1[1];
-        d23 = pix1[2] - pix1[3];
-
-        tmp[d][0] = s01 + s23;
-        tmp[d][1] = s01 - s23;
-        tmp[d][2] = d01 - d23;
-        tmp[d][3] = d01 + d23;
-    }
-
-    for (d = 0; d < 4; d++)
-    {
-        s01 = tmp[0][d] + tmp[1][d];
-        s23 = tmp[2][d] + tmp[3][d];
-        d01 = tmp[0][d] - tmp[1][d];
-        d23 = tmp[2][d] - tmp[3][d];
-        satd += abs(s01 + s23) + abs(s01 - s23) + abs(d01 - d23) + abs(d01 + d23);
-    }
-    return (int)(satd / 2);
-}
-
 // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
 static int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
 {
@@ -313,57 +283,6 @@
     return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
 }
 
-inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
-{
-    int32_t tmp[8][8];
-    int32_t a0, a1, a2, a3, a4, a5, a6, a7;
-    int32_t sum = 0;
-
-    for (int i = 0; i < 8; i++, pix1 += i_pix1)
-    {
-        a0 = pix1[0] + pix1[1];
-        a1 = pix1[2] + pix1[3];
-        a2 = pix1[4] + pix1[5];
-        a3 = pix1[6] + pix1[7];
-        a4 = pix1[0] - pix1[1];
-        a5 = pix1[2] - pix1[3];
-        a6 = pix1[4] - pix1[5];
-        a7 = pix1[6] - pix1[7];
-        tmp[i][0] = (a0 + a1) + (a2 + a3);
-        tmp[i][1] = (a0 + a1) - (a2 + a3);
-        tmp[i][2] = (a0 - a1) + (a2 - a3);
-        tmp[i][3] = (a0 - a1) - (a2 - a3);
-        tmp[i][4] = (a4 + a5) + (a6 + a7);
-        tmp[i][5] = (a4 + a5) - (a6 + a7);
-        tmp[i][6] = (a4 - a5) + (a6 - a7);
-        tmp[i][7] = (a4 - a5) - (a6 - a7);
-    }
-
-    for (int i = 0; i < 8; i++)
-    {
-        a0 = (tmp[0][i] + tmp[1][i]) + (tmp[2][i] + tmp[3][i]);
-        a2 = (tmp[0][i] + tmp[1][i]) - (tmp[2][i] + tmp[3][i]);
-        a1 = (tmp[0][i] - tmp[1][i]) + (tmp[2][i] - tmp[3][i]);
-        a3 = (tmp[0][i] - tmp[1][i]) - (tmp[2][i] - tmp[3][i]);
-        a4 = (tmp[4][i] + tmp[5][i]) + (tmp[6][i] + tmp[7][i]);
-        a6 = (tmp[4][i] + tmp[5][i]) - (tmp[6][i] + tmp[7][i]);
-        a5 = (tmp[4][i] - tmp[5][i]) + (tmp[6][i] - tmp[7][i]);
-        a7 = (tmp[4][i] - tmp[5][i]) - (tmp[6][i] - tmp[7][i]);
-        a0 = abs(a0 + a4) + abs(a0 - a4);
-        a0 += abs(a1 + a5) + abs(a1 - a5);
-        a0 += abs(a2 + a6) + abs(a2 - a6);
-        a0 += abs(a3 + a7) + abs(a3 - a7);
-        sum += a0;
-    }
-
-    return (int)sum;
-}
-
-static int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
-{
-    return (int)((_sa8d_8x8(pix1, i_pix1) + 2) >> 2);
-}
-
 static int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
 {
     int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
@@ -403,9 +322,9 @@
 }
 
 template<int size>
-int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
+sse_t pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
 {
-    int sum = 0;
+    sse_t sum = 0;
     for (int y = 0; y < size; y++)
     {
         for (int x = 0; x < size; x++)
@@ -783,39 +702,6 @@
     }
 }
 
-template<int size>
-int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
-{
-    static int16_t zeroBuf[8] /* = { 0 } */;
-
-    if (size)
-    {
-        int dim = 1 << (size + 2);
-        uint32_t totEnergy = 0;
-        for (int i = 0; i < dim; i += 8)
-        {
-            for (int j = 0; j < dim; j+= 8)
-            {
-                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
-                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride) - 
-                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
-                int reconEnergy =  sa8d_8x8(recon + i * rstride + j, rstride) - 
-                                   (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
-
-                totEnergy += abs(sourceEnergy - reconEnergy);
-            }
-        }
-        return totEnergy;
-    }
-    else
-    {
-        /* 4x4 is too small for sa8d */
-        int sourceEnergy = satd_4x4(source, sstride) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
-        int reconEnergy = satd_4x4(recon, rstride) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
-        return abs(sourceEnergy - reconEnergy);
-    }
-}
-
 template<int bx, int by>
 void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
 {
@@ -960,19 +846,57 @@
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given CU. */
 static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
-                             const int32_t* invQscales, const double* fpsFactor, int len)
+                                    const int32_t* invQscales, const double* fpsFactor, int len)
 {
-    double fps = *fpsFactor / 256;
+    double fps = *fpsFactor / 256;  // range[0.01, 1.00]
 
     for (int i = 0; i < len; i++)
     {
-        double intraCost       = intraCosts[i] * invQscales[i];
-        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
-        double propagateNum    = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
-        double propagateDenom  = (double)intraCosts[i];
+        int intraCost = intraCosts[i];
+        int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
+        double propagateIntra  = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
+        double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
+        double propagateNum    = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
+
+#if 0
+        // algorithm that output match to asm
+        float intraRcp = (float)1.0f / intraCost;   // VC can't mapping this into RCPPS
+        float intraRcpError1 = (float)intraCost * (float)intraRcp;
+        intraRcpError1 *= (float)intraRcp;
+        float intraRcpError2 = intraRcp + intraRcp;
+        float propagateDenom = intraRcpError2 - intraRcpError1;
+        dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
+#else
+        double propagateDenom  = (double)intraCost;             // Q32

x265_1.8.tar.gz/source/common/predict.cpp -> x265_1.9.tar.gz/source/common/predict.cpp Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -98,7 +99,7 @@
 
         if (cu.m_slice->m_pps->bUseWeightPred && wp0->bPresentFlag)
         {
-            for (int plane = 0; plane < 3; plane++)
+            for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
             {
                 wv0[plane].w      = wp0[plane].inputWeight;
                 wv0[plane].offset = wp0[plane].inputOffset * (1 << (X265_DEPTH - 8));
@@ -109,18 +110,18 @@
             ShortYuv& shortYuv = m_predShortYuv[0];
 
             if (bLuma)
-                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
             if (bChroma)
-                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
 
             addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
         }
         else
         {
             if (bLuma)
-                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
             if (bChroma)
-                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
         }
     }
     else
@@ -141,7 +142,7 @@
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
             {
                 /* biprediction weighting */
-                for (int plane = 0; plane < 3; plane++)
+                for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
                 {
                     wv0[plane].w = pwp0[plane].inputWeight;
                     wv0[plane].o = pwp0[plane].inputOffset * (1 << (X265_DEPTH - 8));
@@ -158,7 +159,7 @@
             {
                 /* uniprediction weighting, always outputs to wv0 */
                 const WeightParam* pwp = (refIdx0 >= 0) ? pwp0 : pwp1;
-                for (int plane = 0; plane < 3; plane++)
+                for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
                 {
                     wv0[plane].w = pwp[plane].inputWeight;
                     wv0[plane].offset = pwp[plane].inputOffset * (1 << (X265_DEPTH - 8));
@@ -179,13 +180,13 @@
 
             if (bLuma)
             {
-                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
-                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
+                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
             }
             if (bChroma)
             {
-                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
-                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
+                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
             }
 
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
@@ -203,18 +204,18 @@
                 ShortYuv& shortYuv = m_predShortYuv[0];
 
                 if (bLuma)
-                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
                 if (bChroma)
-                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
 
                 addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
                 if (bChroma)
-                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
             }
         }
         else
@@ -230,18 +231,18 @@
                 ShortYuv& shortYuv = m_predShortYuv[0];
 
                 if (bLuma)
-                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
                 if (bChroma)
-                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
 
                 addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
                 if (bChroma)
-                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
             }
         }
     }
@@ -600,8 +601,9 @@
     int tuSize = 1 << intraNeighbors.log2TrSize;
     int tuSize2 = tuSize << 1;
 
-    pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
-    intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
+    PicYuv* reconPic = cu.m_encData->m_reconPic;
+    pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
+    intptr_t picStride = reconPic->m_stride;
 
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
 
@@ -648,8 +650,9 @@
 
 void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
 {
-    const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
-    intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
+    PicYuv* reconPic = cu.m_encData->m_reconPic;
+    const pixel* adiOrigin = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
+    intptr_t picStride = reconPic->m_strideC;
 
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);

x265_1.8.tar.gz/source/common/predict.h -> x265_1.9.tar.gz/source/common/predict.h Changed

x265_1.8.tar.gz/source/common/primitives.h -> x265_1.9.tar.gz/source/common/primitives.h Changed

@@ -112,9 +112,9 @@
 
 typedef int  (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
 typedef int  (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
-typedef sse_ret_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
-typedef sse_ret_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
-typedef int  (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
+typedef sse_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
+typedef sse_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
+typedef sse_t (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
 typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
 typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
 typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
@@ -176,15 +176,16 @@
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 
-typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsBO_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE0_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE1_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE2_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE3_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
 
@@ -195,6 +196,8 @@
 typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
 typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
 
+typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -259,7 +262,6 @@
         pixel_sse_t     sse_pp;        // Sum of Square Error (pixel, pixel) fenc alignment not assumed
         pixel_sse_ss_t  sse_ss;        // Sum of Square Error (short, short) fenc alignment not assumed
         pixelcmp_t      psy_cost_pp;   // difference in AC energy between two pixel blocks
-        pixelcmp_ss_t   psy_cost_ss;   // difference in AC energy between two signed residual blocks
         pixel_ssd_s_t   ssd_s;         // Sum of Square Error (residual coeff to self)
         pixelcmp_t      sa8d;          // Sum of Transformed Differences (8x8 Hadamard), uses satd for 4x4 intra TU
 
@@ -316,6 +318,7 @@
     planecopy_cp_t        planecopy_cp;
     planecopy_sp_t        planecopy_sp;
     planecopy_sp_t        planecopy_sp_shl;
+    planeClipAndMax_t     planeClipAndMax;
 
     weightp_sp_t          weight_sp;
     weightp_pp_t          weight_pp;
@@ -328,6 +331,7 @@
     costCoeffRemain_t     costCoeffRemain;
     costC1C2Flag_t        costC1C2Flag;
 
+    pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1
 
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry

x265_1.8.tar.gz/source/common/quant.cpp -> x265_1.9.tar.gz/source/common/quant.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -50,9 +51,8 @@
     return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
 }
 
-inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, uint32_t c1c2Idx)
+inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, const uint32_t c1c2Rate)
 {
-    X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
     X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
     if (!absLevel)
     {
@@ -94,12 +94,7 @@
         uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);
 
         rate += numBins << 15;
-
-        if (c1c2Idx & 1)
-            rate += greaterOneBits[1];
-
-        if (c1c2Idx == 3)
-            rate += levelAbsBits[1];
+        rate += c1c2Rate;
     }
     return rate;
 }
@@ -140,7 +135,7 @@
 }
 
 /* Calculates the cost for specific absolute transform level */
-inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
+inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, const uint32_t c1c2Rate)
 {
     X265_CHECK(absLevel, "absLevel should not be zero\n");
 
@@ -175,16 +170,15 @@
 
             rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15;
         }
-        if (c1c2Idx & 1)
-            rate += greaterOneBits[1];
-        if (c1c2Idx == 3)
-            rate += levelAbsBits[1];
+        rate += c1c2Rate;
         return rate;
     }
 }
 
 }
 
+Quant::rdoQuant_t Quant::rdoQuant_func[NUM_CU_DEPTH] = {&Quant::rdoQuant<2>, &Quant::rdoQuant<3>, &Quant::rdoQuant<4>, &Quant::rdoQuant<5>};
+
 Quant::Quant()
 {
     m_resiDctCoeff = NULL;
@@ -229,8 +223,11 @@
 {
     m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
     m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
-    setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
-    setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
+    if (ctu.m_chromaFormat != X265_CSP_I400)
+    {
+        setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
+        setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
+    }
 }
 
 void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
@@ -444,18 +441,18 @@
             primitives.cu[sizeIdx].dct(m_fencShortBuf, m_fencDctCoeff, trSize);
         }
 
-        if (m_nr)
+        if (m_nr && m_nr->offset)
         {
             /* denoise is not applied to intra residual, so DST can be ignored */
             int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
             int numCoeff = 1 << (log2TrSize * 2);
-            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
+            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offset[cat], numCoeff);
             m_nr->count[cat]++;
         }
     }
 
     if (m_rdoqLevel)
-        return rdoQuant(cu, coeff, log2TrSize, ttype, absPartIdx, usePsy);
+        return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy);
     else
     {
         int deltaU[32 * 32];
@@ -550,9 +547,10 @@
 
 /* Rate distortion optimized quantization for entropy coding engines using
  * probability models like CABAC */
-uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
+template<uint32_t log2TrSize>
+uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy)
 {
-    int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
     int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
     const uint32_t usePsyMask = usePsy ? -1 : 0;
 
@@ -564,13 +562,13 @@
     int add = (1 << (qbits - 1));
     const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
 
-    int numCoeff = 1 << (log2TrSize * 2);
+    const int numCoeff = 1 << (log2TrSize * 2);
     uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
     X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
     if (!numSig)
         return 0;
 
-    uint32_t trSize = 1 << log2TrSize;
+    const uint32_t trSize = 1 << log2TrSize;
     int64_t lambda2 = m_qpParam[ttype].lambda2;
     const int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
 
@@ -580,20 +578,20 @@
     const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
     int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
     int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
-    int scaleBits = SCALE_BITS - 2 * transformShift;
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
 
 #define UNQUANT(lvl)    (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
 #define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
 #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
 #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
 
-    int64_t costCoeff[32 * 32];   /* d*d + lambda * bits */
-    int64_t costUncoded[32 * 32]; /* d*d + lambda * 0    */
-    int64_t costSig[32 * 32];     /* lambda * bits       */
+    int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
+    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
+    int64_t costSig[trSize * trSize];     /* lambda * bits       */
 
-    int rateIncUp[32 * 32];      /* signal overhead of increasing level */
-    int rateIncDown[32 * 32];    /* signal overhead of decreasing level */
-    int sigRateDelta[32 * 32];   /* signal difference between zero and non-zero */
+    int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
+    int rateIncDown[trSize * trSize];    /* signal overhead of decreasing level */
+    int sigRateDelta[trSize * trSize];   /* signal difference between zero and non-zero */
 
     int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */
     uint64_t sigCoeffGroupFlag64 = 0;
@@ -611,7 +609,8 @@
 
     TUEntropyCodingParameters codeParams;
     cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
-    const uint32_t cgNum = 1 << (codeParams.log2TrSizeCG * 2);
+    const uint32_t log2TrSizeCG = log2TrSize - 2;
+    const uint32_t cgNum = 1 << (log2TrSizeCG * 2);
     const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE);
 
     uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
@@ -742,8 +741,8 @@
     {
         uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0;
         const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos];
-        const uint32_t cgPosY   = cgBlkPos >> codeParams.log2TrSizeCG;
-        const uint32_t cgPosX   = cgBlkPos - (cgPosY << codeParams.log2TrSizeCG);
+        const uint32_t cgPosY   = cgBlkPos >> log2TrSizeCG;
+        const uint32_t cgPosX   = cgBlkPos & ((1 << log2TrSizeCG) - 1);
         const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
         const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
         const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
@@ -829,6 +828,7 @@
         uint32_t subFlagMask = coeffFlag[cgScanPos];
         int    c2            = 0;
         uint32_t goRiceParam = 0;
+        uint32_t levelThreshold = 3;
         uint32_t c1Idx       = 0;
         uint32_t c2Idx       = 0;
         /* iterate over coefficients in each group in reverse scan order */
@@ -836,7 +836,7 @@
         {
             scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
             uint32_t blkPos      = codeParams.scan[scanPos];
-            uint32_t maxAbsLevel = abs(dstCoeff[blkPos]);             /* abs(quantized coeff) */
+            uint32_t maxAbsLevel = dstCoeff[blkPos];                  /* abs(quantized coeff) */
             int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
             int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
 
@@ -855,7 +855,11 @@
 
             // coefficient level estimation
             const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
-            const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
+            //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
+            static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL};

x265_1.8.tar.gz/source/common/quant.h -> x265_1.9.tar.gz/source/common/quant.h Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -59,18 +60,18 @@
     }
 };
 
-#define MAX_NUM_TR_COEFFS        MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
-#define MAX_NUM_TR_CATEGORIES    16                        /* 32, 16, 8, 4 transform categories each for luma and chroma */
-
 // NOTE: MUST be 16-byte aligned for asm code
 struct NoiseReduction
 {
     /* 0 = luma 4x4,   1 = luma 8x8,   2 = luma 16x16,   3 = luma 32x32
      * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32
      * Intra 0..7 - Inter 8..15 */
-    ALIGN_VAR_16(uint32_t, residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
-    uint32_t count[MAX_NUM_TR_CATEGORIES];
-    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+    ALIGN_VAR_16(uint32_t, nrResidualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
+    uint32_t nrCount[MAX_NUM_TR_CATEGORIES];
+    uint16_t nrOffsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+    uint16_t (*offset)[MAX_NUM_TR_COEFFS];
+    uint32_t (*residualSum)[MAX_NUM_TR_COEFFS];
+    uint32_t *count;
 };
 
 class Quant
@@ -125,8 +126,8 @@
         const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (cgBlkPos + 1)); // just need lowest 7-bits valid
 
         // TODO: instruction BT is faster, but _bittest64 still generate instruction 'BT m, r' in VS2012
-        const uint32_t sigRight = ((uint32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
-        const uint32_t sigLower = ((uint32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
+        const uint32_t sigRight = (cgPosX != (trSizeCG - 1)) & sigPos;
+        const uint32_t sigLower = (cgPosY != (trSizeCG - 1)) & (sigPos >> (trSizeCG - 1));
         return sigRight + sigLower * 2;
     }
 
@@ -136,8 +137,8 @@
         X265_CHECK(cgBlkPos < 64, "cgBlkPos is too large\n");
         // NOTE: unsafe shift operator, see NOTE in calcPatternSigCtx
         const uint32_t sigPos = (uint32_t)(cgGroupMask >> (cgBlkPos + 1)); // just need lowest 8-bits valid
-        const uint32_t sigRight = ((uint32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
-        const uint32_t sigLower = ((uint32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
+        const uint32_t sigRight = (cgPosX != (trSizeCG - 1)) & sigPos;
+        const uint32_t sigLower = (cgPosY != (trSizeCG - 1)) & (sigPos >> (trSizeCG - 1));
 
         return (sigRight | sigLower);
     }
@@ -151,7 +152,14 @@
 
     uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters, uint32_t log2TrSize);
 
-    uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy);
+    template<uint32_t log2TrSize>
+    uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy);
+
+public:
+    typedef uint32_t (Quant::*rdoQuant_t)(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy);
+
+private:
+    static rdoQuant_t rdoQuant_func[NUM_CU_DEPTH];
 };
 }

x265_1.8.tar.gz/source/common/shortyuv.cpp -> x265_1.9.tar.gz/source/common/shortyuv.cpp Changed

@@ -40,19 +40,26 @@
 bool ShortYuv::create(uint32_t size, int csp)
 {
     m_csp = csp;
+    m_size = size;
     m_hChromaShift = CHROMA_H_SHIFT(csp);
     m_vChromaShift = CHROMA_V_SHIFT(csp);
-
-    m_size = size;
-    m_csize = size >> m_hChromaShift;
-
     size_t sizeL = size * size;
-    size_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
-    X265_CHECK((sizeC & 15) == 0, "invalid size");
 
-    CHECKED_MALLOC(m_buf[0], int16_t, sizeL + sizeC * 2);
-    m_buf[1] = m_buf[0] + sizeL;
-    m_buf[2] = m_buf[0] + sizeL + sizeC;
+    if (csp != X265_CSP_I400)
+    {
+        m_csize = size >> m_hChromaShift;
+        size_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
+        X265_CHECK((sizeC & 15) == 0, "invalid size");
+
+        CHECKED_MALLOC(m_buf[0], int16_t, sizeL + sizeC * 2);
+        m_buf[1] = m_buf[0] + sizeL;
+        m_buf[2] = m_buf[0] + sizeL + sizeC;
+    }
+    else
+    {
+        CHECKED_MALLOC(m_buf[0], int16_t, sizeL);
+        m_buf[1] = m_buf[2] = NULL;
+    }
     return true;
 
 fail:
@@ -75,8 +82,11 @@
 {
     const int sizeIdx = log2Size - 2;
     primitives.cu[sizeIdx].sub_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
-    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
-    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
+        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+    }
 }
 
 void ShortYuv::copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const

x265_1.8.tar.gz/source/common/slice.cpp -> x265_1.9.tar.gz/source/common/slice.cpp Changed

@@ -33,7 +33,9 @@
 {
     if (m_sliceType == I_SLICE)
     {
-        memset(m_refPicList, 0, sizeof(m_refPicList));
+        memset(m_refFrameList, 0, sizeof(m_refFrameList));
+        memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
+        memset(m_refPOCList, 0, sizeof(m_refPOCList));
         m_numRefIdx[1] = m_numRefIdx[0] = 0;
         return;
     }
@@ -106,13 +108,13 @@
     {
         cIdx = rIdx % numPocTotalCurr;
         X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
-        m_refPicList[0][rIdx] = rpsCurrList0[cIdx];
+        m_refFrameList[0][rIdx] = rpsCurrList0[cIdx];
     }
 
     if (m_sliceType != B_SLICE)
     {
         m_numRefIdx[1] = 0;
-        memset(m_refPicList[1], 0, sizeof(m_refPicList[1]));
+        memset(m_refFrameList[1], 0, sizeof(m_refFrameList[1]));
     }
     else
     {
@@ -120,13 +122,13 @@
         {
             cIdx = rIdx % numPocTotalCurr;
             X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
-            m_refPicList[1][rIdx] = rpsCurrList1[cIdx];
+            m_refFrameList[1][rIdx] = rpsCurrList1[cIdx];
         }
     }
 
     for (int dir = 0; dir < 2; dir++)
         for (int numRefIdx = 0; numRefIdx < m_numRefIdx[dir]; numRefIdx++)
-            m_refPOCList[dir][numRefIdx] = m_refPicList[dir][numRefIdx]->m_poc;
+            m_refPOCList[dir][numRefIdx] = m_refFrameList[dir][numRefIdx]->m_poc;
 }
 
 void Slice::disableWeights()

x265_1.8.tar.gz/source/common/slice.h -> x265_1.9.tar.gz/source/common/slice.h Changed

@@ -31,6 +31,7 @@
 
 class Frame;
 class PicList;
+class PicYuv;
 class MotionReference;
 
 enum SliceType
@@ -104,6 +105,12 @@
 
 struct ProfileTierLevel
 {
+    int      profileIdc;
+    int      levelIdc;
+    uint32_t minCrForLevel;
+    uint32_t maxLumaSrForLevel;
+    uint32_t bitDepthConstraint;
+    int      chromaFormatConstraint;
     bool     tierFlag;
     bool     progressiveSourceFlag;
     bool     interlacedSourceFlag;
@@ -113,12 +120,6 @@
     bool     intraConstraintFlag;
     bool     onePictureOnlyConstraintFlag;
     bool     lowerBitRateConstraintFlag;
-    int      profileIdc;
-    int      levelIdc;
-    uint32_t minCrForLevel;
-    uint32_t maxLumaSrForLevel;
-    uint32_t bitDepthConstraint;
-    int      chromaFormatConstraint;
 };
 
 struct HRDInfo
@@ -151,21 +152,21 @@
 
 struct VPS
 {
+    HRDInfo          hrdParameters;
+    ProfileTierLevel ptl;
     uint32_t         maxTempSubLayers;
     uint32_t         numReorderPics;
     uint32_t         maxDecPicBuffering;
     uint32_t         maxLatencyIncrease;
-    HRDInfo          hrdParameters;
-    ProfileTierLevel ptl;
 };
 
 struct Window
 {
-    bool bEnabled;
     int  leftOffset;
     int  rightOffset;
     int  topOffset;
     int  bottomOffset;
+    bool bEnabled;
 
     Window()
     {
@@ -175,40 +176,41 @@
 
 struct VUI
 {
-    bool       aspectRatioInfoPresentFlag;
     int        aspectRatioIdc;
     int        sarWidth;
     int        sarHeight;
-
-    bool       overscanInfoPresentFlag;
-    bool       overscanAppropriateFlag;
-
-    bool       videoSignalTypePresentFlag;
     int        videoFormat;
-    bool       videoFullRangeFlag;
-
-    bool       colourDescriptionPresentFlag;
     int        colourPrimaries;
     int        transferCharacteristics;
     int        matrixCoefficients;
-
-    bool       chromaLocInfoPresentFlag;
     int        chromaSampleLocTypeTopField;
     int        chromaSampleLocTypeBottomField;
 
-    Window     defaultDisplayWindow;
-
+    bool       aspectRatioInfoPresentFlag;
+    bool       overscanInfoPresentFlag;
+    bool       overscanAppropriateFlag;
+    bool       videoSignalTypePresentFlag;
+    bool       videoFullRangeFlag;
+    bool       colourDescriptionPresentFlag;
+    bool       chromaLocInfoPresentFlag;
     bool       frameFieldInfoPresentFlag;
     bool       fieldSeqFlag;
-
     bool       hrdParametersPresentFlag;
-    HRDInfo    hrdParameters;
 
+    HRDInfo    hrdParameters;
+    Window     defaultDisplayWindow;
     TimingInfo timingInfo;
 };
 
 struct SPS
 {
+    /* cached PicYuv offset arrays, shared by all instances of
+     * PicYuv created by this encoder */
+    intptr_t* cuOffsetY;
+    intptr_t* cuOffsetC;
+    intptr_t* buOffsetY;
+    intptr_t* buOffsetC;
+
     int      chromaFormatIdc;        // use param
     uint32_t picWidthInLumaSamples;  // use param
     uint32_t picHeightInLumaSamples; // use param
@@ -228,8 +230,6 @@
     uint32_t quadtreeTUMaxDepthInter; // use param
     uint32_t quadtreeTUMaxDepthIntra; // use param
 
-    bool     bUseSAO; // use param
-    bool     bUseAMP; // use param
     uint32_t maxAMPDepth;
 
     uint32_t maxTempSubLayers;   // max number of Temporal Sub layers
@@ -237,11 +237,26 @@
     uint32_t maxLatencyIncrease;
     int      numReorderPics;
 
+    bool     bUseSAO; // use param
+    bool     bUseAMP; // use param
     bool     bUseStrongIntraSmoothing; // use param
     bool     bTemporalMVPEnabled;
 
     Window   conformanceWindow;
     VUI      vuiParameters;
+
+    SPS()
+    {
+        memset(this, 0, sizeof(*this));
+    }
+
+    ~SPS()
+    {
+        X265_FREE(cuOffsetY);
+        X265_FREE(cuOffsetC);
+        X265_FREE(buOffsetY);
+        X265_FREE(buOffsetC);
+    }
 };
 
 struct PPS
@@ -249,6 +264,8 @@
     uint32_t maxCuDQPDepth;
 
     int      chromaQpOffset[2];      // use param
+    int      deblockingFilterBetaOffsetDiv2;
+    int      deblockingFilterTcOffsetDiv2;
 
     bool     bUseWeightPred;         // use param
     bool     bUseWeightedBiPred;     // use param
@@ -262,17 +279,15 @@
 
     bool     bDeblockingFilterControlPresent;
     bool     bPicDisableDeblockingFilter;
-    int      deblockingFilterBetaOffsetDiv2;
-    int      deblockingFilterTcOffsetDiv2;
 };
 
 struct WeightParam
 {
     // Explicit weighted prediction parameters parsed in slice header,
-    bool     bPresentFlag;
     uint32_t log2WeightDenom;
     int      inputWeight;
     int      inputOffset;
+    bool     bPresentFlag;
 
     /* makes a non-h265 weight (i.e. fix7), into an h265 weight */
     void setFromWeightAndOffset(int w, int o, int denom, bool bNormalize)
@@ -304,6 +319,9 @@
 
     const SPS*  m_sps;
     const PPS*  m_pps;
+    Frame*      m_refFrameList[2][MAX_NUM_REF + 1];
+    PicYuv*     m_refReconPicList[2][MAX_NUM_REF + 1];
+
     WeightParam m_weightPredTable[2][MAX_NUM_REF][3]; // [list][refIdx][0:Y, 1:U, 2:V]
     MotionReference (*m_mref)[MAX_NUM_REF + 1];
     RPS         m_rps;
@@ -312,34 +330,28 @@
     SliceType   m_sliceType;
     int         m_sliceQp;
     int         m_poc;
-    
     int         m_lastIDR;
 
-    bool        m_bCheckLDC;       // TODO: is this necessary?
-    bool        m_sLFaseFlag;      // loop filter boundary flag
-    bool        m_colFromL0Flag;   // collocated picture from List0 or List1 flag

x265_1.8.tar.gz/source/common/threading.h -> x265_1.9.tar.gz/source/common/threading.h Changed

x265_1.8.tar.gz/source/common/threadpool.cpp -> x265_1.9.tar.gz/source/common/threadpool.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -59,6 +60,9 @@
 #if HAVE_LIBNUMA
 #include <numa.h>
 #endif
+#if defined(_MSC_VER)
+# define strcasecmp _stricmp
+#endif
 
 namespace X265_NS {
 // x265 private namespace
@@ -226,8 +230,13 @@
 {
     enum { MAX_NODE_NUM = 127 };
     int cpusPerNode[MAX_NODE_NUM + 1];
+    int threadsPerPool[MAX_NODE_NUM + 2];
+    uint64_t nodeMaskPerPool[MAX_NODE_NUM + 2];
 
     memset(cpusPerNode, 0, sizeof(cpusPerNode));
+    memset(threadsPerPool, 0, sizeof(threadsPerPool));
+    memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
+
     int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
     int cpuCount = getCpuCount();
     bool bNumaSupport = false;
@@ -258,7 +267,7 @@
         for (int i = 0; i < numNumaNodes; i++)
             x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
 
-    /* limit nodes based on param->numaPools */
+    /* limit threads based on param->numaPools */
     if (p->numaPools && *p->numaPools)
     {
         const char *nodeStr = p->numaPools;
@@ -266,19 +275,30 @@
         {
             if (!*nodeStr)
             {
-                cpusPerNode[i] = 0;
+                threadsPerPool[i] = 0;
                 continue;
             }
             else if (*nodeStr == '-')
-                cpusPerNode[i] = 0;
-            else if (*nodeStr == '*')
+                threadsPerPool[i] = 0;
+			else if (*nodeStr == '*' || !strcasecmp(nodeStr, "NULL"))
+            {
+                for (int j = i; j < numNumaNodes; j++)
+                {
+                    threadsPerPool[numNumaNodes] += cpusPerNode[j];
+                    nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << j);
+                }
                 break;
+            }
             else if (*nodeStr == '+')
-                ;
+            {
+                threadsPerPool[numNumaNodes] += cpusPerNode[i];
+                nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
+            }
             else
             {
                 int count = atoi(nodeStr);
-                cpusPerNode[i] = X265_MIN(count, cpusPerNode[i]);
+                threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
+                nodeMaskPerPool[i] = ((uint64_t)1 << i);
             }
 
             /* consume current node string, comma, and white-space */
@@ -288,14 +308,31 @@
                ++nodeStr;
         }
     }
+    else
+    {
+        for (int i = 0; i < numNumaNodes; i++)
+        {
+            threadsPerPool[numNumaNodes]  += cpusPerNode[i];
+            nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
+        }
+    }
+ 
+    // If the last pool size is > MAX_POOL_THREADS, clip it to spawn thread pools only of size >= 1/2 max (heuristic)
+    if ((threadsPerPool[numNumaNodes] > MAX_POOL_THREADS) &&
+        ((threadsPerPool[numNumaNodes] % MAX_POOL_THREADS) < (MAX_POOL_THREADS / 2)))
+    {
+        threadsPerPool[numNumaNodes] -= (threadsPerPool[numNumaNodes] % MAX_POOL_THREADS);
+        x265_log(p, X265_LOG_DEBUG,
+                 "Creating only %d worker threads beyond specified numbers with --pools (if specified) to prevent asymmetry in pools; may not use all HW contexts\n", threadsPerPool[numNumaNodes]);
+    }
 
     numPools = 0;
-    for (int i = 0; i < numNumaNodes; i++)
+    for (int i = 0; i < numNumaNodes + 1; i++)
     {
         if (bNumaSupport)
             x265_log(p, X265_LOG_DEBUG, "NUMA node %d may use %d logical cores\n", i, cpusPerNode[i]);
-        if (cpusPerNode[i])
-            numPools += (cpusPerNode[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
+        if (threadsPerPool[i])
+            numPools += (threadsPerPool[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
     }
 
     if (!numPools)
@@ -314,20 +351,27 @@
         int node = 0;
         for (int i = 0; i < numPools; i++)
         {
-            while (!cpusPerNode[node])
+            while (!threadsPerPool[node])
                 node++;
-            int cores = X265_MIN(MAX_POOL_THREADS, cpusPerNode[node]);
-            if (!pools[i].create(cores, maxProviders, node))
+            int numThreads = X265_MIN(MAX_POOL_THREADS, threadsPerPool[node]);
+            if (!pools[i].create(numThreads, maxProviders, nodeMaskPerPool[node]))
             {
                 X265_FREE(pools);
                 numPools = 0;
                 return NULL;
             }
             if (numNumaNodes > 1)
-                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on NUMA node %d\n", i, cores, node);
+            {
+                char *nodesstr = new char[64 * strlen(",63") + 1];
+                int len = 0;
+                for (int j = 0; j < 64; j++)
+                    if ((nodeMaskPerPool[node] >> j) & 1)
+                        len += sprintf(nodesstr + len, ",%d", j);
+                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on numa nodes %s\n", i, numThreads, nodesstr + 1);
+            }
             else
-                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", cores);
-            cpusPerNode[node] -= cores;
+                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", numThreads);
+            threadsPerPool[node] -= numThreads;
         }
     }
     else
@@ -340,11 +384,37 @@
     memset(this, 0, sizeof(*this));
 }
 
-bool ThreadPool::create(int numThreads, int maxProviders, int node)
+bool ThreadPool::create(int numThreads, int maxProviders, uint64_t nodeMask)
 {
     X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
 
-    m_numaNode = node;
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
+    m_winCpuMask = 0x0;
+    GROUP_AFFINITY groupAffinity;
+    for (int i = 0; i < getNumaNodeCount(); i++)
+    {
+        int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1;
+        if (numaNode != -1)
+            if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
+                m_winCpuMask |= groupAffinity.Mask;
+    }
+    m_numaMask = &m_winCpuMask;
+#elif HAVE_LIBNUMA
+    if (numa_available() >= 0)
+    {
+        struct bitmask* nodemask = numa_allocate_nodemask();
+        if (nodemask)
+        {
+            *(nodemask->maskp) = nodeMask;
+            m_numaMask = nodemask;
+        }
+        else
+            x265_log(NULL, X265_LOG_ERROR, "unable to get NUMA node mask for %lx\n", nodeMask);
+    }
+#else
+    (void)nodeMask;
+#endif
+
     m_numWorkers = numThreads;
 
     m_workers = X265_MALLOC(WorkerThread, numThreads);
@@ -398,36 +468,39 @@
 
     X265_FREE(m_workers);
     X265_FREE(m_jpTable);
+
+#if HAVE_LIBNUMA
+    if(m_numaMask)
+        numa_free_nodemask((struct bitmask*)m_numaMask);
+#endif
 }
 
 void ThreadPool::setCurrentThreadAffinity()
 {
-    setThreadNodeAffinity(m_numaNode);

x265_1.8.tar.gz/source/common/threadpool.h -> x265_1.9.tar.gz/source/common/threadpool.h Changed

x265_1.8.tar.gz/source/common/version.cpp -> x265_1.9.tar.gz/source/common/version.cpp Changed

x265_1.8.tar.gz/source/common/wavefront.cpp -> x265_1.9.tar.gz/source/common/wavefront.cpp Changed

x265_1.8.tar.gz/source/common/wavefront.h -> x265_1.9.tar.gz/source/common/wavefront.h Changed

x265_1.8.tar.gz/source/common/x86/asm-primitives.cpp -> x265_1.9.tar.gz/source/common/x86/asm-primitives.cpp Changed

@@ -962,11 +962,8 @@
 
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse2);
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse2);
-
-#if X265_DEPTH <= 10
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse2);
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse2);
-#endif /* X265_DEPTH <= 10 */
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
 
         p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
@@ -1003,13 +1000,12 @@
         p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
 
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
-#if X265_DEPTH <= 10
-        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
-        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
-
         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
+#if X265_DEPTH <= 10
+        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
+        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
 #endif
         p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
         p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
@@ -1031,6 +1027,7 @@
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
+        p.propagateCost = PFX(mbtree_propagate_cost_sse2);
     }
     if (cpuMask & X265_CPU_SSE3)
     {
@@ -1144,11 +1141,8 @@
 
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4);
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4);
-
-#if X265_DEPTH <= 10
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4);
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse4);
-#endif
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
         INTRA_ANG_SSE4_COMMON(sse4);
         INTRA_ANG_SSE4_HIGH(sse4);
@@ -1158,14 +1152,12 @@
         p.weight_sp = PFX(weight_sp_sse4);
 
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_sse4);
-        p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_sse4);
 
         // TODO: check POPCNT flag!
         ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
 #if X265_DEPTH <= 10
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
 #endif
-        ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
 
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
@@ -1173,6 +1165,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
+        p.costCoeffRemain = PFX(costCoeffRemain_sse4);
     }
     if (cpuMask & X265_CPU_AVX)
     {
@@ -1306,6 +1299,7 @@
         p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
         p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
+        p.propagateCost = PFX(mbtree_propagate_cost_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -1319,6 +1313,9 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+#if X265_DEPTH == 12
+        ASSIGN_SA8D(avx2);
+#endif
         p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
 
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it
@@ -1479,20 +1476,14 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx2);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx2);
 
-        p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_avx2);
-        p.cu[BLOCK_8x8].psy_cost_ss = PFX(psyCost_ss_8x8_avx2);
-        p.cu[BLOCK_16x16].psy_cost_ss = PFX(psyCost_ss_16x16_avx2);
-        p.cu[BLOCK_32x32].psy_cost_ss = PFX(psyCost_ss_32x32_avx2);
-        p.cu[BLOCK_64x64].psy_cost_ss = PFX(psyCost_ss_64x64_avx2);
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
-#if X265_DEPTH <= 10
+        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
+        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
+
         p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
         p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
         p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
         p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
-        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
-        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
-#endif
 
         p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
@@ -1536,20 +1527,13 @@
         p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
 
-#if X265_DEPTH <= 10
-        p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_avx2);
-        p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_avx2);
-        p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_avx2);
-
-        p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
-        p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
-        p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx2);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
+        p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
+        p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_16x16_avx2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_avx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_avx2);
-#endif
-
         p.quant = PFX(quant_avx2);
         p.nquant = PFX(nquant_avx2);
         p.dequant_normal  = PFX(dequant_normal_avx2);
@@ -1588,21 +1572,16 @@
         p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx2);
         p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2);
 
-#if X265_DEPTH <= 10
-        ALL_LUMA_TU_S(dct, dct, avx2);
         ALL_LUMA_TU_S(idct, idct, avx2);
-#endif
+        ALL_LUMA_TU_S(dct, dct, avx2);
+
         ALL_LUMA_CU_S(transpose, transpose, avx2);
 
         ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2);
         ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, avx2);
-#if X265_DEPTH <= 10
         ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
-#endif
         ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
-#if X265_DEPTH <= 10
         p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_avx2);               // since ALL_LUMA_PU didn't declare 4x4 size, calling separately luma_vsp function to use 
-#endif
 
         p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
         p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
@@ -1625,7 +1604,6 @@
         p.pu[LUMA_16x12].sad = PFX(pixel_sad_16x12_avx2);
         p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx2);
         p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx2);
-#if X265_DEPTH <= 10
         p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx2);
         p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx2);
         p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx2);
@@ -1637,7 +1615,6 @@
         p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx2);
         p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx2);
         p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx2);
-#endif
 
         p.pu[LUMA_16x4].sad_x3 = PFX(pixel_sad_x3_16x4_avx2);
         p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx2);
@@ -1712,7 +1689,6 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
 
-#if X265_DEPTH <= 10
         p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
         p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
         p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_avx2);
@@ -1738,7 +1714,6 @@
         p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx2);
         p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx2);
         p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_avx2);
-#endif
 
         p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_avx2);
         p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_avx2);
@@ -1766,7 +1741,6 @@
         p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_avx2);
         p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx2);

x265_1.8.tar.gz/source/common/x86/blockcopy8.asm -> x265_1.9.tar.gz/source/common/x86/blockcopy8.asm Changed

x265_1.8.tar.gz/source/common/x86/blockcopy8.h -> x265_1.9.tar.gz/source/common/x86/blockcopy8.h Changed

x265_1.8.tar.gz/source/common/x86/const-a.asm -> x265_1.9.tar.gz/source/common/x86/const-a.asm Changed

@@ -2,6 +2,7 @@
 ;* const-a.asm: x86 global constants
 ;*****************************************************************************
 ;* Copyright (C) 2010-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
@@ -31,10 +32,10 @@
 
 ;; 8-bit constants
 
-const pb_0,                 times 16 db 0
+const pb_0,                 times 32 db 0
 const pb_1,                 times 32 db 1
 const pb_2,                 times 32 db 2
-const pb_3,                 times 16 db 3
+const pb_3,                 times 32 db 3
 const pb_4,                 times 32 db 4
 const pb_8,                 times 32 db 8
 const pb_15,                times 32 db 15
@@ -54,6 +55,11 @@
 const pb_shuf8x8c,          times  1 db   0,   0,   0,   0,   2,   2,   2,   2,   4,   4,   4,   4,   6,   6,   6,   6
 const pb_movemask,          times 16 db 0x00
                             times 16 db 0xFF
+
+const pb_movemask_32,       times 32 db 0x00
+                            times 32 db 0xFF
+                            times 32 db 0x00
+
 const pb_0000000000000F0F,  times  2 db 0xff, 0x00
                             times 12 db 0x00
 const pb_000000000000000F,           db 0xff
@@ -61,6 +67,7 @@
 
 ;; 16-bit constants
 
+const pw_n1,                times 16 dw -1
 const pw_1,                 times 16 dw 1
 const pw_2,                 times 16 dw 2
 const pw_3,                 times 16 dw 3
@@ -86,12 +93,12 @@
 const pw_ff00,              times  8 dw 0xff00
 const pw_2000,              times 16 dw 0x2000
 const pw_8000,              times  8 dw 0x8000
-const pw_3fff,              times  8 dw 0x3fff
+const pw_3fff,              times 16 dw 0x3fff
 const pw_32_0,              times  4 dw 32,
                             times  4 dw 0
 const pw_pixel_max,         times 16 dw ((1 << BIT_DEPTH)-1)
 
-const pw_0_15,              times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
+const pw_0_7,               times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
 const pw_ppppmmmm,          times  1 dw   1,   1,   1,   1,  -1,  -1,  -1,  -1
 const pw_ppmmppmm,          times  1 dw   1,   1,  -1,  -1,   1,   1,  -1,  -1
 const pw_pmpmpmpm,          times 16 dw   1,  -1,   1,  -1,   1,  -1,   1,  -1
@@ -107,6 +114,7 @@
                             times  7 dw 0xff
 const hmul_16p,             times 16 db   1
                             times  8 db   1,  -1
+const pw_exp2_0_15,                  dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768
 
 
 ;; 32-bit constants
@@ -115,8 +123,9 @@
 const pd_2,                 times  8 dd 2
 const pd_4,                 times  4 dd 4
 const pd_8,                 times  4 dd 8
+const pd_15,                times  8 dd 15
 const pd_16,                times  8 dd 16
-const pd_31,                times  4 dd 31
+const pd_31,                times  8 dd 31
 const pd_32,                times  8 dd 32
 const pd_64,                times  4 dd 64
 const pd_128,               times  4 dd 128
@@ -129,7 +138,12 @@
 const pd_524416,            times  4 dd 524416
 const pd_n32768,            times  8 dd 0xffff8000
 const pd_n131072,           times  4 dd 0xfffe0000
-
+const pd_0000ffff,          times  8 dd 0x0000FFFF
+const pd_planar16_mul0,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
+const pd_planar16_mul1,     times  1 dd   1,   2,   3,   4,   5,   6,   7,   8,    9,  10,  11,  12,  13,  14,  15,  16
+const pd_planar32_mul1,     times  1 dd  31,  30,  29,  28,  27,  26,  25,  24,   23,  22,  21,  20,  19,  18,  17,  16
+const pd_planar32_mul2,     times  1 dd  17,  18,  19,  20,  21,  22,  23,  24,   25,  26,  27,  28,  29,  30,  31,  32
+const pd_planar16_mul2,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
 const trans8_shuf,          times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
 
 const popcnt_table

x265_1.8.tar.gz/source/common/x86/cpu-a.asm -> x265_1.9.tar.gz/source/common/x86/cpu-a.asm Changed

x265_1.8.tar.gz/source/common/x86/dct8.asm -> x265_1.9.tar.gz/source/common/x86/dct8.asm Changed

@@ -2115,15 +2115,15 @@
     mova     m0, [r0]
     pabsw    m1, m0
 
-    mova     m2, [r1]
+    movu     m2, [r1]
     pmovsxwd m3, m1
     paddd    m2, m3
-    mova     [r1], m2
-    mova     m2, [r1 + 16]
+    movu     [r1], m2
+    movu     m2, [r1 + 16]
     psrldq   m3, m1, 8
     pmovsxwd m4, m3
     paddd    m2, m4
-    mova     [r1 + 16], m2
+    movu     [r1 + 16], m2
 
     movu     m3, [r2]
     psubusw  m1, m3
@@ -2174,7 +2174,7 @@
     pmaddwd         m0,                 m%4
     phaddd          m2,                 m0
     paddd           m2,                 m5
-    psrad           m2,                 DCT_SHIFT
+    psrad           m2,                 DCT8_SHIFT1
     packssdw        m2,                 m2
     vpermq          m2,                 m2, 0x08
     mova            [r5 + %2],          xm2
@@ -2190,7 +2190,7 @@
     phaddd          m8,                 m9
     phaddd          m6,                 m8
     paddd           m6,                 m5
-    psrad           m6,                 DCT_SHIFT2
+    psrad           m6,                 DCT8_SHIFT2
 
     vbroadcasti128  m4,                 [r6 + %2]
     pmaddwd         m10,                m0, m4
@@ -2201,7 +2201,7 @@
     phaddd          m8,                 m9
     phaddd          m10,                m8
     paddd           m10,                m5
-    psrad           m10,                DCT_SHIFT2
+    psrad           m10,                DCT8_SHIFT2
 
     packssdw        m6,                 m10
     vpermq          m10,                m6, 0xD8
@@ -2210,18 +2210,7 @@
 
 INIT_YMM avx2
 cglobal dct8, 3, 7, 11, 0-8*16
-%if BIT_DEPTH == 12
-    %define         DCT_SHIFT          6
-    vbroadcasti128  m5,                [pd_16]
-%elif BIT_DEPTH == 10
-    %define         DCT_SHIFT          4
-    vbroadcasti128  m5,                [pd_8]
-%elif BIT_DEPTH == 8
-    %define         DCT_SHIFT          2
-    vbroadcasti128  m5,                [pd_2]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
+vbroadcasti128      m5,                [pd_ %+ DCT8_ROUND1]
 %define             DCT_SHIFT2         9
 
     add             r2d,               r2d
@@ -2265,7 +2254,7 @@
     DCT8_PASS_1     7 * 16,             7 * 16, 4, 1
 
     ;pass2
-    vbroadcasti128  m5,                [pd_256]
+    vbroadcasti128  m5,                [pd_ %+ DCT8_ROUND2]
 
     mova            m0,                [r5]
     mova            m1,                [r5 + 32]
@@ -2904,7 +2893,7 @@
 cglobal idct8, 3, 7, 13, 0-8*16
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
-    vpbroadcastd    m12,                [pd_256]
+    vpbroadcastd    m12,                [pd_128]
 %elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m12,                [pd_512]
@@ -3065,7 +3054,7 @@
 cglobal idct16, 3, 7, 16, 0-16*mmsize
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
-    vpbroadcastd    m15,                [pd_256]
+    vpbroadcastd    m15,                [pd_128]
 %elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m15,                [pd_512]
@@ -3487,7 +3476,7 @@
 
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
-    vpbroadcastd    m15,                [pd_256]
+    vpbroadcastd    m15,                [pd_128]
 %elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m15,                [pd_512]
@@ -3651,7 +3640,7 @@
 %define             IDCT_SHIFT1         7
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
-    vpbroadcastd    m5,                [pd_256]
+    vpbroadcastd    m5,                [pd_128]
 %elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m5,                [pd_512]

x265_1.8.tar.gz/source/common/x86/dct8.h -> x265_1.9.tar.gz/source/common/x86/dct8.h Changed

x265_1.8.tar.gz/source/common/x86/intrapred16.asm -> x265_1.9.tar.gz/source/common/x86/intrapred16.asm Changed

@@ -109,9 +109,11 @@
 cextern pw_16
 cextern pw_31
 cextern pw_32
+cextern pd_15
 cextern pd_16
 cextern pd_31
 cextern pd_32
+cextern pd_0000ffff
 cextern pw_4096
 cextern pw_pixel_max
 cextern multiL
@@ -123,7 +125,12 @@
 cextern pb_unpackwq1
 cextern pb_unpackwq2
 cextern pw_planar16_mul
+cextern pd_planar16_mul0
+cextern pd_planar16_mul1
 cextern pw_planar32_mul
+cextern pd_planar32_mul1
+cextern pd_planar32_mul2
+cextern pd_planar16_mul2
 
 ;-----------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
@@ -731,6 +738,117 @@
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
 ;---------------------------------------------------------------------------------------
 INIT_XMM sse2
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar16, 3,5,13
+    add             r1d, r1d
+    pxor            m12, m12
+
+    movu            m2, [r2 + 2]
+    movu            m10, [r2 + 18]
+
+    punpckhwd       m7, m2, m12
+    punpcklwd       m2, m12
+    punpckhwd       m0, m10, m12
+    punpcklwd       m10, m12
+
+    movzx           r3d, word [r2 + 34]                     ; topRight   = above[16]
+    lea             r4, [pd_planar16_mul1]
+
+    movd            m3, r3d
+    pshufd          m3, m3, 0                               ; topRight
+
+    pmaddwd         m8, m3, [r4 + 3*mmsize]                 ; (x + 1) * topRight
+    pmaddwd         m4, m3, [r4 + 2*mmsize]                 ; (x + 1) * topRight
+    pmaddwd         m9, m3, [r4 + 1*mmsize]                 ; (x + 1) * topRight
+    pmaddwd         m3, m3, [r4 + 0*mmsize]                 ; (x + 1) * topRight
+
+    mova            m11, [pd_15]
+    pmaddwd         m1, m2,  m11                            ; (blkSize - 1 - y) * above[x]
+    pmaddwd         m6, m7,  m11                            ; (blkSize - 1 - y) * above[x]
+    pmaddwd         m5, m10, m11                            ; (blkSize - 1 - y) * above[x]
+    pmaddwd         m11, m0                                 ; (blkSize - 1 - y) * above[x]
+
+    paddd           m4, m5
+    paddd           m3, m1
+    paddd           m8, m11
+    paddd           m9, m6
+
+    mova            m5, [pd_16]
+    paddd           m3, m5
+    paddd           m9, m5
+    paddd           m4, m5
+    paddd           m8, m5
+
+    movzx           r4d, word [r2 + 98]                     ; bottomLeft = left[16]
+    movd            m6, r4d
+    pshufd          m6, m6, 0                               ; bottomLeft
+
+    paddd           m4, m6
+    paddd           m3, m6
+    paddd           m8, m6
+    paddd           m9, m6
+
+    psubd           m1, m6, m0                              ; column 12-15
+    psubd           m11, m6, m10                            ; column 8-11
+    psubd           m10, m6, m7                             ; column 4-7
+    psubd           m6, m2                                  ; column 0-3
+
+    add             r2, 66
+    lea             r4, [pd_planar16_mul0]
+
+%macro INTRA_PRED_PLANAR16_sse2 1
+    movzx           r3d, word [r2 + %1*2]
+    movd            m5, r3d
+    pshufd          m5, m5, 0
+
+    pmaddwd         m0, m5, [r4 + 3*mmsize]                 ; column 12-15
+    pmaddwd         m2, m5, [r4 + 2*mmsize]                 ; column 8-11
+    pmaddwd         m7, m5, [r4 + 1*mmsize]                 ; column 4-7
+    pmaddwd         m5, m5, [r4 + 0*mmsize]                 ; column 0-3
+
+    paddd           m0, m8
+    paddd           m2, m4
+    paddd           m7, m9
+    paddd           m5, m3
+
+    paddd           m8, m1
+    paddd           m4, m11
+    paddd           m9, m10
+    paddd           m3, m6
+
+    psrad           m0, 5
+    psrad           m2, 5
+    psrad           m7, 5
+    psrad           m5, 5
+
+    packssdw        m2, m0
+    packssdw        m5, m7
+    movu            [r0], m5
+    movu            [r0 + mmsize], m2
+
+    add             r0, r1
+%endmacro
+
+    INTRA_PRED_PLANAR16_sse2 0
+    INTRA_PRED_PLANAR16_sse2 1
+    INTRA_PRED_PLANAR16_sse2 2
+    INTRA_PRED_PLANAR16_sse2 3
+    INTRA_PRED_PLANAR16_sse2 4
+    INTRA_PRED_PLANAR16_sse2 5
+    INTRA_PRED_PLANAR16_sse2 6
+    INTRA_PRED_PLANAR16_sse2 7
+    INTRA_PRED_PLANAR16_sse2 8
+    INTRA_PRED_PLANAR16_sse2 9
+    INTRA_PRED_PLANAR16_sse2 10
+    INTRA_PRED_PLANAR16_sse2 11
+    INTRA_PRED_PLANAR16_sse2 12
+    INTRA_PRED_PLANAR16_sse2 13
+    INTRA_PRED_PLANAR16_sse2 14
+    INTRA_PRED_PLANAR16_sse2 15
+    RET
+
+%else
+; code for BIT_DEPTH == 10
 cglobal intra_pred_planar16, 3,3,8
     movu            m2, [r2 + 2]
     movu            m7, [r2 + 18]
@@ -809,7 +927,180 @@
     INTRA_PRED_PLANAR_16 14
     INTRA_PRED_PLANAR_16 15
     RET
+%endif
+
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
+INIT_XMM sse2
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar32, 3,7,16
+    ; NOTE: align stack to 64 bytes, so all of local data in same cache line
+    mov             r6, rsp
+    sub             rsp, 4*mmsize
+    and             rsp, ~63
+    %define         m16 [rsp + 0 * mmsize]
+    %define         m17 [rsp + 1 * mmsize]
+    %define         m18 [rsp + 2 * mmsize]
+    %define         m19 [rsp + 3 * mmsize]
+
+    add             r1, r1
+    pxor            m12, m12
+
+    movzx           r3d, word [r2 + 66]
+    lea             r4, [planar32_table1]
+
+    movd            m0, r3d
+    pshufd          m0, m0, 0
+
+    pmaddwd         m8, m0, [r4 + 0]
+    pmaddwd         m9, m0, [r4 + 16]
+    pmaddwd         m10, m0, [r4 + 32]
+    pmaddwd         m11, m0, [r4 + 48]
+    pmaddwd         m7, m0, [r4 + 64]
+    pmaddwd         m13, m0, [r4 + 80]
+    pmaddwd         m14, m0, [r4 + 96]
+    pmaddwd         m15, m0, [r4 + 112]
+
+    movzx           r3d, word [r2 + 194]
+    movd            m0, r3d
+    pshufd          m0, m0, 0
+
+    paddd           m8, m0
+    paddd           m9, m0
+    paddd           m10, m0
+    paddd           m11, m0
+    paddd           m7, m0
+    paddd           m13, m0
+    paddd           m14, m0
+    paddd           m15, m0
+
+    paddd           m8, [pd_32]
+    paddd           m9, [pd_32]
+    paddd           m10, [pd_32]
+    paddd           m11, [pd_32]
+    paddd           m7, [pd_32]

x265_1.8.tar.gz/source/common/x86/intrapred8.asm -> x265_1.9.tar.gz/source/common/x86/intrapred8.asm Changed

@@ -27,7 +27,9 @@
 
 SECTION_RODATA 32
 
-intra_pred_shuff_0_8:    times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+const intra_pred_shuff_0_8,     times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+                                        db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
+
 intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
 intra_filter4_shuf0:  times 2 db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
@@ -54,13 +56,13 @@
 c_shuf8_0:            db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
 c_deinterval8:        db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
 pb_unpackbq:          db  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1
-c_mode16_12:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
-c_mode16_13:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
-c_mode16_14:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
+c_mode16_12:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
+c_mode16_13:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
+c_mode16_14:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
 c_mode16_15:          db  0,  0,  0,  0,  0,  0,  0,  0, 15, 13, 11,  9,  8,  6,  4,  2
 c_mode16_16:          db  8,  6,  5,  3,  2,  0, 15, 14, 12, 11,  9,  8,  6,  5,  3,  2
 c_mode16_17:          db  4,  2,  1,  0, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1
-c_mode16_18:    db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+c_mode16_18:          db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
 
 ALIGN 32
 c_ang8_src1_9_2_10:   db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
@@ -259,235 +261,6 @@
                      db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
                      db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
-
-ALIGN 32
-c_ang32_mode_27:    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                    db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
-c_ang32_mode_28:    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                    db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                    db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
-c_ang32_mode_29:    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                    db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
-                    db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                    db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                    db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
-c_ang32_mode_30:    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                    db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                    db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29,  3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                    db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
-c_ang32_mode_31:    db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                    db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
-c_ang32_mode_32:   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                   db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                   db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
-c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-                   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
-c_ang32_mode_24:   db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
-                   db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                   db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
-                   db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
-                   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-

x265_1.8.tar.gz/source/common/x86/intrapred8_allangs.asm -> x265_1.9.tar.gz/source/common/x86/intrapred8_allangs.asm Changed

@@ -27,62 +27,63 @@
 
 SECTION_RODATA 32
 
-all_ang4_shuff: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
-                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
-                db 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11
-                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10
-                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10
-                db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9
-                db 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0, 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0
-                db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1
-                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2
-                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
-                db 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6
-                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
-                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7, 4, 5, 5, 6, 6, 7, 7, 8
-                db 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8
-
-all_ang4: db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
-          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
-          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
-          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
-          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
-          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
-          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
-          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
-          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
-          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
-          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
-          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
-          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
-          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
-          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
-          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
-          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
-          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
-          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
-          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
-          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
-          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
-          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
-          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
-          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
-          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
-          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
-          db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
+const allAng4_shuf_mode2,       db  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7
+const allAng4_shuf_mode3_4,     db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5
+                                db  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6
+const allAng4_shuf_mode5_6,     db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5
+const allAng4_shuf_mode7_8,     db  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+const allAng4_shuf_mode10,      db  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3
+const allAng4_shuf_mode11_12,   db  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12
+const allAng4_shuf_mode13_14,   db  0,  9,  9, 10, 10, 11, 11, 12,  4,  0,  0,  9,  9, 10, 10, 11,  2,  0,  0,  9,  9, 10, 10, 11,  2,  0,  0,  9,  9, 10, 10, 11
+const allAng4_shuf_mode15_16,   db  0,  9,  9, 10, 10, 11, 11, 12,  2,  0,  0,  9,  9, 10, 10, 11,  0,  9,  9, 10, 10, 11, 11, 12,  2,  0,  0,  9,  9, 10, 10, 11
+                                db  2,  0,  0,  9,  9, 10, 10, 11,  4,  2,  2,  0,  0,  9,  9, 10,  2,  0,  0,  9,  9, 10, 10, 11,  3,  2,  2,  0,  0,  9,  9, 10
+const allAng4_shuf_mode17,      db  0,  9,  9, 10, 10, 11, 11, 12,  1,  0,  0,  9,  9, 10, 10, 11,  2,  1,  1,  0,  0,  9,  9, 10,  4,  2,  2,  1,  1,  0,  0,  9
+                                db  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0,  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0
+const allAng4_shuf_mode18,      db  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0,  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0
+const allAng4_shuf_mode19_20,   db  0,  1,  1,  2,  2,  3,  3,  4,  9,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4, 10,  0,  0,  1,  1,  2,  2,  3
+                                db 10,  9,  9,  0,  0,  1,  1,  2, 12, 10, 10,  9,  9,  0,  0,  1, 10,  0,  0,  1,  1,  2,  2,  3, 11, 10, 10,  0,  0,  1,  1,  2
+const allAng4_shuf_mode21_22,   db  0,  1,  1,  2,  2,  3,  3,  4, 10,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db 10,  0,  0,  1,  1,  2,  2,  3, 12, 10, 10,  0,  0,  1,  1,  2, 10,  0,  0,  1,  1,  2,  2,  3, 10,  0,  0,  1,  1,  2,  2,  3
+const allAng4_shuf_mode23_24,   db  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db  0,  1,  1,  2,  2,  3,  3,  4, 12,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+const allAng4_shuf_mode26,      db  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4
+const allAng4_shuf_mode27_28,   db  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5
+const allAng4_shuf_mode29_30,   db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  2,  3,  3,  4,  4,  5,  5,  6,  2,  3,  3,  4,  4,  5,  5,  6
+const allAng4_shuf_mode31_32,   db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6
+                                db  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7
+const allAng4_shuf_mode33,      db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  4,  5,  5,  6,  6,  7,  7,  8
+const allAng4_shuf_mode34,      db  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  5,  6,  7,  8,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  5,  6,  7,  8
+
+const allAng4_fact_mode3_4,     db  6, 26,  6, 26,  6, 26,  6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10
+                                db 18, 14, 18, 14, 18, 14, 18, 14, 24,  8, 24,  8, 24,  8, 24,  8,  1, 31,  1, 31,  1, 31,  1, 31, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode5_6,     db 15, 17, 15, 17, 15, 17, 15, 17, 30,  2, 30,  2, 30,  2, 30,  2, 19, 13, 19, 13, 19, 13, 19, 13,  6, 26,  6, 26,  6, 26,  6, 26
+                                db 13, 19, 13, 19, 13, 19, 13, 19, 28,  4, 28,  4, 28,  4, 28,  4, 25,  7, 25,  7, 25,  7, 25,  7, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode7_8,     db 23,  9, 23,  9, 23,  9, 23,  9, 14, 18, 14, 18, 14, 18, 14, 18, 27,  5, 27,  5, 27,  5, 27,  5, 22, 10, 22, 10, 22, 10, 22, 10
+                                db  5, 27,  5, 27,  5, 27,  5, 27, 28,  4, 28,  4, 28,  4, 28,  4, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode9,       db 30,  2, 30,  2, 30,  2, 30,  2, 28,  4, 28,  4, 28,  4, 28,  4, 26,  6, 26,  6, 26,  6, 26,  6, 24,  8, 24,  8, 24,  8, 24,  8
+const allAng4_fact_mode11_12,   db  2, 30,  2, 30,  2, 30,  2, 30,  4, 28,  4, 28,  4, 28,  4, 28,  5, 27,  5, 27,  5, 27,  5, 27, 10, 22, 10, 22, 10, 22, 10, 22
+                                db  6, 26,  6, 26,  6, 26,  6, 26,  8, 24,  8, 24,  8, 24,  8, 24, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode13_14,   db  9, 23,  9, 23,  9, 23,  9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 13, 19, 13, 19, 13, 19, 13, 19, 26,  6, 26,  6, 26,  6, 26,  6
+                                db 27,  5, 27,  5, 27,  5, 27,  5,  4, 28,  4, 28,  4, 28,  4, 28,  7, 25,  7, 25,  7, 25,  7, 25, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode15_16,   db 17, 15, 17, 15, 17, 15, 17, 15,  2, 30,  2, 30,  2, 30,  2, 30, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22
+                                db 19, 13, 19, 13, 19, 13, 19, 13,  4, 28,  4, 28,  4, 28,  4, 28, 31,  1, 31,  1, 31,  1, 31,  1, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode17,      db 26,  6, 26,  6, 26,  6, 26,  6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18,  8, 24,  8, 24,  8, 24,  8, 24
+const allAng4_fact_mode19_20,   db 26,  6, 26,  6, 26,  6, 26,  6, 20, 12, 20, 12, 20, 12, 20, 12, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22
+                                db 14, 18, 14, 18, 14, 18, 14, 18,  8, 24,  8, 24,  8, 24,  8, 24, 31,  1, 31,  1, 31,  1, 31,  1, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode21_22,   db 17, 15, 17, 15, 17, 15, 17, 15,  2, 30,  2, 30,  2, 30,  2, 30, 13, 19, 13, 19, 13, 19, 13, 19, 26,  6, 26,  6, 26,  6, 26,  6
+                                db 19, 13, 19, 13, 19, 13, 19, 13,  4, 28,  4, 28,  4, 28,  4, 28,  7, 25,  7, 25,  7, 25,  7, 25, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode23_24,   db  9, 23,  9, 23,  9, 23,  9, 23, 18, 14, 18, 14, 18, 14, 18, 14,  5, 27,  5, 27,  5, 27,  5, 27, 10, 22, 10, 22, 10, 22, 10, 22
+                                db 27,  5, 27,  5, 27,  5, 27,  5,  4, 28,  4, 28,  4, 28,  4, 28, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode25,      db  2, 30,  2, 30,  2, 30,  2, 30,  4, 28,  4, 28,  4, 28,  4, 28,  6, 26,  6, 26,  6, 26,  6, 26,  8, 24,  8, 24,  8, 24,  8, 24
+const allAng4_fact_mode27_28,   db 30,  2, 30,  2, 30,  2, 30,  2, 28,  4, 28,  4, 28,  4, 28,  4, 27,  5, 27,  5, 27,  5, 27,  5, 22, 10, 22, 10, 22, 10, 22, 10
+                                db 26,  6, 26,  6, 26,  6, 26,  6, 24,  8, 24,  8, 24,  8, 24,  8, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode29_30,   db 23,  9, 23,  9, 23,  9, 23,  9, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13,  6, 26,  6, 26,  6, 26,  6, 26
+                                db  5, 27,  5, 27,  5, 27,  5, 27, 28,  4, 28,  4, 28,  4, 28,  4, 25,  7, 25,  7, 25,  7, 25,  7, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode31_32,   db 15, 17, 15, 17, 15, 17, 15, 17, 30,  2, 30,  2, 30,  2, 30,  2, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10
+                                db 13, 19, 13, 19, 13, 19, 13, 19, 28,  4, 28,  4, 28,  4, 28,  4,  1, 31,  1, 31,  1, 31,  1, 31, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode33,      db  6, 26,  6, 26,  6, 26,  6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24,  8, 24,  8, 24,  8, 24,  8
 
 
 SECTION .text
@@ -23075,80 +23076,69 @@
 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
 ;-----------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal all_angs_pred_4x4, 4, 4, 6
+cglobal all_angs_pred_4x4, 2, 2, 6
 
     mova           m5, [pw_1024]
-    lea            r2, [all_ang4]
-    lea            r3, [all_ang4_shuff]
 
 ; mode 2
 
     vbroadcasti128 m0, [r1 + 9]
-    mova           xm1, xm0
-    psrldq         xm1, 1
-    pshufb         xm1, [r3]
+    pshufb         m1, m0, [allAng4_shuf_mode2]
     movu           [r0], xm1
 
 ; mode 3
 
-    pshufb         m1, m0, [r3 + 1 * mmsize]
-    pmaddubsw      m1, [r2]
+    pshufb         m1, m0, [allAng4_shuf_mode3_4]
+    pmaddubsw      m1, [allAng4_fact_mode3_4]
     pmulhrsw       m1, m5
 
 ; mode 4
 
-    pshufb         m2, m0, [r3 + 2 * mmsize]
-    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode3_4 + mmsize]
+    pmaddubsw      m2, [allAng4_fact_mode3_4 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (3 - 2) * 16], m1
 
 ; mode 5
 
-    pshufb         m1, m0, [r3 + 2 * mmsize]
-    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pshufb         m1, m0, [allAng4_shuf_mode5_6]
+    pmaddubsw      m1, [allAng4_fact_mode5_6]
     pmulhrsw       m1, m5
 
 ; mode 6
 
-    pshufb         m2, m0, [r3 + 3 * mmsize]
-    pmaddubsw      m2, [r2 + 3 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode5_6 + mmsize]
+    pmaddubsw      m2, [allAng4_fact_mode5_6 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (5 - 2) * 16], m1
 
-    add            r3, 4 * mmsize
-    add            r2, 4 * mmsize
-
 ; mode 7
 
-    pshufb         m1, m0, [r3 + 0 * mmsize]
-    pmaddubsw      m1, [r2 + 0 * mmsize]
+    pshufb         m3, m0, [allAng4_shuf_mode7_8]
+    pmaddubsw      m1, m3, [allAng4_fact_mode7_8]
     pmulhrsw       m1, m5
 
 ; mode 8
 
-    pshufb         m2, m0, [r3 + 1 * mmsize]
-    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode7_8 + mmsize]
+    pmaddubsw      m2, [allAng4_fact_mode7_8 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (7 - 2) * 16], m1

x265_1.8.tar.gz/source/common/x86/ipfilter16.asm -> x265_1.9.tar.gz/source/common/x86/ipfilter16.asm Changed

@@ -4869,7 +4869,7 @@
 %ifidn %2,pp
     vbroadcasti128  m8, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova            m8, [INTERP_OFFSET_SP]
+    vbroadcasti128  m8, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m8, [INTERP_OFFSET_PS]
 %endif
@@ -5011,11 +5011,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5183,11 +5183,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5325,11 +5325,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5456,11 +5456,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5609,11 +5609,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5732,11 +5732,11 @@
     mov       r4d, 32
 
 %ifidn %1, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %1, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %1, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -6068,7 +6068,7 @@
 %ifidn %1,pp
     vbroadcasti128  m6, [pd_32]
 %elifidn %1, sp
-    mova            m6, [pd_524800]
+    vbroadcasti128  m6, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m6, [INTERP_OFFSET_PS]
 %endif
@@ -6178,7 +6178,7 @@
 %ifidn %1,pp
     vbroadcasti128  m11, [pd_32]
 %elifidn %1, sp
-    mova            m11, [pd_524800]
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
 %endif
@@ -6816,7 +6816,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -6867,7 +6867,7 @@
 %ifidn %3,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %3, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -6950,7 +6950,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -7597,7 +7597,7 @@
 %ifidn %1,pp
     vbroadcasti128  m11, [pd_32]
 %elifidn %1, sp
-    mova            m11, [INTERP_OFFSET_SP]
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
 %endif
@@ -7644,7 +7644,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -7816,7 +7816,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -7861,7 +7861,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -7901,7 +7901,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -8248,7 +8248,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -8668,7 +8668,7 @@
 %ifidn %1,pp

x265_1.8.tar.gz/source/common/x86/ipfilter8.asm -> x265_1.9.tar.gz/source/common/x86/ipfilter8.asm Changed

@@ -12541,6 +12541,459 @@
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_16x4, 3, 4, 2
+    mov             r3d, r3m
+    add             r3d, r3d
+
+    ; load constant
+    vbroadcasti128  m1, [pw_2000]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    lea             r1, [r1 * 3]
+    lea             r3, [r3 * 3]
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+    RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_16x8, 3, 6, 2
+    mov             r3d, r3m
+    add             r3d, r3d
+    lea             r4, [r1 * 3]
+    lea             r5, [r3 * 3]
+
+    ; load constant
+    vbroadcasti128  m1, [pw_2000]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+    RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_16x12, 3, 6, 2
+    mov             r3d, r3m
+    add             r3d, r3d
+    lea             r4, [r1 * 3]
+    lea             r5, [r3 * 3]
+
+    ; load constant
+    vbroadcasti128  m1, [pw_2000]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+    RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_16x16, 3, 6, 2
+    mov             r3d, r3m
+    add             r3d, r3d
+    lea             r4, [r1 * 3]
+    lea             r5, [r3 * 3]
+
+    ; load constant
+    vbroadcasti128  m1, [pw_2000]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0

x265_1.8.tar.gz/source/common/x86/loopfilter.asm -> x265_1.9.tar.gz/source/common/x86/loopfilter.asm Changed

@@ -26,24 +26,28 @@
 ;*****************************************************************************/
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 SECTION_RODATA 32
 pb_31:      times 32 db 31
 pb_124:     times 32 db 124
 pb_15:      times 32 db 15
-pb_movemask_32:  times 32 db 0x00
-                 times 32 db 0xFF
 
 SECTION .text
 cextern pb_1
-cextern pb_128
 cextern pb_2
+cextern pb_3
+cextern pb_4
+cextern pb_01
+cextern pb_128
+cextern pw_1
+cextern pw_n1
 cextern pw_2
+cextern pw_4
 cextern pw_pixel_max
 cextern pb_movemask
-cextern pw_1
+cextern pb_movemask_32
 cextern hmul_16p
-cextern pb_4
 
 
 ;============================================================================================================
@@ -1989,79 +1993,94 @@
 %endif
 
 ;--------------------------------------------------------------------------------------------------------------------------
-; saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+; saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 ;--------------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
 INIT_XMM sse4
-cglobal saoCuStatsBO, 7,12,6
-    mova        m3, [hmul_16p + 16]
-    mova        m4, [pb_124]
-    mova        m5, [pb_4]
-    xor         r7d, r7d
+cglobal saoCuStatsBO, 7,13,2
+    mova        m0, [pb_124]
+    add         r5, 4
+    add         r6, 4
 
 .loopH:
-    mov         r10, r0
+    mov         r12, r0
     mov         r11, r1
     mov         r9d, r3d
+
 .loopL:
     movu        m1, [r11]
-    movu        m0, [r10]
+    psrlw       m1, 1                   ; rec[x] >> boShift
+    pand        m1, m0
 
-    punpckhbw   m2, m0, m1
-    punpcklbw   m0, m1
-    psrlw       m1, 1               ; rec[x] >> boShift
-    pmaddubsw   m2, m3
-    pmaddubsw   m0, m3
-    pand        m1, m4
-    paddb       m1, m5
+    cmp         r9d, 8
+    jle        .proc8
 
+    movq        r10, m1
 %assign x 0
-%rep 16
-    pextrb      r7d, m1, x
+%rep 8
+    movzx       r7d, r10b
+    shr         r10, 8
 
-%if (x < 8)
-    pextrw      r8d, m0, (x % 8)
-%else
-    pextrw      r8d, m2, (x % 8)
-%endif
-    movsx       r8d, r8w
-    inc         dword  [r6 + r7]    ; count[classIdx]++
-    add         [r5 + r7], r8d      ; stats[classIdx] += (fenc[x] - rec[x]);
+    movsx       r8d, word [r12 + x*2]   ; diff[x]
+    inc         dword  [r6 + r7]        ; count[classIdx]++
+    add         [r5 + r7], r8d          ; stats[classIdx] += (fenc[x] - rec[x]);
+%assign x x+1
+%endrep
+    movhlps     m1, m1
+    sub         r9d, 8
+    add         r12, 8*2
+
+.proc8:
+    movq        r10, m1
+%assign x 0
+%rep 8
+    movzx       r7d, r10b
+    shr         r10, 8
+
+    movsx       r8d, word [r12 + x*2]   ; diff[x]
+    inc         dword  [r6 + r7]        ; count[classIdx]++
+    add         [r5 + r7], r8d          ; stats[classIdx] += (fenc[x] - rec[x]);
     dec         r9d
-    jz          .next
+    jz         .next
 %assign x x+1
 %endrep
 
-    add         r10, 16
+    add         r12, 8*2
     add         r11, 16
-    jmp         .loopL
+    jmp        .loopL
 
 .next:
-    add         r0, r2
+    add         r0, 64*2                ; MAX_CU_SIZE
     add         r1, r2
     dec         r4d
-    jnz         .loopH
+    jnz        .loopH
     RET
 %endif
 
 ;-----------------------------------------------------------------------------------------------------------------------
-; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 ;-----------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
 INIT_XMM sse4
-cglobal saoCuStatsE0, 5,9,8, 0-32
+cglobal saoCuStatsE0, 3,10,6, 0-32
     mov         r3d, r3m
-    mov         r8, r5mp
+    mov         r4d, r4m
+    mov         r9, r5mp
 
     ; clear internal temporary buffer
     pxor        m0, m0
     mova        [rsp], m0
     mova        [rsp + mmsize], m0
     mova        m4, [pb_128]
-    mova        m5, [hmul_16p + 16]
-    mova        m6, [pb_2]
+    mova        m5, [pb_2]
     xor         r7d, r7d
 
+    ; correct stride for diff[] and rec
+    mov         r6d, r3d
+    and         r6d, ~15
+    sub         r2, r6
+    lea         r8, [(r6 - 64) * 2]             ; 64 = MAX_CU_SIZE
+
 .loopH:
     mov         r5d, r3d
 
@@ -2075,100 +2094,257 @@
     pinsrb      m0, r7d, 15
 
 .loopL:
-    movu        m7, [r1]
+    movu        m3, [r1]
     movu        m2, [r1 + 1]
 
-    pxor        m1, m7, m4
-    pxor        m3, m2, m4
-    pcmpgtb     m2, m1, m3
-    pcmpgtb     m3, m1
-    pand        m2, [pb_1]
-    por         m2, m3              ; signRight
+    pxor        m1, m3, m4
+    pxor        m2, m4
+    pcmpgtb     m3, m1, m2
+    pcmpgtb     m2, m1
+    pand        m3, [pb_1]
+    por         m2, m3                          ; signRight
 
     palignr     m3, m2, m0, 15
-    psignb      m3, m4              ; signLeft
+    psignb      m3, m4                          ; signLeft
 
     mova        m0, m2
     paddb       m2, m3
-    paddb       m2, m6              ; edgeType
+    paddb       m2, m5                          ; edgeType
 
     ; stats[edgeType]
-    movu        m3, [r0]            ; fenc[0-15]
-    punpckhbw   m1, m3, m7
-    punpcklbw   m3, m7
-    pmaddubsw   m1, m5
-    pmaddubsw   m3, m5

x265_1.8.tar.gz/source/common/x86/loopfilter.h -> x265_1.9.tar.gz/source/common/x86/loopfilter.h Changed

@@ -3,6 +3,7 @@
  *
  * Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
+;*          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -35,14 +36,17 @@
     void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
-    void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsBO_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE0_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE1_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE2_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE3_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 
 DECL_SAO(sse4);
 DECL_SAO(avx2);
 
+void PFX(pelFilterLumaStrong_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+void PFX(pelFilterLumaStrong_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+
 #endif // ifndef X265_LOOPFILTER_H

x265_1.8.tar.gz/source/common/x86/mc-a.asm -> x265_1.9.tar.gz/source/common/x86/mc-a.asm Changed

@@ -2,6 +2,7 @@
 ;* mc-a.asm: x86 motion compensation
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
@@ -3989,8 +3990,12 @@
     test dword r4m, 15
     jz pixel_avg_w%1_sse2
 %endif
+%if (%1 == 8)
+    jmp pixel_avg_w8_unaligned_sse2
+%else
     jmp pixel_avg_w%1_mmx2
 %endif
+%endif
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -4049,6 +4054,32 @@
     lea     r4, [r4 + 4 * r5]
 %endmacro
 
+INIT_XMM sse2
+cglobal pixel_avg_w8_unaligned
+    AVG_START
+.height_loop:
+%if HIGH_BIT_DEPTH
+    ; NO TEST BRANCH!
+    movu    m0, [t2]
+    movu    m1, [t2+SIZEOF_PIXEL*t3]
+    movu    m2, [t4]
+    movu    m3, [t4+SIZEOF_PIXEL*t5]
+    pavgw   m0, m2
+    pavgw   m1, m3
+    movu    [t0], m0
+    movu    [t0+SIZEOF_PIXEL*t1], m1
+%else ;!HIGH_BIT_DEPTH
+    movq    m0, [t2]
+    movhps  m0, [t2+SIZEOF_PIXEL*t3]
+    movq    m1, [t4]
+    movhps  m1, [t4+SIZEOF_PIXEL*t5]
+    pavgb   m0, m1
+    movq    [t0], m0
+    movhps  [t0+SIZEOF_PIXEL*t1], m0
+%endif
+    AVG_END
+
+
 ;-------------------------------------------------------------------------------------------------------------------------------
 ;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
 ;-------------------------------------------------------------------------------------------------------------------------------
@@ -4115,11 +4146,11 @@
 AVGH 4, 4
 AVGH 4, 2
 
-AVG_FUNC 8, movq, movq
-AVGH 8, 32
-AVGH 8, 16
-AVGH 8,  8
-AVGH 8,  4
+;AVG_FUNC 8, movq, movq
+;AVGH 8, 32
+;AVGH 8, 16
+;AVGH 8,  8
+;AVGH 8,  4
 
 AVG_FUNC 16, movq, movq
 AVGH 16, 64
@@ -4197,7 +4228,7 @@
 AVGH 4, 4
 AVGH 4, 2
 
-AVG_FUNC 8, movq, movq
+;AVG_FUNC 8, movq, movq
 AVGH 8, 32
 AVGH 8, 16
 AVGH 8,  8
@@ -4418,6 +4449,37 @@
     call pixel_avg_16x64_8bit
     call pixel_avg_16x64_8bit
     RET
+
+cglobal pixel_avg_48x64, 6,7,4
+   mov          r6d, 4
+.loop:
+%rep 8
+    movu        m0, [r2]
+    movu        xm2, [r2 + mmsize]
+    movu        m1, [r4]
+    movu        xm3, [r4 + mmsize]
+    pavgb       m0, m1
+    pavgb       xm2, xm3
+    movu        [r0], m0
+    movu        [r0 + mmsize], xm2
+
+    movu        m0, [r2 + r3]
+    movu        xm2, [r2 + r3 + mmsize]
+    movu        m1, [r4 + r5]
+    movu        xm3, [r4 + r5 + mmsize]
+    pavgb       m0, m1
+    pavgb       xm2, xm3
+    movu        [r0 + r1], m0
+    movu        [r0 + r1 + mmsize], xm2
+
+    lea         r2, [r2 + r3 * 2]
+    lea         r4, [r4 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+%endrep
+
+    dec         r6d
+    jnz         .loop
+    RET
 %endif
 
 ;=============================================================================

x265_1.8.tar.gz/source/common/x86/mc-a2.asm -> x265_1.9.tar.gz/source/common/x86/mc-a2.asm Changed

@@ -2,12 +2,14 @@
 ;* mc-a2.asm: x86 motion compensation
 ;*****************************************************************************
 ;* Copyright (C) 2005-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Holger Lubitz <holger@lubitz.org>
 ;*          Mathieu Monnier <manao@melix.net>
 ;*          Oskar Arvidsson <oskar@irock.se>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -46,6 +48,8 @@
 pd_16: times 4 dd 16
 pd_0f: times 4 dd 0xffff
 pf_inv256: times 8 dd 0.00390625
+const pd_inv256,    times 4 dq 0.00390625
+const pd_0_5,       times 4 dq 0.5
 
 SECTION .text
 
@@ -987,151 +991,227 @@
 %endif
 
 ;-----------------------------------------------------------------------------
-; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
+;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
 ;-----------------------------------------------------------------------------
-%macro MBTREE 0
+INIT_XMM sse2
 cglobal mbtree_propagate_cost, 7,7,7
-    add        r6d, r6d
-    lea         r0, [r0+r6*2]
-    add         r1, r6
-    add         r2, r6
-    add         r3, r6
-    add         r4, r6
-    neg         r6
-    pxor      xmm4, xmm4
-    movss     xmm6, [r5]
-    shufps    xmm6, xmm6, 0
-    mulps     xmm6, [pf_inv256]
-    movdqa    xmm5, [pw_3fff]
+    dec         r6d
+    movsd       m6, [r5]
+    mulpd       m6, [pd_inv256]
+    xor         r5d, r5d
+    lea         r0, [r0+r5*2]
+    pxor        m4, m4
+    movlhps     m6, m6
+    mova        m5, [pw_3fff]
+
 .loop:
-    movq      xmm2, [r2+r6] ; intra
-    movq      xmm0, [r4+r6] ; invq
-    movq      xmm3, [r3+r6] ; inter
-    movq      xmm1, [r1+r6] ; prop
-    punpcklwd xmm2, xmm4
-    punpcklwd xmm0, xmm4
-    pmaddwd   xmm0, xmm2
-    pand      xmm3, xmm5
-    punpcklwd xmm1, xmm4
-    punpcklwd xmm3, xmm4
-%if cpuflag(fma4)
-    cvtdq2ps  xmm0, xmm0
-    cvtdq2ps  xmm1, xmm1
-    fmaddps   xmm0, xmm0, xmm6, xmm1
-    cvtdq2ps  xmm1, xmm2
-    psubd     xmm2, xmm3
-    cvtdq2ps  xmm2, xmm2
-    rcpps     xmm3, xmm1
-    mulps     xmm1, xmm3
-    mulps     xmm0, xmm2
-    addps     xmm2, xmm3, xmm3
-    fnmaddps  xmm3, xmm1, xmm3, xmm2
-    mulps     xmm0, xmm3
-%else
-    cvtdq2ps  xmm0, xmm0
-    mulps     xmm0, xmm6    ; intra*invq*fps_factor>>8
-    cvtdq2ps  xmm1, xmm1    ; prop
-    addps     xmm0, xmm1    ; prop + (intra*invq*fps_factor>>8)
-    cvtdq2ps  xmm1, xmm2    ; intra
-    psubd     xmm2, xmm3    ; intra - inter
-    cvtdq2ps  xmm2, xmm2    ; intra - inter
-    rcpps     xmm3, xmm1    ; 1 / intra 1st approximation
-    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)
-    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)^2
-    mulps     xmm0, xmm2    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
-    addps     xmm3, xmm3    ; 2 * (1/intra 1st approx)
-    subps     xmm3, xmm1    ; 2nd approximation for 1/intra
-    mulps     xmm0, xmm3    ; / intra
-%endif
-    cvtps2dq  xmm0, xmm0
-    movdqa [r0+r6*2], xmm0
-    add         r6, 8
-    jl .loop
+    movh        m2, [r2+r5*4]       ; intra
+    movh        m0, [r4+r5*4]       ; invq
+    movd        m3, [r3+r5*2]       ; inter
+    pand        m3, m5
+    punpcklwd   m3, m4
+
+    ; PMINSD
+    pcmpgtd     m1, m2, m3
+    pand        m3, m1
+    pandn       m1, m2
+    por         m3, m1
+
+    movd        m1, [r1+r5*2]       ; prop
+    punpckldq   m2, m2
+    punpckldq   m0, m0
+    pmuludq     m0, m2
+    pshufd      m2, m2, q3120
+    pshufd      m0, m0, q3120
+
+    punpcklwd   m1, m4
+    cvtdq2pd    m0, m0
+    mulpd       m0, m6              ; intra*invq*fps_factor>>8
+    cvtdq2pd    m1, m1              ; prop
+    addpd       m0, m1              ; prop + (intra*invq*fps_factor>>8)
+    ;cvtdq2ps    m1, m2              ; intra
+    cvtdq2pd    m1, m2              ; intra
+    psubd       m2, m3              ; intra - inter
+    cvtdq2pd    m2, m2              ; intra - inter
+    ;rcpps       m3, m1
+    ;mulps       m1, m3              ; intra * (1/intra 1st approx)
+    ;mulps       m1, m3              ; intra * (1/intra 1st approx)^2
+    ;addps       m3, m3              ; 2 * (1/intra 1st approx)
+    ;subps       m3, m1              ; 2nd approximation for 1/intra
+    ;cvtps2pd    m3, m3              ; 1 / intra 1st approximation
+    mulpd       m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+    ;mulpd       m0, m3              ; / intra
+
+    ; TODO: DIVPD very slow, but match to C model output, since it is not bottleneck function, I comment above faster code
+    divpd       m0, m1
+    addpd       m0, [pd_0_5]
+    cvttpd2dq    m0, m0
+
+    movh        [r0+r5*4], m0
+    add         r5d, 2
+    cmp         r5d, r6d
+    jl         .loop
+
+    xor         r6d, r5d
+    jnz         .even
+    movd        m2, [r2+r5*4]       ; intra
+    movd        m0, [r4+r5*4]       ; invq
+    movd        m3, [r3+r5*2]       ; inter
+    pand        m3, m5
+    punpcklwd   m3, m4
+
+    ; PMINSD
+    pcmpgtd     m1, m2, m3
+    pand        m3, m1
+    pandn       m1, m2
+    por         m3, m1
+
+    movd        m1, [r1+r5*2]       ; prop
+    punpckldq   m2, m2              ; DWORD [_ 1 _ 0]
+    punpckldq   m0, m0
+    pmuludq     m0, m2              ; QWORD [m1 m0]
+    pshufd      m2, m2, q3120
+    pshufd      m0, m0, q3120
+    punpcklwd   m1, m4
+    cvtdq2pd    m0, m0
+    mulpd       m0, m6              ; intra*invq*fps_factor>>8
+    cvtdq2pd    m1, m1              ; prop
+    addpd       m0, m1              ; prop + (intra*invq*fps_factor>>8)
+    cvtdq2pd    m1, m2              ; intra
+    psubd       m2, m3              ; intra - inter
+    cvtdq2pd    m2, m2              ; intra - inter
+    mulpd       m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+
+    divpd       m0, m1
+    addpd       m0, [pd_0_5]
+    cvttpd2dq    m0, m0
+    movd        [r0+r5*4], m0
+.even:
     RET
-%endmacro
 
-INIT_XMM sse2
-MBTREE
-; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
-INIT_XMM fma4
-MBTREE
-
-%macro INT16_UNPACK 1
-    vpunpckhwd   xm4, xm%1, xm7
-    vpunpcklwd  xm%1, xm7
-    vinsertf128  m%1, m%1, xm4, 1
-%endmacro
 
+;-----------------------------------------------------------------------------
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,

x265_1.8.tar.gz/source/common/x86/mc.h -> x265_1.9.tar.gz/source/common/x86/mc.h Changed

x265_1.8.tar.gz/source/common/x86/pixel-a.asm -> x265_1.9.tar.gz/source/common/x86/pixel-a.asm Changed

@@ -2,6 +2,7 @@
 ;* pixel.asm: x86 pixel metrics
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Holger Lubitz <holger@lubitz.org>
@@ -70,6 +71,7 @@
 cextern pd_2
 cextern hmul_16p
 cextern pb_movemask
+cextern pb_movemask_32
 cextern pw_pixel_max
 
 ;=============================================================================
@@ -6497,6 +6499,1357 @@
 %endif ; !ARCH_X86_64
 %endmacro ; SA8D
 
+
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+INIT_YMM avx2
+cglobal sa8d_8x8_12bit
+    pmovzxwd        m0, [r0]
+    pmovzxwd        m9, [r2]
+    psubd           m0, m9
+
+    pmovzxwd        m1, [r0 + r1]
+    pmovzxwd        m9, [r2 + r3]
+    psubd           m1, m9
+
+    pmovzxwd        m2, [r0 + r1 * 2]
+    pmovzxwd        m9, [r2 + r3 * 2]
+    psubd           m2, m9
+
+    pmovzxwd        m8, [r0 + r4]
+    pmovzxwd        m9, [r2 + r5]
+    psubd           m8, m9
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxwd        m4, [r0]
+    pmovzxwd        m9, [r2]
+    psubd           m4, m9
+
+    pmovzxwd        m5, [r0 + r1]
+    pmovzxwd        m9, [r2 + r3]
+    psubd           m5, m9
+
+    pmovzxwd        m3, [r0 + r1 * 2]
+    pmovzxwd        m9, [r2 + r3 * 2]
+    psubd           m3, m9
+
+    pmovzxwd        m7, [r0 + r4]
+    pmovzxwd        m9, [r2 + r5]
+    psubd           m7, m9
+
+    mova            m6, m0
+    paddd           m0, m1
+    psubd           m1, m6
+    mova            m6, m2
+    paddd           m2, m8
+    psubd           m8, m6
+    mova            m6, m0
+
+    punpckldq       m0, m1
+    punpckhdq       m6, m1
+
+    mova            m1, m0
+    paddd           m0, m6
+    psubd           m6, m1
+    mova            m1, m2
+
+    punpckldq       m2, m8
+    punpckhdq       m1, m8
+
+    mova            m8, m2
+    paddd           m2, m1
+    psubd           m1, m8
+    mova            m8, m4
+    paddd           m4, m5
+    psubd           m5, m8
+    mova            m8, m3
+    paddd           m3, m7
+    psubd           m7, m8
+    mova            m8, m4
+
+    punpckldq       m4, m5
+    punpckhdq       m8, m5
+
+    mova            m5, m4
+    paddd           m4, m8
+    psubd           m8, m5
+    mova            m5, m3
+    punpckldq       m3, m7
+    punpckhdq       m5, m7
+
+    mova            m7, m3
+    paddd           m3, m5
+    psubd           m5, m7
+    mova            m7, m0
+    paddd           m0, m2
+    psubd           m2, m7
+    mova            m7, m6
+    paddd           m6, m1
+    psubd           m1, m7
+    mova            m7, m0
+
+    punpcklqdq      m0, m2
+    punpckhqdq      m7, m2
+
+    mova            m2, m0
+    paddd           m0, m7
+    psubd           m7, m2
+    mova            m2, m6
+
+    punpcklqdq      m6, m1
+    punpckhqdq      m2, m1
+
+    mova            m1, m6
+    paddd           m6, m2
+    psubd           m2, m1
+    mova            m1, m4
+    paddd           m4, m3
+    psubd           m3, m1
+    mova            m1, m8
+    paddd           m8, m5
+    psubd           m5, m1
+    mova            m1, m4
+
+    punpcklqdq      m4, m3
+    punpckhqdq      m1, m3
+
+    mova            m3, m4
+    paddd           m4, m1
+    psubd           m1, m3
+    mova            m3, m8
+
+    punpcklqdq      m8, m5
+    punpckhqdq      m3, m5
+
+    mova            m5, m8
+    paddd           m8, m3
+    psubd           m3, m5
+    mova            m5, m0
+    paddd           m0, m4
+    psubd           m4, m5
+    mova            m5, m7
+    paddd           m7, m1
+    psubd           m1, m5
+    mova            m5, m0
+
+    vinserti128     m0, m0, xm4, 1
+    vperm2i128      m5, m5, m4, 00110001b
+
+    pxor            m4, m4
+    psubd           m4, m0
+    pmaxsd          m0, m4
+    pxor            m4, m4
+    psubd           m4, m5
+    pmaxsd          m5, m4
+    pmaxsd          m0, m5
+    mova            m4, m7
+
+    vinserti128     m7, m7, xm1, 1
+    vperm2i128      m4, m4, m1, 00110001b
+
+    pxor            m1, m1
+    psubd           m1, m7
+    pmaxsd          m7, m1
+    pxor            m1, m1
+    psubd           m1, m4
+    pmaxsd          m4, m1
+    pmaxsd          m7, m4
+    mova            m1, m6
+    paddd           m6, m8
+    psubd           m8, m1
+    mova            m1, m2
+    paddd           m2, m3
+    psubd           m3, m1
+    mova            m1, m6
+
+    vinserti128     m6, m6, xm8, 1
+    vperm2i128      m1, m1, m8, 00110001b
+
+    pxor            m8, m8
+    psubd           m8, m6
+    pmaxsd          m6, m8
+    pxor            m8, m8
+    psubd           m8, m1
+    pmaxsd          m1, m8
+    pmaxsd          m6, m1
+    mova            m8, m2
+
+    vinserti128     m2, m2, xm3, 1
+    vperm2i128      m8, m8, m3, 00110001b
+
+    pxor            m3, m3

x265_1.8.tar.gz/source/common/x86/pixel-util.h -> x265_1.9.tar.gz/source/common/x86/pixel-util.h Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+;*          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -55,5 +56,6 @@
 int PFX(scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
 uint32_t PFX(findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]));
 uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
+uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
 
 #endif // ifndef X265_PIXEL_UTIL_H

x265_1.8.tar.gz/source/common/x86/pixel-util8.asm -> x265_1.9.tar.gz/source/common/x86/pixel-util8.asm Changed

@@ -49,6 +49,7 @@
 mask_ff:                times 16 db 0xff
                         times 16 db 0
 deinterleave_shuf:      times  2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+interleave_shuf:        times  2 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
 deinterleave_word_shuf: times  2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
 hmulw_16p:              times  8 dw 1
                         times  4 dw 1, -1
@@ -56,7 +57,7 @@
 SECTION .text
 
 cextern pw_1
-cextern pw_0_15
+cextern pw_0_7
 cextern pb_1
 cextern pb_128
 cextern pw_00ff
@@ -78,6 +79,7 @@
 cextern trans8_shuf
 cextern_naked private_prefix %+ _entropyStateBits
 cextern pb_movemask
+cextern pw_exp2_0_15
 
 ;-----------------------------------------------------------------------------
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
@@ -792,6 +794,7 @@
     pshufd      m6, m6, 0       ; m6 = add
     mov         r3d, r4d        ; r3 = numCoeff
     shr         r4d, 3
+    pxor        m4, m4
 
 .loop:
     pmovsxwd    m0, [r0]        ; m0 = level
@@ -810,13 +813,13 @@
     psignd      m3, m1
 
     packssdw    m2, m3
+    pabsw       m2, m2
 
     movu        [r2], m2
     add         r0, 16
     add         r1, 32
     add         r2, 16
 
-    pxor        m4, m4
     pcmpeqw     m2, m4
     psubw       m7, m2
 
@@ -862,9 +865,11 @@
     psignd      m2, m0
 
     packssdw    m1, m2
-    vpermq      m2, m1, q3120
+    pabsw       m1, m1
 
+    vpermq      m2, m1, q3120
     movu        [r2], m2
+
     add         r0, mmsize
     add         r1, mmsize * 2
     add         r2, mmsize
@@ -1560,7 +1565,7 @@
     movd        m0, r6d
     pshuflw     m0, m0, 0
     punpcklqdq  m0, m0
-    pcmpgtw     m0, [pw_0_15]
+    pcmpgtw     m0, [pw_0_7]
 
 .loopH:
     mov         r6d, r4d
@@ -1718,7 +1723,7 @@
     pshuflw                   m0, m0, 0
     punpcklqdq                m0, m0
     vinserti128               m0, m0, xm0, 1
-    pcmpgtw                   m0, [pw_0_15]
+    pcmpgtw                   m0, [pw_0_7]
 
 .loopH:
     mov                       r6d, r4d
@@ -6397,6 +6402,78 @@
     movd   edx, xm6
 %endif
     RET
+
+INIT_YMM avx2
+cglobal pixel_var_32x32, 2,4,7
+    VAR_START 0
+    mov             r2d, 16
+
+.loop:
+    pmovzxbw        m0, [r0]
+    pmovzxbw        m3, [r0 + 16]
+    pmovzxbw        m1, [r0 + r1]
+    pmovzxbw        m4, [r0 + r1 + 16]
+
+    lea             r0, [r0 + r1 * 2]
+
+    VAR_CORE
+
+    dec             r2d
+    jg              .loop
+
+    vextracti128   xm0, m5, 1
+    vextracti128   xm1, m6, 1
+    paddw          xm5, xm0
+    paddd          xm6, xm1
+    HADDW          xm5, xm2
+    HADDD          xm6, xm1
+
+%if ARCH_X86_64
+    punpckldq      xm5, xm6
+    movq           rax, xm5
+%else
+    movd           eax, xm5
+    movd           edx, xm6
+%endif
+    RET
+
+INIT_YMM avx2
+cglobal pixel_var_64x64, 2,4,7
+    VAR_START 0
+    mov             r2d, 64
+
+.loop:
+    pmovzxbw        m0, [r0]
+    pmovzxbw        m3, [r0 + 16]
+    pmovzxbw        m1, [r0 + mmsize]
+    pmovzxbw        m4, [r0 + mmsize + 16]
+
+    lea             r0, [r0 + r1]
+
+    VAR_CORE
+
+    dec             r2d
+    jg              .loop
+
+    pxor            m1, m1
+    punpcklwd       m0, m5, m1
+    punpckhwd       m5, m1
+    paddd           m5, m0
+    vextracti128   xm2, m5, 1
+    vextracti128   xm1, m6, 1
+    paddd          xm5, xm2
+    paddd          xm6, xm1
+    HADDD          xm5, xm2
+    HADDD          xm6, xm1
+
+%if ARCH_X86_64
+    punpckldq      xm5, xm6
+    movq           rax, xm5
+%else
+    movd           eax, xm5
+    movd           edx, xm6
+%endif
+    RET
 %endif ; !HIGH_BIT_DEPTH
 
 %macro VAR2_END 3
@@ -6578,10 +6655,10 @@
 
 
 ;-----------------------------------------------------------------------------
-; uint32_t[last first] findPosFirstAndLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
+; uint32_t[sumSign last first] findPosFirstLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16], uint32_t *absSum)
 ;-----------------------------------------------------------------------------
 INIT_XMM ssse3
-cglobal findPosFirstLast, 3,3,3
+cglobal findPosFirstLast, 3,3,4
     ; convert stride to int16_t
     add         r1d, r1d
 
@@ -6593,10 +6670,22 @@
     movh        m1, [r0]
     movhps      m1, [r0 + r1]
     movh        m2, [r0 + r1 * 2]
-    lea         r1, [r1 * 3]
+    lea         r1d, [r1 * 3]
     movhps      m2, [r0 + r1]
+    pxor        m3, m1, m2
     packsswb    m1, m2
 
+    ; get absSum
+    movhlps     m2, m3
+    pxor        m3, m2
+    pshufd      m2, m3, q2301
+    pxor        m3, m2
+    movd        r0d, m3
+    mov         r2d, r0d
+    shr         r2d, 16
+    xor         r2d, r0d
+    shl         r2d, 31
+
     ; get non-zero mask
     pxor        m2, m2
     pcmpeqb     m1, m2
@@ -6609,319 +6698,10 @@
     not         r0d
     bsr         r1w, r0w
     bsf         eax, r0d    ; side effect: clear AH to Zero
-    shl         r1d, 16

x265_1.8.tar.gz/source/common/x86/pixel.h -> x265_1.9.tar.gz/source/common/x86/pixel.h Changed

@@ -2,10 +2,12 @@
  * pixel.h: x86 pixel metrics
  *****************************************************************************
  * Copyright (C) 2003-2013 x264 project
+ * Copyright (C) 2013-2015 x265 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
  *          Fiona Glaser <fiona@x264.com>
+;*          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -34,9 +36,10 @@
 void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
 #define DECL_PIXELS(cpu) \
-    FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
+    FUNCDEF_PU(sse_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
     FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
@@ -45,10 +48,10 @@
     FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
     FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
-    FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
-    FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
-    FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
     FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
     FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
     FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)

x265_1.8.tar.gz/source/common/x86/pixeladd8.asm -> x265_1.9.tar.gz/source/common/x86/pixeladd8.asm Changed

x265_1.8.tar.gz/source/common/x86/sad-a.asm -> x265_1.9.tar.gz/source/common/x86/sad-a.asm Changed

@@ -2,6 +2,7 @@
 ;* sad-a.asm: x86 sad functions
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
@@ -3328,6 +3329,730 @@
     SAD_X4_END_SSE2 1
 %endmacro
 
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
+INIT_YMM avx2
+%macro SAD_X4_64x8_AVX2 0
+    movu            m4, [r0]
+    movu            m5, [r1]
+    movu            m6, [r2]
+    movu            m7, [r3]
+    movu            m8, [r4]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + mmsize]
+    movu            m5, [r1 + mmsize]
+    movu            m6, [r2 + mmsize]
+    movu            m7, [r3 + mmsize]
+    movu            m8, [r4 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE]
+    movu            m5, [r1 + r5]
+    movu            m6, [r2 + r5]
+    movu            m7, [r3 + r5]
+    movu            m8, [r4 + r5]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE + mmsize]
+    movu            m5, [r1 + r5 + mmsize]
+    movu            m6, [r2 + r5 + mmsize]
+    movu            m7, [r3 + r5 + mmsize]
+    movu            m8, [r4 + r5 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 2]
+    movu            m5, [r1 + r5 * 2]
+    movu            m6, [r2 + r5 * 2]
+    movu            m7, [r3 + r5 * 2]
+    movu            m8, [r4 + r5 * 2]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 2 + mmsize]
+    movu            m5, [r1 + r5 * 2 + mmsize]
+    movu            m6, [r2 + r5 * 2 + mmsize]
+    movu            m7, [r3 + r5 * 2 + mmsize]
+    movu            m8, [r4 + r5 * 2 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 3]
+    movu            m5, [r1 + r7]
+    movu            m6, [r2 + r7]
+    movu            m7, [r3 + r7]
+    movu            m8, [r4 + r7]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 3 + mmsize]
+    movu            m5, [r1 + r7 + mmsize]
+    movu            m6, [r2 + r7 + mmsize]
+    movu            m7, [r3 + r7 + mmsize]
+    movu            m8, [r4 + r7 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    movu            m4, [r0]
+    movu            m5, [r1]
+    movu            m6, [r2]
+    movu            m7, [r3]
+    movu            m8, [r4]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + mmsize]
+    movu            m5, [r1 + mmsize]
+    movu            m6, [r2 + mmsize]
+    movu            m7, [r3 + mmsize]
+    movu            m8, [r4 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE]
+    movu            m5, [r1 + r5]
+    movu            m6, [r2 + r5]
+    movu            m7, [r3 + r5]
+    movu            m8, [r4 + r5]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE + mmsize]
+    movu            m5, [r1 + r5 + mmsize]
+    movu            m6, [r2 + r5 + mmsize]
+    movu            m7, [r3 + r5 + mmsize]
+    movu            m8, [r4 + r5 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4

x265_1.8.tar.gz/source/common/x86/sad16-a.asm -> x265_1.9.tar.gz/source/common/x86/sad16-a.asm Changed

@@ -413,77 +413,50 @@
 SAD  16, 32
 
 INIT_YMM avx2
-cglobal pixel_sad_16x64, 4,7,4
+cglobal pixel_sad_16x64, 4,5,5
     pxor    m0, m0
-    pxor    m3, m3
-    mov     r4d, 64 / 8
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r5,     [r1 * 3]
-    lea     r6,     [r3 * 3]
+    mov     r4d, 16
+    mova    m4, [pw_1]
 .loop:
     movu    m1, [r2]
-    movu    m2, [r2 + r3]
+    movu    m2, [r2 + r3 * 2]
     psubw   m1, [r0]
-    psubw   m2, [r0 + r1]
-    pabsw   m1, m1
-    pabsw   m2, m2
-    paddw   m0, m1
-    paddw   m3, m2
-
-    movu    m1, [r2 + 2 * r3]
-    movu    m2, [r2 + r6]
-    psubw   m1, [r0 + 2 * r1]
-    psubw   m2, [r0 + r5]
+    psubw   m2, [r0 + r1 * 2]
     pabsw   m1, m1
     pabsw   m2, m2
-    paddw   m0, m1
-    paddw   m3, m2
-
+    paddw   m3, m1, m2
     lea     r0, [r0 + 4 * r1]
     lea     r2, [r2 + 4 * r3]
 
     movu    m1, [r2]
-    movu    m2, [r2 + r3]
+    movu    m2, [r2 + r3 * 2]
     psubw   m1, [r0]
-    psubw   m2, [r0 + r1]
+    psubw   m2, [r0 + r1 * 2]
     pabsw   m1, m1
     pabsw   m2, m2
-    paddw   m0, m1
-    paddw   m3, m2
-
-    movu    m1, [r2 + 2 * r3]
-    movu    m2, [r2 + r6]
-    psubw   m1, [r0 + 2 * r1]
-    psubw   m2, [r0 + r5]
-    pabsw   m1, m1
-    pabsw   m2, m2
-    paddw   m0, m1
-    paddw   m3, m2
-
-    lea     r0, [r0 + 4 * r1]
-    lea     r2, [r2 + 4 * r3]
-
-    dec    r4d
-    jg .loop
-
-    HADDUWD m0, m1
-    HADDUWD m3, m1
-    HADDD   m0, m1
-    HADDD   m3, m1
+    paddw   m1, m2
+    pmaddwd m3, m4
     paddd   m0, m3
+    pmaddwd m1, m4
+    paddd   m0, m1
+    lea     r0, [r0+4*r1]
+    lea     r2, [r2+4*r3]
+    dec     r4d
+    jg      .loop
 
+    HADDD   m0, m1
     movd    eax, xm0
     RET
 
 INIT_YMM avx2
-cglobal pixel_sad_32x8, 4,7,5
+cglobal pixel_sad_32x8, 4,7,7
     pxor    m0, m0
     mov     r4d, 8/4
+    mova    m6, [pw_1]
     add     r3d, r3d
     add     r1d, r1d
-    lea     r5,     [r1 * 3]
-    lea     r6,     [r3 * 3]
+    lea     r5d,     [r1 * 3]
+    lea     r6d,     [r3 * 3]
 .loop:
     movu    m1, [r2]
     movu    m2, [r2 + 32]
@@ -499,8 +472,7 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m5, m1, m3
 
     movu    m1, [r2 + 2 * r3]
     movu    m2, [r2 + 2 * r3 + 32]
@@ -518,24 +490,28 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m1, m3
 
+    pmaddwd m5, m6
+    paddd   m0, m5
+    pmaddwd m1, m6
+    paddd   m0, m1
     dec    r4d
     jg .loop
 
-    HADDW   m0, m1
+    HADDD   m0, m1
     movd    eax, xm0
     RET
 
 INIT_YMM avx2
-cglobal pixel_sad_32x16, 4,7,5
+cglobal pixel_sad_32x16, 4,7,7
     pxor    m0, m0
     mov     r4d, 16/8
+    mova    m6, [pw_1]
     add     r3d, r3d
     add     r1d, r1d
-    lea     r5,     [r1 * 3]
-    lea     r6,     [r3 * 3]
+    lea     r5d,     [r1 * 3]
+    lea     r6d,     [r3 * 3]
 .loop:
     movu    m1, [r2]
     movu    m2, [r2 + 32]
@@ -551,8 +527,7 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m5, m1, m3
 
     movu    m1, [r2 + 2 * r3]
     movu    m2, [r2 + 2 * r3 + 32]
@@ -570,8 +545,12 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m1, m3
+
+    pmaddwd m5, m6
+    paddd   m0, m5
+    pmaddwd m1, m6
+    paddd   m0, m1
 
     movu    m1, [r2]
     movu    m2, [r2 + 32]
@@ -587,8 +566,7 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m5, m1, m3
 
     movu    m1, [r2 + 2 * r3]
     movu    m2, [r2 + 2 * r3 + 32]
@@ -606,24 +584,28 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m1, m3
 
+    pmaddwd m5, m6
+    paddd   m0, m5
+    pmaddwd m1, m6
+    paddd   m0, m1
     dec    r4d
     jg .loop
 
-    HADDW   m0, m1
+    HADDD   m0, m1
     movd    eax, xm0
     RET
 
 INIT_YMM avx2

x265_1.8.tar.gz/source/common/x86/ssd-a.asm -> x265_1.9.tar.gz/source/common/x86/ssd-a.asm Changed

@@ -2,11 +2,13 @@
 ;* ssd-a.asm: x86 ssd functions
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Alex Izvorski <aizvorksi@gmail.com>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -105,8 +107,32 @@
     dec    r4d
     jg .loop
 %endif
+%if BIT_DEPTH == 12 && %1 >= 16 && %2 >=16
+%if  mmsize == 16
+    movu            m5, m0
+    pxor            m6, m6
+    punpckldq       m0, m6
+    punpckhdq       m5, m6
+    paddq           m0, m5
+    movhlps         m5, m0
+    paddq           m0, m5
+    movq            r6, xm0
+%elif mmsize == 32
+    movu            m1, m0
+    pxor            m2, m2
+    punpckldq       m0, m2
+    punpckhdq       m1, m2
+    paddq           m0, m1
+    vextracti128    xm2, m0, 1
+    paddq           xm2, xm0
+    movhlps         xm1, xm2
+    paddq           xm2, xm1
+    movq            rax, xm2
+%endif
+%else 
     HADDD   m0, m5
-    movd   eax, xm0
+    movd    eax,xm0
+%endif
 %ifidn movu,movq ; detect MMX
     EMMS
 %endif
@@ -168,6 +194,154 @@
     movq        rax, m9
     RET
 %endmacro
+%macro SSD_ONE_SS_32 0
+cglobal pixel_ssd_ss_32x32, 4,5,8
+    add         r1d, r1d
+    add         r3d, r3d
+    pxor        m5, m5
+    pxor        m6, m6
+    mov         r4d, 2
+
+.iterate:
+    mov         r5d, 16
+    pxor        m4, m4
+    pxor        m7, m7
+.loop:
+    movu        m0, [r0]
+    movu        m1, [r0 + mmsize]
+    movu        m2, [r2]
+    movu        m3, [r2 + mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+    movu        m0, [r0 + 2 * mmsize]
+    movu        m1, [r0 + 3 * mmsize]
+    movu        m2, [r2 + 2 * mmsize]
+    movu        m3, [r2 + 3 * mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+
+    add         r0, r1
+    add         r2, r3
+
+    dec         r5d
+    jnz         .loop
+
+    mova        m0, m4
+    pxor        m1, m1
+    punpckldq   m0, m1
+    punpckhdq   m4, m1
+    paddq       m5, m0
+    paddq       m6, m4
+
+    mova        m0, m7
+    punpckldq   m0, m1
+    punpckhdq   m7, m1
+    paddq       m5, m0
+    paddq       m6, m7
+
+    dec         r4d
+    jnz         .iterate
+
+    paddq       m5, m6
+    movhlps     m2, m5
+    paddq       m5, m2
+    movq        rax, m5
+    RET
+%endmacro
+
+%macro SSD_ONE_SS_64 0
+cglobal pixel_ssd_ss_64x64, 4,6,8
+    add         r1d, r1d
+    add         r3d, r3d
+    pxor        m5, m5
+    pxor        m6, m6
+    mov         r5d, 8
+
+.iterate:
+    pxor        m4, m4
+    pxor        m7, m7
+    mov         r4d, 8
+
+.loop:
+    ;----process 1st half a row----
+    movu        m0, [r0]
+    movu        m1, [r0 + mmsize]
+    movu        m2, [r2]
+    movu        m3, [r2 + mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+    movu        m0, [r0 + 2 * mmsize]
+    movu        m1, [r0 + 3 * mmsize]
+    movu        m2, [r2 + 2 * mmsize]
+    movu        m3, [r2 + 3 * mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+    ;----process 2nd half a row----
+    movu        m0, [r0 + 4 * mmsize]
+    movu        m1, [r0 + 5 * mmsize]
+    movu        m2, [r2 + 4 * mmsize]
+    movu        m3, [r2 + 5 * mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+    movu        m0, [r0 + 6 * mmsize]
+    movu        m1, [r0 + 7 * mmsize]
+    movu        m2, [r2 + 6 * mmsize]
+    movu        m3, [r2 + 7 * mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+
+    add         r0, r1
+    add         r2, r3
+
+    dec         r4d
+    jnz         .loop
+
+    mova        m0, m4
+    pxor        m1, m1
+    punpckldq   m0, m1
+    punpckhdq   m4, m1
+    paddq       m5, m0
+    paddq       m6, m4
+
+    mova        m0, m7
+    punpckldq   m0, m1
+    punpckhdq   m7, m1
+    paddq       m5, m0
+    paddq       m6, m7
+
+    dec         r5
+    jne         .iterate
+
+    paddq       m5, m6
+    movhlps     m2, m5
+    paddq       m5, m2
+    movq        rax, m5
+    RET
+%endmacro

x265_1.8.tar.gz/source/common/x86/x86util.asm -> x265_1.9.tar.gz/source/common/x86/x86util.asm Changed

x265_1.8.tar.gz/source/common/yuv.cpp -> x265_1.9.tar.gz/source/common/yuv.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -50,7 +51,7 @@
     {
         CHECKED_MALLOC(m_buf[0], pixel, size * size + 8);
         m_buf[1] = m_buf[2] = 0;
-        m_csize = MAX_INT;
+        m_csize = 0;
         return true;
     }
     else
@@ -82,22 +83,26 @@
 {
     pixel* dstY = dstPic.getLumaAddr(cuAddr, absPartIdx);
     primitives.cu[m_part].copy_pp(dstY, dstPic.m_stride, m_buf[0], m_size);
-
-    pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
-    pixel* dstV = dstPic.getCrAddr(cuAddr, absPartIdx);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstPic.m_strideC, m_buf[1], m_csize);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstPic.m_strideC, m_buf[2], m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
+        pixel* dstV = dstPic.getCrAddr(cuAddr, absPartIdx);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstPic.m_strideC, m_buf[1], m_csize);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstPic.m_strideC, m_buf[2], m_csize);
+    }
 }
 
 void Yuv::copyFromPicYuv(const PicYuv& srcPic, uint32_t cuAddr, uint32_t absPartIdx)
 {
     const pixel* srcY = srcPic.getLumaAddr(cuAddr, absPartIdx);
     primitives.cu[m_part].copy_pp(m_buf[0], m_size, srcY, srcPic.m_stride);
-
-    const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
-    const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcU, srcPic.m_strideC);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcV, srcPic.m_strideC);
+    if (m_csp != X265_CSP_I400)
+    {
+        const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
+        const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcU, srcPic.m_strideC);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcV, srcPic.m_strideC);
+    }
 }
 
 void Yuv::copyFromYuv(const Yuv& srcYuv)
@@ -105,8 +110,11 @@
     X265_CHECK(m_size >= srcYuv.m_size, "invalid size\n");
 
     primitives.cu[m_part].copy_pp(m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
+    }
 }
 
 /* This version is intended for use by ME, which required FENC_STRIDE for luma fenc pixels */
@@ -130,11 +138,13 @@
 {
     pixel* dstY = dstYuv.getLumaAddr(absPartIdx);
     primitives.cu[m_part].copy_pp(dstY, dstYuv.m_size, m_buf[0], m_size);
-
-    pixel* dstU = dstYuv.getCbAddr(absPartIdx);
-    pixel* dstV = dstYuv.getCrAddr(absPartIdx);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstYuv.m_csize, m_buf[1], m_csize);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstYuv.m_csize, m_buf[2], m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        pixel* dstU = dstYuv.getCbAddr(absPartIdx);
+        pixel* dstV = dstYuv.getCrAddr(absPartIdx);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstYuv.m_csize, m_buf[1], m_csize);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstYuv.m_csize, m_buf[2], m_csize);
+    }
 }
 
 void Yuv::copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const
@@ -142,20 +152,25 @@
     pixel* srcY = m_buf[0] + getAddrOffset(absPartIdx, m_size);
     pixel* dstY = dstYuv.m_buf[0];
     primitives.cu[dstYuv.m_part].copy_pp(dstY, dstYuv.m_size, srcY, m_size);
-
-    pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
-    pixel* srcV = m_buf[2] + getChromaAddrOffset(absPartIdx);
-    pixel* dstU = dstYuv.m_buf[1];
-    pixel* dstV = dstYuv.m_buf[2];
-    primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
-    primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
+        pixel* srcV = m_buf[2] + getChromaAddrOffset(absPartIdx);
+        pixel* dstU = dstYuv.m_buf[1];
+        pixel* dstV = dstYuv.m_buf[2];
+        primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
+        primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
+    }
 }
 
 void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL)
 {
     primitives.cu[log2SizeL - 2].add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
-    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
-    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
+        primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+    }
 }
 
 void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)

x265_1.8.tar.gz/source/encoder/analysis.cpp -> x265_1.9.tar.gz/source/encoder/analysis.cpp Changed

@@ -3,6 +3,7 @@
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
 *          Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -71,12 +72,11 @@
 
 Analysis::Analysis()
 {
-    m_reuseIntraDataCTU = NULL;
     m_reuseInterDataCTU = NULL;
     m_reuseRef = NULL;
     m_reuseBestMergeCand = NULL;
+    m_reuseMv = NULL;
 }
-
 bool Analysis::create(ThreadLocalData *tld)
 {
     m_tld = tld;
@@ -127,9 +127,6 @@
     m_frame = &frame;
 
 #if _DEBUG || CHECKED_BUILD
-    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
-        for (uint32_t j = 0; j < MAX_PRED_TYPES; j++)
-            m_modeDepth[i].pred[j].invalidate();
     invalidateContexts(0);
 #endif
 
@@ -140,40 +137,46 @@
     m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
 
     uint32_t numPartition = ctu.m_numPartitions;
-    if (m_param->analysisMode)
+    if (m_param->analysisMode && m_slice->m_sliceType != I_SLICE)
     {
-        if (m_slice->m_sliceType == I_SLICE)
-            m_reuseIntraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
-        else
-        {
-            int numPredDir = m_slice->isInterP() ? 1 : 2;
-            m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
-            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
-            m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
-        }
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
+        m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
+        m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
+        m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
+        m_reuseMv = &m_reuseInterDataCTU->mv[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
     }
-
     ProfileCUScope(ctu, totalCTUTime, totalCTUs);
 
-    uint32_t zOrder = 0;
     if (m_slice->m_sliceType == I_SLICE)
     {
-        compressIntraCU(ctu, cuGeom, zOrder, qp);
-        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
+        analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
+        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
+        {
+            memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
+            memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
+            memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
+            memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
+        }
+        compressIntraCU(ctu, cuGeom, qp);
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && intraDataCTU)
         {
             CUData* bestCU = &m_modeDepth[0].bestMode->cu;
-            memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
-            memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
-            memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
-            memcpy(&m_reuseIntraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
+            memcpy(&intraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
+            memcpy(&intraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
+            memcpy(&intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
+            memcpy(&intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
         }
     }
     else
     {
-        if (!m_param->rdLevel)
+        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
+            ctu.m_cuPelX / g_maxCUSize >= frame.m_encData->m_pir.pirStartCol
+            && ctu.m_cuPelX / g_maxCUSize < frame.m_encData->m_pir.pirEndCol)
+            compressIntraCU(ctu, cuGeom, qp);
+        else if (!m_param->rdLevel)
         {
             /* In RD Level 0/1, copy source pixels into the reconstructed block so
-            * they are available for intra predictions */
+             * they are available for intra predictions */
             m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
 
             compressInterCU_rd0_4(ctu, cuGeom, qp);
@@ -187,6 +190,7 @@
             compressInterCU_rd0_4(ctu, cuGeom, qp);
         else
         {
+            uint32_t zOrder = 0;
             compressInterCU_rd5_6(ctu, cuGeom, zOrder, qp);
             if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
             {
@@ -212,8 +216,7 @@
         md.pred[PRED_LOSSLESS].initCosts();
         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
         PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
-        uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
-        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL);
+        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size);
         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
     }
     else
@@ -226,7 +229,7 @@
     }
 }
 
-void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder, int32_t qp)
+void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
 {
     uint32_t depth = cuGeom.depth;
     ModeDepth& md = m_modeDepth[depth];
@@ -235,42 +238,37 @@
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
 
-    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
-    {
-        uint8_t* reuseDepth  = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        uint8_t* reuseModes  = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        uint8_t* reuseChromaModes = &m_reuseIntraDataCTU->chromaModes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+    bool bAlreadyDecided = parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX;
+    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
 
-        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx)
+    if (bAlreadyDecided)
+    {
+        if (bDecidedDepth)
         {
-            PartSize size = (PartSize)reusePartSizes[zOrder];
-            Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
+            Mode& mode = md.pred[0];
+            md.bestMode = &mode;
             mode.cu.initSubCU(parentCTU, cuGeom, qp);
-            checkIntra(mode, cuGeom, size, &reuseModes[zOrder], &reuseChromaModes[zOrder]);
-            checkBestMode(mode, depth);
+            memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
+            memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
+            checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);
 
             if (m_bTryLossless)
                 tryLossless(cuGeom);
 
             if (mightSplit)
                 addSplitFlagCost(*md.bestMode, cuGeom.depth);
-
-            // increment zOrder offset to point to next best depth in sharedDepth buffer
-            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
-            mightSplit = false;
         }
     }
-    else if (mightNotSplit)
+    else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
     {
         md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
-        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
+        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
         checkBestMode(md.pred[PRED_INTRA], depth);
 
         if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
         {
             md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
-            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
+            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
             checkBestMode(md.pred[PRED_INTRA_NxN], depth);
         }
 
@@ -281,6 +279,9 @@
             addSplitFlagCost(*md.bestMode, cuGeom.depth);
     }
 
+    // stop recursion if we reach the depth of previous analysis decision
+    mightSplit &= !(bAlreadyDecided && bDecidedDepth);
+
     if (mightSplit)
     {
         Mode* splitPred = &md.pred[PRED_SPLIT];
@@ -305,7 +306,7 @@
                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
 
-                compressIntraCU(parentCTU, childGeom, zOrder, nextQP);
+                compressIntraCU(parentCTU, childGeom, nextQP);
 
                 // Save best CU and pred data for this sub CU

x265_1.8.tar.gz/source/encoder/analysis.h -> x265_1.9.tar.gz/source/encoder/analysis.h Changed

@@ -3,6 +3,7 @@
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
 *          Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -40,6 +41,21 @@
 
 class Entropy;
 
+struct SplitData
+{
+    uint32_t splitRefs;
+    uint32_t mvCost[2];
+    uint64_t sa8dCost;
+
+    void initSplitCUData()
+    {
+        splitRefs = 0;
+        mvCost[0] = 0; // L0
+        mvCost[1] = 0; // L1
+        sa8dCost    = 0;
+    }
+};
+
 class Analysis : public Search
 {
 public:
@@ -101,20 +117,20 @@
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
 
 protected:
-
     /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
-    analysis_intra_data* m_reuseIntraDataCTU;
     analysis_inter_data* m_reuseInterDataCTU;
+    MV*                  m_reuseMv;
     int32_t*             m_reuseRef;
     uint32_t*            m_reuseBestMergeCand;
+    uint32_t m_splitRefIdx[4];
 
     /* full analysis for an I-slice CU */
-    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
+    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
 
     /* full analysis for a P or B slice CU */
-    void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
-    uint32_t compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
-    uint32_t compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
+    uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
+    SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
+    SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
 
     /* measure merge and skip */
     void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
@@ -139,13 +155,11 @@
     /* generate residual and recon pixels for an entire CTU recursively (RD0) */
     void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
 
-    int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom);
+    int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, double baseQP = -1);
 
     /* check whether current mode is the new best */
     inline void checkBestMode(Mode& mode, uint32_t depth)
     {
-        X265_CHECK(mode.ok(), "mode costs are uninitialized\n");
-
         ModeDepth& md = m_modeDepth[depth];
         if (md.bestMode)
         {

x265_1.8.tar.gz/source/encoder/api.cpp -> x265_1.9.tar.gz/source/encoder/api.cpp Changed

@@ -72,9 +72,7 @@
 #endif
 
 #if HIGH_BIT_DEPTH
-    if (X265_DEPTH == 12)
-        x265_log(p, X265_LOG_WARNING, "Main12 is HIGHLY experimental, do not use!\n");
-    else if (X265_DEPTH != 10 && X265_DEPTH != 12)
+    if (X265_DEPTH != 10 && X265_DEPTH != 12)
 #else
     if (X265_DEPTH != 8)
 #endif
@@ -247,6 +245,16 @@
     }
 }
 
+int x265_encoder_intra_refresh(x265_encoder *enc)
+{
+    if (!enc)
+        return -1;
+
+    Encoder *encoder = static_cast<Encoder*>(enc);
+    encoder->m_bQueuedIntraRefresh = 1;
+    return 0;
+}
+
 void x265_cleanup(void)
 {
     if (!g_ctuSizeConfigured)
@@ -268,6 +276,7 @@
     pic->bitDepth = param->internalBitDepth;
     pic->colorSpace = param->internalCsp;
     pic->forceqp = X265_QP_AUTO;
+    pic->quantOffsets = NULL;
     if (param->analysisMode)
     {
         uint32_t widthInCU       = (param->sourceWidth  + g_maxCUSize - 1) >> g_maxLog2CUSize;
@@ -318,6 +327,7 @@
     &x265_cleanup,
 
     sizeof(x265_frame_stats),
+    &x265_encoder_intra_refresh,
 };
 
 typedef const x265_api* (*api_get_func)(int bitDepth);

x265_1.8.tar.gz/source/encoder/bitcost.cpp -> x265_1.9.tar.gz/source/encoder/bitcost.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -40,7 +41,12 @@
             x265_emms(); // just to be safe
 
             CalculateLogs();
-            s_costs[qp] = new uint16_t[4 * BC_MAX_MV + 1] + 2 * BC_MAX_MV;
+            s_costs[qp] = X265_MALLOC(uint16_t, 4 * BC_MAX_MV + 1) + 2 * BC_MAX_MV;
+            if (!s_costs[qp])
+            {
+                x265_log(NULL, X265_LOG_ERROR, "BitCost s_costs buffer allocation failure\n");
+                return;
+            }
             double lambda = x265_lambda_tab[qp];
 
             // estimate same cost for negative and positive MVD
@@ -66,11 +72,16 @@
 {
     if (!s_bitsizes)
     {
-        s_bitsizes = new float[2 * BC_MAX_MV + 1];
+        s_bitsizes = X265_MALLOC(float, 4 * BC_MAX_MV + 1) + 2 * BC_MAX_MV;
+        if (!s_bitsizes)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "BitCost s_bitsizes buffer allocation failure\n");
+            return;
+        }
         s_bitsizes[0] = 0.718f;
         float log2_2 = 2.0f / log(2.0f);  // 2 x 1/log(2)
         for (int i = 1; i <= 2 * BC_MAX_MV; i++)
-            s_bitsizes[i] = log((float)(i + 1)) * log2_2 + 1.718f;
+            s_bitsizes[i] = s_bitsizes[-i] = log((float)(i + 1)) * log2_2 + 1.718f;
     }
 }
 
@@ -80,12 +91,15 @@
     {
         if (s_costs[i])
         {
-            delete [] (s_costs[i] - 2 * BC_MAX_MV);
+            X265_FREE(s_costs[i] - 2 * BC_MAX_MV);
 
-            s_costs[i] = 0;
+            s_costs[i] = NULL;
         }
     }
 
-    delete [] s_bitsizes;
-    s_bitsizes = 0;
+    if (s_bitsizes)
+    {
+        X265_FREE(s_bitsizes - 2 * BC_MAX_MV);
+        s_bitsizes = NULL;
+    }
 }

x265_1.8.tar.gz/source/encoder/bitcost.h -> x265_1.9.tar.gz/source/encoder/bitcost.h Changed

x265_1.8.tar.gz/source/encoder/dpb.cpp -> x265_1.9.tar.gz/source/encoder/dpb.cpp Changed

@@ -47,16 +47,16 @@
         delete curFrame;
     }
 
-    while (m_picSymFreeList)
+    while (m_frameDataFreeList)
     {
-        FrameData* next = m_picSymFreeList->m_freeListNext;
-        m_picSymFreeList->destroy();
+        FrameData* next = m_frameDataFreeList->m_freeListNext;
+        m_frameDataFreeList->destroy();
 
-        m_picSymFreeList->m_reconPic->destroy();
-        delete m_picSymFreeList->m_reconPic;
+        m_frameDataFreeList->m_reconPic->destroy();
+        delete m_frameDataFreeList->m_reconPic;
 
-        delete m_picSymFreeList;
-        m_picSymFreeList = next;
+        delete m_frameDataFreeList;
+        m_frameDataFreeList = next;
     }
 }
 
@@ -74,13 +74,19 @@
             curFrame->m_reconRowCount.set(0);
             curFrame->m_bChromaExtended = false;
 
+            // Reset column counter
+            X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
+            X265_CHECK(curFrame->m_numRows > 0, "curFrame->m_numRows check failure");
+            for(int32_t col = 0; col < curFrame->m_numRows; col++)
+                curFrame->m_reconColCount[col].set(0);
+
             // iterator is invalidated by remove, restart scan
             m_picList.remove(*curFrame);
             iterFrame = m_picList.first();
 
             m_freeList.pushBack(*curFrame);
-            curFrame->m_encData->m_freeListNext = m_picSymFreeList;
-            m_picSymFreeList = curFrame->m_encData;
+            curFrame->m_encData->m_freeListNext = m_frameDataFreeList;
+            m_frameDataFreeList = curFrame->m_encData;
             curFrame->m_encData = NULL;
             curFrame->m_reconPic = NULL;
         }
@@ -171,7 +177,7 @@
     {
         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
         {
-            Frame *refpic = slice->m_refPicList[l][ref];
+            Frame *refpic = slice->m_refFrameList[l][ref];
             ATOMIC_INC(&refpic->m_countRefEncoders);
         }
     }

x265_1.8.tar.gz/source/encoder/dpb.h -> x265_1.9.tar.gz/source/encoder/dpb.h Changed

x265_1.8.tar.gz/source/encoder/encoder.cpp -> x265_1.9.tar.gz/source/encoder/encoder.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -39,6 +40,10 @@
 
 #include "x265.h"
 
+#if _MSC_VER
+#pragma warning(disable: 4996) // POSIX functions are just fine, thanks
+#endif
+
 namespace X265_NS {
 const char g_sliceTypeToChar[] = {'B', 'P', 'I'};
 }
@@ -66,12 +71,9 @@
     m_outputCount = 0;
     m_param = NULL;
     m_latestParam = NULL;
-    m_cuOffsetY = NULL;
-    m_cuOffsetC = NULL;
-    m_buOffsetY = NULL;
-    m_buOffsetC = NULL;
     m_threadPool = NULL;
     m_analysisFile = NULL;
+    m_offsetEmergency = NULL;
     for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
         m_frameEncoder[i] = NULL;
 
@@ -191,6 +193,7 @@
     {
         x265_log(m_param, X265_LOG_ERROR, "Unable to allocate scaling list arrays\n");
         m_aborted = true;
+        return;
     }
     else if (!m_param->scalingLists || !strcmp(m_param->scalingLists, "off"))
         m_scalingList.m_bEnabled = false;
@@ -198,7 +201,6 @@
         m_scalingList.setDefaultScalingList();
     else if (m_scalingList.parseScalingList(m_param->scalingLists))
         m_aborted = true;
-    m_scalingList.setupQuantMatrices();
 
     m_lookahead = new Lookahead(m_param, m_threadPool);
     if (m_numPools)
@@ -213,6 +215,82 @@
     initVPS(&m_vps);
     initSPS(&m_sps);
     initPPS(&m_pps);
+   
+    if (m_param->rc.vbvBufferSize)
+    {
+        m_offsetEmergency = (uint16_t(*)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS])X265_MALLOC(uint16_t, MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS * (QP_MAX_MAX - QP_MAX_SPEC));
+        if (!m_offsetEmergency)
+        {
+            x265_log(m_param, X265_LOG_ERROR, "Unable to allocate memory\n");
+            m_aborted = true;
+            return;
+        }
+
+        bool scalingEnabled = m_scalingList.m_bEnabled;
+        if (!scalingEnabled)
+        {
+            m_scalingList.setDefaultScalingList();
+            m_scalingList.setupQuantMatrices();
+        }
+        else
+            m_scalingList.setupQuantMatrices();
+
+        for (int q = 0; q < QP_MAX_MAX - QP_MAX_SPEC; q++)
+        {
+            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
+            {
+                uint16_t *nrOffset = m_offsetEmergency[q][cat];
+
+                int trSize = cat & 3;
+
+                int coefCount = 1 << ((trSize + 2) * 2);
+
+                /* Denoise chroma first then luma, then DC. */
+                int dcThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
+                int lumaThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
+                int chromaThreshold = 0;
+
+                int thresh = (cat < 4 || (cat >= 8 && cat < 12)) ? lumaThreshold : chromaThreshold;
+
+                double quantF = (double)(1ULL << (q / 6 + 16 + 8));
+
+                for (int i = 0; i < coefCount; i++)
+                {
+                    /* True "emergency mode": remove all DCT coefficients */
+                    if (q == QP_MAX_MAX - QP_MAX_SPEC - 1)
+                    {
+                        nrOffset[i] = INT16_MAX;
+                        continue;
+                    }
+
+                    int iThresh = i == 0 ? dcThreshold : thresh;
+                    if (q < iThresh)
+                    {
+                        nrOffset[i] = 0;
+                        continue;
+                    }
+
+                    int numList = (cat >= 8) * 3 + ((int)!iThresh);
+
+                    double pos = (double)(q - iThresh + 1) / (QP_MAX_MAX - QP_MAX_SPEC - iThresh);
+                    double start = quantF / (m_scalingList.m_quantCoef[trSize][numList][QP_MAX_SPEC % 6][i]);
+
+                    // Formula chosen as an exponential scale to vaguely mimic the effects of a higher quantizer.
+                    double bias = (pow(2, pos * (QP_MAX_MAX - QP_MAX_SPEC)) * 0.003 - 0.003) * start;
+                    nrOffset[i] = (uint16_t)X265_MIN(bias + 0.5, INT16_MAX);
+                }
+            }
+        }
+
+        if (!scalingEnabled)
+        {
+            m_scalingList.m_bEnabled = false;
+            m_scalingList.m_bDataPresent = false;
+            m_scalingList.setupQuantMatrices();
+        }
+    }
+    else
+        m_scalingList.setupQuantMatrices();
 
     int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
     int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
@@ -259,6 +337,8 @@
     m_encodeStartTime = x265_mdate();
 
     m_nalList.m_annexB = !!m_param->bAnnexB;
+
+    m_emitCLLSEI = p->maxCLL || p->maxFALL;
 }
 
 void Encoder::stopJobs()
@@ -318,10 +398,7 @@
         delete m_rateControl;
     }
 
-    X265_FREE(m_cuOffsetY);
-    X265_FREE(m_cuOffsetC);
-    X265_FREE(m_buOffsetY);
-    X265_FREE(m_buOffsetC);
+    X265_FREE(m_offsetEmergency);
 
     if (m_analysisFile)
         fclose(m_analysisFile);
@@ -335,7 +412,6 @@
         free((char*)m_param->scalingLists);
         free((char*)m_param->numaPools);
         free((char*)m_param->masteringDisplayColorVolume);
-        free((char*)m_param->contentLightLevelInfo);
 
         PARAM_NS::x265_param_free(m_param);
     }
@@ -361,6 +437,45 @@
     }
 }
 
+void Encoder::calcRefreshInterval(Frame* frameEnc)
+{
+    Slice* slice = frameEnc->m_encData->m_slice;
+    uint32_t numBlocksInRow = slice->m_sps->numCuInWidth;
+    FrameData::PeriodicIR* pir = &frameEnc->m_encData->m_pir;
+    if (slice->m_sliceType == I_SLICE)
+    {
+        pir->framesSinceLastPir = 0;
+        m_bQueuedIntraRefresh = 0;
+        /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes
+         * the whole frame and counts as an intra refresh. */
+        pir->pirEndCol = numBlocksInRow;
+    }
+    else if (slice->m_sliceType == P_SLICE)
+    {
+        Frame* ref = frameEnc->m_encData->m_slice->m_refFrameList[0][0];
+        int pocdiff = frameEnc->m_poc - ref->m_poc;
+        int numPFramesInGOP = m_param->keyframeMax / pocdiff;
+        int increment = (numBlocksInRow + numPFramesInGOP - 1) / numPFramesInGOP;
+        pir->pirEndCol = ref->m_encData->m_pir.pirEndCol;
+        pir->framesSinceLastPir = ref->m_encData->m_pir.framesSinceLastPir + pocdiff;
+        if (pir->framesSinceLastPir >= m_param->keyframeMax ||
+            (m_bQueuedIntraRefresh && pir->pirEndCol >= numBlocksInRow))
+        {
+            pir->pirEndCol = 0;
+            pir->framesSinceLastPir = 0;
+            m_bQueuedIntraRefresh = 0;
+            frameEnc->m_lowres.bKeyframe = 1;
+        }
+        pir->pirStartCol = pir->pirEndCol;
+        pir->pirEndCol += increment;
+        /* If our intra refresh has reached the right side of the frame, we're done. */
+        if (pir->pirEndCol >= numBlocksInRow)
+        {
+            pir->pirEndCol = numBlocksInRow;

x265_1.8.tar.gz/source/encoder/encoder.h -> x265_1.9.tar.gz/source/encoder/encoder.h Changed

@@ -45,8 +45,10 @@
     double        m_psnrSumV;
     double        m_globalSsim;
     double        m_totalQp;
+    double        m_maxFALL;
     uint64_t      m_accBits;
     uint32_t      m_numPics;
+    uint16_t      m_maxCLL;
 
     EncStats()
     {
@@ -54,6 +56,8 @@
         m_accBits = 0;
         m_numPics = 0;
         m_totalQp = 0;
+        m_maxCLL = 0;
+        m_maxFALL = 0;
     }
 
     void addQP(double aveQp);
@@ -75,64 +79,62 @@
 {
 public:
 
-    int                m_pocLast;         // time index (POC)
-    int                m_encodedFrameNum;
-    int                m_outputCount;
+    uint32_t           m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+    uint32_t           m_countEmergency[MAX_NUM_TR_CATEGORIES];
+    uint16_t           (*m_offsetEmergency)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
 
-    int                m_bframeDelay;
     int64_t            m_firstPts;
     int64_t            m_bframeDelayTime;
     int64_t            m_prevReorderedPts[2];
+    int64_t            m_encodeStartTime;
 
-    ThreadPool*        m_threadPool;
-    FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
-    DPB*               m_dpb;
-
-    Frame*             m_exportedPic;
-
+    int                m_pocLast;         // time index (POC)
+    int                m_encodedFrameNum;
+    int                m_outputCount;
+    int                m_bframeDelay;
     int                m_numPools;
     int                m_curEncoder;
 
-    /* cached PicYuv offset arrays, shared by all instances of
-     * PicYuv created by this encoder */
-    intptr_t*          m_cuOffsetY;
-    intptr_t*          m_cuOffsetC;
-    intptr_t*          m_buOffsetY;
-    intptr_t*          m_buOffsetC;
-
-    /* Collect statistics globally */
-    EncStats           m_analyzeAll;
-    EncStats           m_analyzeI;
-    EncStats           m_analyzeP;
-    EncStats           m_analyzeB;
-    int64_t            m_encodeStartTime;
-
     // weighted prediction
     int                m_numLumaWPFrames;    // number of P frames with weighted luma reference
     int                m_numChromaWPFrames;  // number of P frames with weighted chroma reference
     int                m_numLumaWPBiFrames;  // number of B frames with weighted luma reference
     int                m_numChromaWPBiFrames; // number of B frames with weighted chroma reference
-    FILE*              m_analysisFile;
     int                m_conformanceMode;
-    VPS                m_vps;
-    SPS                m_sps;
-    PPS                m_pps;
-    NALList            m_nalList;
-    ScalingList        m_scalingList;      // quantization matrix information
-
     int                m_lastBPSEI;
     uint32_t           m_numDelayedPic;
 
+    ThreadPool*        m_threadPool;
+    FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
+    DPB*               m_dpb;
+    Frame*             m_exportedPic;
+    FILE*              m_analysisFile;
     x265_param*        m_param;
     x265_param*        m_latestParam;
     RateControl*       m_rateControl;
     Lookahead*         m_lookahead;
+
+    /* Collect statistics globally */
+    EncStats           m_analyzeAll;
+    EncStats           m_analyzeI;
+    EncStats           m_analyzeP;
+    EncStats           m_analyzeB;
+    VPS                m_vps;
+    SPS                m_sps;
+    PPS                m_pps;
+    NALList            m_nalList;
+    ScalingList        m_scalingList;      // quantization matrix information
     Window             m_conformanceWindow;
 
+    bool               m_emitCLLSEI;
     bool               m_bZeroLatency;     // x265_encoder_encode() returns NALs for the input picture, zero lag
     bool               m_aborted;          // fatal error detected
     bool               m_reconfigured;      // reconfigure of encoder detected
 
+    /* Begin intra refresh when one not in progress or else begin one as soon as the current 
+     * one is done. Requires bIntraRefresh to be set.*/
+    int                m_bQueuedIntraRefresh;
+
     Encoder();
     ~Encoder() {}
 
@@ -164,7 +166,9 @@
 
     void writeAnalysisFile(x265_analysis_data* pic);
 
-    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits, x265_frame_stats* frameStats);
+    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
+
+    void calcRefreshInterval(Frame* frameEnc);
 
 protected:

x265_1.8.tar.gz/source/encoder/entropy.cpp -> x265_1.9.tar.gz/source/encoder/entropy.cpp Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -429,7 +430,8 @@
     if (slice.m_sps->bUseSAO)
     {
         WRITE_FLAG(saoParam->bSaoFlag[0], "slice_sao_luma_flag");
-        WRITE_FLAG(saoParam->bSaoFlag[1], "slice_sao_chroma_flag");
+        if (encData.m_param->internalCsp != X265_CSP_I400)
+            WRITE_FLAG(saoParam->bSaoFlag[1], "slice_sao_chroma_flag");
     }
 
     // check if numRefIdx match the defaults (1, hard-coded in PPS). If not, override
@@ -828,6 +830,79 @@
     }
 }
 
+void Entropy::encodeTransformLuma(const CUData& cu, uint32_t absPartIdx, uint32_t curDepth, uint32_t log2CurSize,
+                              bool& bCodeDQP, const uint32_t depthRange[2])
+{
+    const bool subdiv = cu.m_tuDepth[absPartIdx] > curDepth;
+
+    /* in each of these conditions, the subdiv flag is implied and not signaled,
+     * so we have checks to make sure the implied value matches our intentions */
+    if (cu.isIntra(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && log2CurSize == MIN_LOG2_CU_SIZE)
+    {
+        X265_CHECK(subdiv, "intra NxN requires TU depth below CU depth\n");
+    }
+    else if (cu.isInter(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N &&
+             !curDepth && cu.m_slice->m_sps->quadtreeTUMaxDepthInter == 1)
+    {
+        X265_CHECK(subdiv, "inter TU must be smaller than CU when not 2Nx2N part size: log2CurSize %d, depthRange[0] %d\n", log2CurSize, depthRange[0]);
+    }
+    else if (log2CurSize > depthRange[1])
+    {
+        X265_CHECK(subdiv, "TU is larger than the max allowed, it should have been split\n");
+    }
+    else if (log2CurSize == cu.m_slice->m_sps->quadtreeTULog2MinSize || log2CurSize == depthRange[0])
+    {
+        X265_CHECK(!subdiv, "min sized TU cannot be subdivided\n");
+    }
+    else
+    {
+        X265_CHECK(log2CurSize > depthRange[0], "transform size failure\n");
+        codeTransformSubdivFlag(subdiv, 5 - log2CurSize);
+    }
+
+    if (subdiv)
+    {
+        --log2CurSize;
+        ++curDepth;
+
+        uint32_t qNumParts = 1 << (log2CurSize - LOG2_UNIT_SIZE) * 2;
+
+        encodeTransformLuma(cu, absPartIdx + 0 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
+        encodeTransformLuma(cu, absPartIdx + 1 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
+        encodeTransformLuma(cu, absPartIdx + 2 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
+        encodeTransformLuma(cu, absPartIdx + 3 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
+        return;
+    }
+
+    if (!cu.isIntra(absPartIdx) && !curDepth)
+    {
+        X265_CHECK(cu.getCbf(absPartIdx, TEXT_LUMA, 0), "CBF should have been set\n");
+    }
+    else
+        codeQtCbfLuma(cu, absPartIdx, curDepth);
+
+    uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, curDepth);
+
+    if (!cbfY)
+        return;
+
+    // dQP: only for CTU once
+    if (cu.m_slice->m_pps->bUseDQP && bCodeDQP)
+    {
+        uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
+        uint32_t absPartIdxLT = absPartIdx & (0xFF << (log2CUSize - LOG2_UNIT_SIZE) * 2);
+        codeDeltaQP(cu, absPartIdxLT);
+        bCodeDQP = false;
+    }
+
+    if (cbfY)
+    {
+        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);
+        codeCoeffNxN(cu, cu.m_trCoeff[0] + coeffOffset, absPartIdx, log2CurSize, TEXT_LUMA);
+    }
+}
+
+
 void Entropy::codePredInfo(const CUData& cu, uint32_t absPartIdx)
 {
     if (cu.isIntra(absPartIdx)) // If it is intra mode, encode intra prediction mode.
@@ -908,7 +983,10 @@
     }
 
     uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
-    encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
+    if (cu.m_chromaFormat == X265_CSP_I400)
+        encodeTransformLuma(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
+    else
+        encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
 }
 
 void Entropy::codeSaoOffset(const SaoCtuParam& ctuParam, int plane)
@@ -1010,7 +1088,7 @@
 void Entropy::codePredWeightTable(const Slice& slice)
 {
     const WeightParam *wp;
-    bool            bChroma      = true; // 4:0:0 not yet supported
+    bool            bChroma = slice.m_sps->chromaFormatIdc != X265_CSP_I400;
     bool            bDenomCoded  = false;
     int             numRefDirs   = slice.m_sliceType == B_SLICE ? 2 : 1;
     uint32_t        totalSignalledWeightFlags = 0;
@@ -1565,11 +1643,16 @@
     uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
     uint32_t c1 = 1;
     int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
-    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
+    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE) + 1]);   // extra 2 bytes(+1) space for AVX2 assembly, +1 because (numNonZero<=1) in costCoeffNxN path
     uint32_t numNonZero = 1;
     unsigned long lastNZPosInCG;
     unsigned long firstNZPosInCG;
 
+#if _DEBUG
+    // Unnecessary, for Valgrind-3.10.0 only
+    memset(absCoeff, 0, sizeof(absCoeff));
+#endif
+
     absCoeff[0] = (uint16_t)abs(coeff[posLast]);
 
     for (int subSet = lastScanSet; subSet >= 0; subSet--)
@@ -1715,6 +1798,7 @@
             {
                 // maximum g_entropyBits are 18-bits and maximum of count are 16, so intermedia of sum are 22-bits
                 const uint8_t *tabSigCtx = table_cnt[(log2TrSize == 2) ? 4 : (uint32_t)patternSigCtx];
+                X265_CHECK(numNonZero <= 1, "numNonZero check failure");
                 uint32_t sum = primitives.costCoeffNxN(g_scan4x4[codingParameters.scanType], &coeff[blkPosBase], (intptr_t)trSize, absCoeff + numNonZero, tabSigCtx, scanFlagMask, baseCtx, offset + posOffset, scanPosSigOff, subPosBase);
 
 #if CHECKED_BUILD || _DEBUG
@@ -1919,43 +2003,78 @@
         numCtx = bIsLuma ? 12 : 3;
     }
 
-    if (bIsLuma)
-    {
-        for (uint32_t bin = 0; bin < 2; bin++)
-            estBitsSbac.significantBits[bin][0] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX], bin);
+    const int ctxSigOffset = OFF_SIG_FLAG_CTX + (bIsLuma ? 0 : NUM_SIG_FLAG_CTX_LUMA);
+
+    estBitsSbac.significantBits[0][0] = sbacGetEntropyBits(m_contextState[ctxSigOffset], 0);
+    estBitsSbac.significantBits[1][0] = sbacGetEntropyBits(m_contextState[ctxSigOffset], 1);
 
-        for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
-            for (uint32_t bin = 0; bin < 2; bin++)
-                estBitsSbac.significantBits[bin][ctxIdx] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + ctxIdx], bin);
+    for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
+    {
+        estBitsSbac.significantBits[0][ctxIdx] = sbacGetEntropyBits(m_contextState[ctxSigOffset + ctxIdx], 0);
+        estBitsSbac.significantBits[1][ctxIdx] = sbacGetEntropyBits(m_contextState[ctxSigOffset + ctxIdx], 1);
     }
-    else
+
+    const uint32_t maxGroupIdx = log2TrSize * 2 - 1;
+    if (bIsLuma)
     {
-        for (uint32_t bin = 0; bin < 2; bin++)
-            estBitsSbac.significantBits[bin][0] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + 0)], bin);
+        if (log2TrSize == 2)
+        {
+            for (int i = 0, ctxIdx = 0; i < 2; i++, ctxIdx += NUM_CTX_LAST_FLAG_XY)
+            {
+                int bits = 0;
+                const uint8_t *ctxState = &m_contextState[OFF_CTX_LAST_FLAG_X + ctxIdx];
 
-        for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
-            for (uint32_t bin = 0; bin < 2; bin++)
-                estBitsSbac.significantBits[bin][ctxIdx] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + ctxIdx)], bin);
-    }
+                for (uint32_t ctx = 0; ctx < 3; ctx++)
+                {
+                    estBitsSbac.lastBits[i][ctx] = bits + sbacGetEntropyBits(ctxState[ctx], 0);
+                    bits += sbacGetEntropyBits(ctxState[ctx], 1);
+                }
 
-    int blkSizeOffset = bIsLuma ? ((log2TrSize - 2) * 3 + ((log2TrSize - 1) >> 2)) : NUM_CTX_LAST_FLAG_XY_LUMA;
-    int ctxShift = bIsLuma ? ((log2TrSize + 1) >> 2) : log2TrSize - 2;
-    uint32_t maxGroupIdx = log2TrSize * 2 - 1;
+                estBitsSbac.lastBits[i][maxGroupIdx] = bits;
+            }
+        }
+        else
+        {
+            const int blkSizeOffset = ((log2TrSize - 2) * 3 + (log2TrSize == 5));

x265_1.8.tar.gz/source/encoder/entropy.h -> x265_1.9.tar.gz/source/encoder/entropy.h Changed

x265_1.8.tar.gz/source/encoder/frameencoder.cpp -> x265_1.9.tar.gz/source/encoder/frameencoder.cpp Changed

@@ -104,7 +104,8 @@
     m_param = top->m_param;
     m_numRows = numRows;
     m_numCols = numCols;
-    m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ?
+    m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked)
+                        || (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ?
                         2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
     m_filterRowDelayCus = m_filterRowDelay * numCols;
     m_rows = new CTURow[m_numRows];
@@ -124,7 +125,7 @@
         m_pool = NULL;
     }
 
-    m_frameFilter.init(top, this, numRows);
+    m_frameFilter.init(top, this, numRows, numCols);
 
     // initialize HRD parameters of SPS
     if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
@@ -135,7 +136,7 @@
         ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
     }
 
-    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
         m_nr = X265_MALLOC(NoiseReduction, 1);
     if (m_nr)
         memset(m_nr, 0, sizeof(NoiseReduction));
@@ -275,7 +276,7 @@
         m_localTldIdx = 0;
     }
 
-    m_done.trigger();     /* signal that thread is initialized */ 
+    m_done.trigger();     /* signal that thread is initialized */
     m_enable.wait();      /* Encoder::encode() triggers this event */
 
     while (m_threadActive)
@@ -357,15 +358,52 @@
             WeightParam *w = NULL;
             if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
                 w = slice->m_weightPredTable[l][ref];
-            m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param);
+            slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
+            m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
         }
     }
 
+    int numTLD;
+    if (m_pool)
+        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
+    else
+        numTLD = 1;
+
     /* Get the QP for this frame from rate control. This call may block until
      * frames ahead of it in encode order have called rateControlEnd() */
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
     m_rce.newQp = qp;
 
+    if (m_nr)
+    {
+        if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
+        {
+            for (int i = 0; i < numTLD; i++)
+            {
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_top->m_offsetEmergency[qp - QP_MAX_SPEC - 1];
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_top->m_residualSumEmergency;
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_top->m_countEmergency;
+            }
+        }
+        else
+        {
+            if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
+            {
+                for (int i = 0; i < numTLD; i++)
+                {
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrOffsetDenoise;
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrResidualSum;
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrCount;
+                }
+            }
+            else
+            {
+                for (int i = 0; i < numTLD; i++)
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = NULL;
+            }
+        }
+    }
+
     /* Clip slice QP to 0-51 spec range before encoding */
     slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
 
@@ -458,7 +496,7 @@
     /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to 
      * tune RateControl parameters for other frames.
      * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
-     * RateControlEnd here, after the slicecontexts are initialized. For the rest - ABR
+     * RateControlEnd here, after the slice contexts are initialized. For the rest - ABR
      * and VBV, unlock only after rateControlUpdateStats of this frame is called */
     if (m_param->rc.rateControlMode != X265_RC_ABR && !m_top->m_rateControl->m_isVbv)
     {
@@ -482,7 +520,7 @@
             {
                 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
                 {
-                    Frame *refpic = slice->m_refPicList[l][ref];
+                    Frame *refpic = slice->m_refFrameList[l][ref];
 
                     uint32_t reconRowCount = refpic->m_reconRowCount.get();
                     while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows))
@@ -521,7 +559,7 @@
                     int list = l;
                     for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++)
                     {
-                        Frame *refpic = slice->m_refPicList[list][ref];
+                        Frame *refpic = slice->m_refFrameList[list][ref];
 
                         uint32_t reconRowCount = refpic->m_reconRowCount.get();
                         while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows))
@@ -572,10 +610,7 @@
         m_frame->m_encData->m_frameStats.lumaDistortion   += m_rows[i].rowStats.lumaDistortion;
         m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion;
         m_frame->m_encData->m_frameStats.psyEnergy        += m_rows[i].rowStats.psyEnergy;
-        m_frame->m_encData->m_frameStats.lumaLevel        += m_rows[i].rowStats.lumaLevel;
-
-        if (m_rows[i].rowStats.maxLumaLevel > m_frame->m_encData->m_frameStats.maxLumaLevel)
-            m_frame->m_encData->m_frameStats.maxLumaLevel = m_rows[i].rowStats.maxLumaLevel;
+        m_frame->m_encData->m_frameStats.resEnergy        += m_rows[i].rowStats.resEnergy;
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
         {
             m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth];
@@ -589,7 +624,7 @@
     m_frame->m_encData->m_frameStats.avgLumaDistortion   = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.avgPsyEnergy        = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
-    m_frame->m_encData->m_frameStats.avgLumaLevel        = m_frame->m_encData->m_frameStats.lumaLevel / m_frame->m_encData->m_frameStats.totalCtu;
+    m_frame->m_encData->m_frameStats.avgResEnergy        = (double)(m_frame->m_encData->m_frameStats.resEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.percentIntraNxN     = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
     {
@@ -626,22 +661,23 @@
 
     if (m_param->decodedPictureHashSEI)
     {
+        int planes = (m_frame->m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
         if (m_param->decodedPictureHashSEI == 1)
         {
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
-            for (int i = 0; i < 3; i++)
+            for (int i = 0; i < planes; i++)
                 MD5Final(&m_state[i], m_seiReconPictureDigest.m_digest[i]);
         }
         else if (m_param->decodedPictureHashSEI == 2)
         {
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
-            for (int i = 0; i < 3; i++)
+            for (int i = 0; i < planes; i++)
                 crcFinish(m_crc[i], m_seiReconPictureDigest.m_digest[i]);
         }
         else if (m_param->decodedPictureHashSEI == 3)
         {
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
-            for (int i = 0; i < 3; i++)
+            for (int i = 0; i < planes; i++)
                 checksumFinish(m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
         }
 
@@ -678,41 +714,40 @@
     {
         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
         {
-            Frame *refpic = slice->m_refPicList[l][ref];
+            Frame *refpic = slice->m_refFrameList[l][ref];
             ATOMIC_DEC(&refpic->m_countRefEncoders);
         }
     }
 
-    int numTLD;
-    if (m_pool)
-        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
-    else
-        numTLD = 1;
-
     if (m_nr)
     {
-        /* Accumulate NR statistics from all worker threads */
-        for (int i = 0; i < numTLD; i++)
+        bool nrEnabled = (m_rce.newQp < QP_MAX_SPEC || !m_param->rc.vbvBufferSize) && (m_param->noiseReductionIntra || m_param->noiseReductionInter);
+
+        if (nrEnabled)
         {
-            NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
-            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
+            /* Accumulate NR statistics from all worker threads */
+            for (int i = 0; i < numTLD; i++)
             {
-                for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
-                    m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff];
-            
-                m_nr->count[cat] += nr->count[cat];
+                NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];

x265_1.8.tar.gz/source/encoder/framefilter.cpp -> x265_1.9.tar.gz/source/encoder/framefilter.cpp Changed

@@ -35,177 +35,486 @@
 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
 
-FrameFilter::FrameFilter()
-    : m_param(NULL)
-    , m_frame(NULL)
-    , m_frameEncoder(NULL)
-    , m_ssimBuf(NULL)
-{
-}
-
 void FrameFilter::destroy()
 {
-    if (m_param->bEnableSAO)
-        m_sao.destroy();
-
     X265_FREE(m_ssimBuf);
+
+    if (m_parallelFilter)
+    {
+        if (m_param->bEnableSAO)
+        {
+            for(int row = 0; row < m_numRows; row++)
+                m_parallelFilter[row].m_sao.destroy((row == 0 ? 1 : 0));
+        }
+
+        delete[] m_parallelFilter;
+        m_parallelFilter = NULL;
+    }
 }
 
-void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
+void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols)
 {
     m_param = top->m_param;
     m_frameEncoder = frame;
     m_numRows = numRows;
+    m_numCols = numCols;
     m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
     m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
     m_pad[0] = top->m_sps.conformanceWindow.rightOffset;
     m_pad[1] = top->m_sps.conformanceWindow.bottomOffset;
     m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
-    m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
-
-    if (m_param->bEnableSAO)
-        if (!m_sao.create(m_param))
-            m_param->bEnableSAO = 0;
+    m_lastHeight = (m_param->sourceHeight % g_maxCUSize) ? (m_param->sourceHeight % g_maxCUSize) : g_maxCUSize;
+    m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize;
 
     if (m_param->bEnableSsim)
         m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
+
+    m_parallelFilter = new ParallelFilter[numRows];
+
+    if (m_parallelFilter)
+    {
+        if (m_param->bEnableSAO)
+        {
+            for(int row = 0; row < numRows; row++)
+            {
+                if (!m_parallelFilter[row].m_sao.create(m_param, (row == 0 ? 1 : 0)))
+                    m_param->bEnableSAO = 0;
+                else
+                {
+                    if (row != 0)
+                        m_parallelFilter[row].m_sao.createFromRootNode(&m_parallelFilter[0].m_sao);
+                }
+
+            }
+        }
+
+        for(int row = 0; row < numRows; row++)
+        {
+            // Setting maximum bound information
+            m_parallelFilter[row].m_rowHeight = (row == numRows - 1) ? m_lastHeight : g_maxCUSize;
+            m_parallelFilter[row].m_row = row;
+            m_parallelFilter[row].m_rowAddr = row * numCols;
+            m_parallelFilter[row].m_frameFilter = this;
+
+            if (row > 0)
+                m_parallelFilter[row].m_prevRow = &m_parallelFilter[row - 1];
+        }
+    }
+
 }
 
 void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
 {
     m_frame = frame;
 
-    if (m_param->bEnableSAO)
-        m_sao.startSlice(frame, initState, qp);
+    // Reset Filter Data Struct
+    if (m_parallelFilter)
+    {
+        for(int row = 0; row < m_numRows; row++)
+        {
+            if (m_param->bEnableSAO)
+                m_parallelFilter[row].m_sao.startSlice(frame, initState, qp);
+
+            m_parallelFilter[row].m_lastCol.set(0);
+            m_parallelFilter[row].m_allowedCol.set(0);
+            m_parallelFilter[row].m_lastDeblocked.set(-1);
+            m_parallelFilter[row].m_encData = frame->m_encData;
+        }
+
+        // Reset SAO common statistics
+        if (m_param->bEnableSAO)
+            m_parallelFilter[0].m_sao.resetStats();
+    }
 }
 
-void FrameFilter::processRow(int row)
+/* restore original YUV samples to recon after SAO (if lossless) */
+static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx)
 {
-    ProfileScopeEvent(filterCTURow);
+    const int size = cu->m_log2CUSize[absPartIdx] - 2;
+    const uint32_t cuAddr = cu->m_cuAddr;
 
-#if DETAILED_CU_STATS
-    ScopedElapsedTime filterPerfScope(m_frameEncoder->m_cuStats.loopFilterElapsedTime);
-    m_frameEncoder->m_cuStats.countLoopFilter++;
-#endif
+    PicYuv* reconPic = frame.m_reconPic;
+    PicYuv* fencPic  = frame.m_fencPic;
 
-    if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
+    pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx);
+    pixel* src = fencPic->getLumaAddr(cuAddr, absPartIdx);
+
+    primitives.cu[size].copy_pp(dst, reconPic->m_stride, src, fencPic->m_stride);
+
+    if (cu->m_chromaFormat != X265_CSP_I400)
     {
-        processRowPost(row);
+        pixel* dstCb = reconPic->getCbAddr(cuAddr, absPartIdx);
+        pixel* srcCb = fencPic->getCbAddr(cuAddr, absPartIdx);
+        pixel* dstCr = reconPic->getCrAddr(cuAddr, absPartIdx);
+        pixel* srcCr = fencPic->getCrAddr(cuAddr, absPartIdx);
+
+        const int csp = fencPic->m_picCsp;
+        primitives.chroma[csp].cu[size].copy_pp(dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
+        primitives.chroma[csp].cu[size].copy_pp(dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
+    }
+}
+
+/* Original YUV restoration for CU in lossless coding */
+static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame)
+{
+    uint32_t absPartIdx = cuGeom.absPartIdx;
+    if (cu->m_cuDepth[absPartIdx] > cuGeom.depth)
+    {
+        for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++)
+        {
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
+                origCUSampleRestoration(cu, childGeom, frame);
+        }
         return;
     }
-    FrameData& encData = *m_frame->m_encData;
-    const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
-    const uint32_t lineStartCUAddr = row * numCols;
 
-    if (m_param->bEnableLoopFilter)
+    // restore original YUV samples
+    if (cu->m_tqBypass[absPartIdx])
+        restoreOrigLosslessYuv(cu, frame, absPartIdx);
+}
+
+void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col)
+{
+    // Copy SAO Top Reference Pixels
+    int ctuWidth  = g_maxCUSize;
+    const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_stride);
+
+    // Luma
+    memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth * sizeof(pixel));
+    X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
+
+    // Chroma
+    if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
+    {
+        ctuWidth  >>= m_sao.m_hChromaShift;
+
+        const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
+        const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
+        memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth * sizeof(pixel));
+        memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth * sizeof(pixel));
+
+        X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
+    }
+}
+
+void FrameFilter::ParallelFilter::processSaoUnitCu(SAOParam *saoParam, int col)
+{

x265_1.8.tar.gz/source/encoder/framefilter.h -> x265_1.9.tar.gz/source/encoder/framefilter.h Changed

@@ -29,6 +29,7 @@
 #include "frame.h"
 #include "deblock.h"
 #include "sao.h"
+#include "threadpool.h" // class BondedTaskGroup
 
 namespace X265_NS {
 // private x265 namespace
@@ -39,7 +40,7 @@
 struct ThreadLocalData;
 
 // Manages the processing of a single frame loopfilter
-class FrameFilter : public Deblock
+class FrameFilter
 {
 public:
 
@@ -50,24 +51,86 @@
     int           m_vChromaShift;
     int           m_pad[2];
 
-    SAO           m_sao;
     int           m_numRows;
+    int           m_numCols;
     int           m_saoRowDelay;
     int           m_lastHeight;
+    int           m_lastWidth;
     
-    void*         m_ssimBuf; /* Temp storage for ssim computation */
+    void*         m_ssimBuf;        /* Temp storage for ssim computation */
 
-    FrameFilter();
+#define MAX_PFILTER_CUS     (4) /* maximum CUs for every thread */
+    class ParallelFilter : public BondedTaskGroup, public Deblock
+    {
+    public:
+        uint32_t            m_rowHeight;
+        int                 m_row;
+        uint32_t            m_rowAddr;
+        FrameFilter*        m_frameFilter;
+        FrameData*          m_encData;
+        ParallelFilter*     m_prevRow;
+        SAO                 m_sao;
+        ThreadSafeInteger   m_lastCol;          /* The column that next to process */
+        ThreadSafeInteger   m_allowedCol;       /* The column that processed from Encode pipeline */
+        ThreadSafeInteger   m_lastDeblocked;   /* The column that finished all of Deblock stages  */
 
-    void init(Encoder *top, FrameEncoder *frame, int numRows);
+        ParallelFilter()
+            : m_rowHeight(0)
+            , m_row(0)
+            , m_rowAddr(0)
+            , m_frameFilter(NULL)
+            , m_encData(NULL)
+            , m_prevRow(NULL)
+        {
+        }
+
+        ~ParallelFilter()
+        { }
+
+        void processTasks(int workerThreadId);
+
+        // Apply SAO on a CU in current row
+        void processSaoUnitCu(SAOParam *saoParam, int col);
+
+        // Copy and Save SAO reference pixels for SAO Rdo decide
+        void copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col);
+
+        // Post-Process (Border extension)
+        void processPostCu(int col) const;
+
+        uint32_t getCUHeight() const
+        {
+            return m_rowHeight;
+        }
+
+    protected:
+
+        ParallelFilter operator=(const ParallelFilter&);
+    };
+
+    ParallelFilter*     m_parallelFilter;
+
+    FrameFilter()
+        : m_param(NULL)
+        , m_frame(NULL)
+        , m_frameEncoder(NULL)
+        , m_ssimBuf(NULL)
+        , m_parallelFilter(NULL)
+    {
+    }
+
+    uint32_t getCUWidth(int colNum) const
+    {
+        return (colNum == (int)m_numCols - 1) ? m_lastWidth : g_maxCUSize;
+    }
+
+    void init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols);
     void destroy();
 
     void start(Frame *pic, Entropy& initState, int qp);
 
     void processRow(int row);
-    void processRowPost(int row);
-    void processSao(int row);
-    uint32_t getCUHeight(int rowNum) const;
+    void processPostRow(int row);
 };
 }

x265_1.8.tar.gz/source/encoder/level.cpp -> x265_1.9.tar.gz/source/encoder/level.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -462,7 +463,7 @@
     {
         if (param->internalCsp != X265_CSP_I420)
         {
-            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input color space.\n",
+            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input chroma subsampling.\n",
                      profile, x265_source_csp_names[param->internalCsp]);
             return -1;
         }
@@ -472,7 +473,7 @@
     {
         if (param->internalCsp != X265_CSP_I420 && param->internalCsp != X265_CSP_I422)
         {
-            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input color space.\n",
+            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input chroma subsampling.\n",
                      profile, x265_source_csp_names[param->internalCsp]);
             return -1;
         }

x265_1.8.tar.gz/source/encoder/motion.cpp -> x265_1.9.tar.gz/source/encoder/motion.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -188,11 +189,12 @@
     satd = primitives.pu[partEnum].satd;
     sad_x3 = primitives.pu[partEnum].sad_x3;
     sad_x4 = primitives.pu[partEnum].sad_x4;
+
     chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
 
     /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
      * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
-    bChromaSATD = subpelRefine > 2 && chromaSatd;
+    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400);
     X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
 
     ctuAddr = _ctuAddr;
@@ -1214,8 +1216,11 @@
         const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
         const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
 
-        xFrac = qmv.x & ((1 << shiftHor) - 1);
-        yFrac = qmv.y & ((1 << shiftVer) - 1);
+        X265_CHECK((hshift == 0) || (hshift == 1), "hshift must be 0 or 1\n");
+        X265_CHECK((vshift == 0) || (vshift == 1), "vshift must be 0 or 1\n");
+
+        xFrac = qmv.x & (hshift ? 7 : 3);
+        yFrac = qmv.y & (vshift ? 7 : 3);
 
         if (!(yFrac | xFrac))
         {

x265_1.8.tar.gz/source/encoder/motion.h -> x265_1.9.tar.gz/source/encoder/motion.h Changed

x265_1.8.tar.gz/source/encoder/nal.cpp -> x265_1.9.tar.gz/source/encoder/nal.cpp Changed

x265_1.8.tar.gz/source/encoder/ratecontrol.cpp -> x265_1.9.tar.gz/source/encoder/ratecontrol.cpp Changed

@@ -23,6 +23,10 @@
  * For more information, contact us at license @ x265.com.
  *****************************************************************************/
 
+#if _MSC_VER
+#pragma warning(disable: 4127) // conditional expression is constant, yes I know
+#endif
+
 #include "common.h"
 #include "param.h"
 #include "frame.h"
@@ -142,6 +146,9 @@
     rce->expectedVbv = rce2Pass->expectedVbv;
     rce->blurredComplexity = rce2Pass->blurredComplexity;
     rce->sliceType = rce2Pass->sliceType;
+    rce->qpNoVbv = rce2Pass->qpNoVbv;
+    rce->newQp = rce2Pass->newQp;
+    rce->qRceq = rce2Pass->qRceq;
 }
 
 }  // end anonymous namespace
@@ -205,7 +212,7 @@
             m_rateFactorMaxDecrement = m_param->rc.rfConstant - m_param->rc.rfConstantMin;
     }
     m_isAbr = m_param->rc.rateControlMode != X265_RC_CQP && !m_param->rc.bStatRead;
-    m_2pass = m_param->rc.rateControlMode == X265_RC_ABR && m_param->rc.bStatRead;
+    m_2pass = (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0) && m_param->rc.bStatRead;
     m_bitrate = m_param->rc.bitrate * 1000;
     m_frameDuration = (double)m_param->fpsDenom / m_param->fpsNum;
     m_qp = m_param->rc.qp;
@@ -219,6 +226,7 @@
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
     m_rce2Pass = NULL;
     m_lastBsliceSatdCost = 0;
+    m_movingAvgSum = 0.0;
 
     // vbv initialization
     m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
@@ -444,6 +452,7 @@
                 CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
                 CMP_OPT_FIRST_PASS("keyint", m_param->keyframeMax);
                 CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
+                CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
 
                 if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
                 {
@@ -488,6 +497,12 @@
                  x265_log(m_param, X265_LOG_ERROR, "Rce Entries for 2 pass cannot be allocated\n");
                  return false;
             }
+            m_encOrder = X265_MALLOC(int, m_numEntries);
+            if (!m_encOrder)
+            {
+                x265_log(m_param, X265_LOG_ERROR, "Encode order for 2 pass cannot be allocated\n");
+                return false;
+            }
             /* init all to skipped p frames */
             for (int i = 0; i < m_numEntries; i++)
             {
@@ -504,22 +519,24 @@
             {
                 RateControlEntry *rce;
                 int frameNumber;
+                int encodeOrder;
                 char picType;
                 int e;
                 char *next;
-                double qpRc, qpAq;
+                double qpRc, qpAq, qNoVbv, qRceq;
                 next = strstr(p, ";");
                 if (next)
                     *next++ = 0;
-                e = sscanf(p, " in:%d ", &frameNumber);
+                e = sscanf(p, " in:%d out:%d", &frameNumber, &encodeOrder);
                 if (frameNumber < 0 || frameNumber >= m_numEntries)
                 {
                     x265_log(m_param, X265_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frameNumber, i);
                     return false;
                 }
-                rce = &m_rce2Pass[frameNumber];
-                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
-                       &picType, &qpRc, &qpAq, &rce->coeffBits,
+                rce = &m_rce2Pass[encodeOrder];
+                m_encOrder[frameNumber] = encodeOrder;
+                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
+                       &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
                        &rce->skipCuCount);
                 rce->keptAsRef = true;
@@ -538,13 +555,16 @@
                     x265_log(m_param, X265_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
                     return false;
                 }
-                rce->qScale = x265_qp2qScale(qpRc);
+                rce->qScale = rce->newQScale = x265_qp2qScale(qpRc);
                 totalQpAq += qpAq;
+                rce->qpNoVbv = qNoVbv;
+                rce->qpaRc = qpRc;
+                rce->qpAq = qpAq;
+                rce->qRceq = qRceq;
                 p = next;
             }
             X265_FREE(statsBuf);
-
-            if (m_param->rc.rateControlMode == X265_RC_ABR)
+            if (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0)
             {
                 if (!initPass2())
                     return false;
@@ -627,11 +647,8 @@
 
     #undef MAX_DURATION
 }
-
-bool RateControl::initPass2()
+bool RateControl::analyseABR2Pass(int startIndex, int endIndex, uint64_t allAvailableBits)
 {
-    uint64_t allConstBits = 0;
-    uint64_t allAvailableBits = uint64_t(m_param->rc.bitrate * 1000. * m_numEntries * m_frameDuration);
     double rateFactor, stepMult;
     double qBlur = m_param->rc.qblur;
     double cplxBlur = m_param->rc.complexityBlur;
@@ -640,30 +657,19 @@
     double *qScale, *blurredQscale;
     double baseCplx = m_ncu * (m_param->bframes ? 120 : 80);
     double clippedDuration = CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION;
-
-    /* find total/average complexity & const_bits */
-    for (int i = 0; i < m_numEntries; i++)
-        allConstBits += m_rce2Pass[i].miscBits;
-
-    if (allAvailableBits < allConstBits)
-    {
-        x265_log(m_param, X265_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n",
-                 (int)(allConstBits * m_fps / m_numEntries * 1000.));
-        return false;
-    }
-
+    int framesCount = endIndex - startIndex + 1;
     /* Blur complexities, to reduce local fluctuation of QP.
      * We don't blur the QPs directly, because then one very simple frame
      * could drag down the QP of a nearby complex frame and give it more
      * bits than intended. */
-    for (int i = 0; i < m_numEntries; i++)
+    for (int i = startIndex; i <= endIndex; i++)
     {
         double weightSum = 0;
         double cplxSum = 0;
         double weight = 1.0;
         double gaussianWeight;
         /* weighted average of cplx of future frames */
-        for (int j = 1; j < cplxBlur * 2 && j < m_numEntries - i; j++)
+        for (int j = 1; j < cplxBlur * 2 && j <= endIndex - i; j++)
         {
             RateControlEntry *rcj = &m_rce2Pass[i + j];
             weight *= 1 - pow(rcj->iCuCount / m_ncu, 2);
@@ -687,11 +693,10 @@
         }
         m_rce2Pass[i].blurredComplexity = cplxSum / weightSum;
     }
-
-    CHECKED_MALLOC(qScale, double, m_numEntries);
+    CHECKED_MALLOC(qScale, double, framesCount);
     if (filterSize > 1)
     {
-        CHECKED_MALLOC(blurredQscale, double, m_numEntries);
+        CHECKED_MALLOC(blurredQscale, double, framesCount);
     }
     else
         blurredQscale = qScale;
@@ -702,9 +707,8 @@
      * because qscale2bits is not invertible, but we can start with the simple
      * approximation of scaling the 1st pass by the ratio of bitrates.
      * The search range is probably overkill, but speed doesn't matter here. */
-
     expectedBits = 1;
-    for (int i = 0; i < m_numEntries; i++)
+    for (int i = startIndex; i <= endIndex; i++)
     {
         RateControlEntry* rce = &m_rce2Pass[i];
         double q = getQScale(rce, 1.0);
@@ -781,12 +785,10 @@
     X265_FREE(qScale);
     if (filterSize > 1)
         X265_FREE(blurredQscale);
-
     if (m_isVbv)
-        if (!vbv2Pass(allAvailableBits))
+    if (!vbv2Pass(allAvailableBits, endIndex, startIndex))
             return false;
-    expectedBits = countExpectedBits();
-
+    expectedBits = countExpectedBits(startIndex, endIndex);
     if (fabs(expectedBits / allAvailableBits - 1.0) > 0.01)
     {
         double avgq = 0;
@@ -819,7 +821,123 @@
     return false;
 }

x265_1.8.tar.gz/source/encoder/ratecontrol.h -> x265_1.9.tar.gz/source/encoder/ratecontrol.h Changed

@@ -48,6 +48,7 @@
 
 struct Predictor
 {
+    double coeffMin;
     double coeff;
     double count;
     double decay;
@@ -74,6 +75,7 @@
     double  qpaRc;
     double  qpAq;
     double  qRceq;
+    double  qpPrev;
     double  frameSizePlanned;  /* frame Size decided by RateCotrol before encoding the frame */
     double  bufferRate;
     double  movingAvgSum;
@@ -167,6 +169,8 @@
     int64_t m_satdCostWindow[50];
     int64_t m_encodedBitsWindow[50];
     int     m_sliderPos;
+    int64_t m_lastRemovedSatdCost;
+    double  m_movingAvgSum;
 
     /* To detect a pattern of low detailed static frames in single pass ABR using satdcosts */
     int64_t m_lastBsliceSatdCost;
@@ -205,8 +209,8 @@
     double  m_lastAccumPNorm;
     double  m_expectedBitsSum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
     int64_t m_predictedBits;
+    int     *m_encOrder;
     RateControlEntry* m_rce2Pass;
-
     struct
     {
         uint16_t *qpBuffer[2]; /* Global buffers for converting MB-tree quantizer data. */
@@ -258,11 +262,12 @@
     void   checkAndResetABR(RateControlEntry* rce, bool isFrameDone);
     double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits);
     bool   initPass2();
+    bool   analyseABR2Pass(int startPoc, int endPoc, uint64_t allAvailableBits);
     void   initFramePredictors();
     double getDiffLimitedQScale(RateControlEntry *rce, double q);
-    double countExpectedBits();
-    bool   vbv2Pass(uint64_t allAvailableBits);
-    bool   findUnderflow(double *fills, int *t0, int *t1, int over);
+    double countExpectedBits(int startPos, int framesCount);
+    bool   vbv2Pass(uint64_t allAvailableBits, int frameCount, int startPos);
+    bool   findUnderflow(double *fills, int *t0, int *t1, int over, int framesCount);
     bool   fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax);
 };
 }

x265_1.8.tar.gz/source/encoder/rdcost.h -> x265_1.9.tar.gz/source/encoder/rdcost.h Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -73,13 +74,18 @@
             qpCr = x265_clip3(QP_MIN, QP_MAX_SPEC, qp + slice.m_pps->chromaQpOffset[1]);
         }
 
-        int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
-        uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
-        m_chromaDistWeight[0] = lambdaOffset;
+        if (slice.m_sps->chromaFormatIdc == X265_CSP_I444)
+        {
+            int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
+            uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
+            m_chromaDistWeight[0] = lambdaOffset;
 
-        chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
-        lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
-        m_chromaDistWeight[1] = lambdaOffset;
+            chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
+            lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
+            m_chromaDistWeight[1] = lambdaOffset;
+        }
+        else
+            m_chromaDistWeight[0] = m_chromaDistWeight[1] = 256;
     }
 
     void setLambda(double lambda2, double lambda)
@@ -88,9 +94,9 @@
         m_lambda = (uint64_t)floor(256.0 * lambda);
     }
 
-    inline uint64_t calcRdCost(sse_ret_t distortion, uint32_t bits) const
+    inline uint64_t calcRdCost(sse_t distortion, uint32_t bits) const
     {
-#if X265_DEPTH <= 10
+#if X265_DEPTH < 10
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
                    "calcRdCost wrap detected dist: %u, bits %u, lambda: " X265_LL "\n",
                    distortion, bits, m_lambda2);
@@ -108,15 +114,18 @@
         return primitives.cu[size].psy_cost_pp(source, sstride, recon, rstride);
     }
 
-    /* return the difference in energy between the source block and the recon block */
-    inline int psyCost(int size, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) const
-    {
-        return primitives.cu[size].psy_cost_ss(source, sstride, recon, rstride);
-    }
-
     /* return the RD cost of this prediction, including the effect of psy-rd */
-    inline uint64_t calcPsyRdCost(sse_ret_t distortion, uint32_t bits, uint32_t psycost) const
+    inline uint64_t calcPsyRdCost(sse_t distortion, uint32_t bits, uint32_t psycost) const
     {
+#if X265_DEPTH < 10
+        X265_CHECK((bits <= (UINT64_MAX / m_lambda2)) && (psycost <= UINT64_MAX / (m_lambda * m_psyRd)),
+                   "calcPsyRdCost wrap detected dist: %u, bits: %u, lambda: " X265_LL ", lambda2: " X265_LL "\n",
+                   distortion, bits, m_lambda, m_lambda2);
+#else
+        X265_CHECK((bits <= (UINT64_MAX / m_lambda2)) && (psycost <= UINT64_MAX / (m_lambda * m_psyRd)),
+                   "calcPsyRdCost wrap detected dist: " X265_LL ", bits: %u, lambda: " X265_LL ", lambda2: " X265_LL "\n",
+                   distortion, bits, m_lambda, m_lambda2);
+#endif
         return distortion + ((m_lambda * m_psyRd * psycost) >> 24) + ((bits * m_lambda2) >> 8);
     }
 
@@ -127,9 +136,9 @@
         return sadCost + ((bits * m_lambda + 128) >> 8);
     }
 
-    inline sse_ret_t scaleChromaDist(uint32_t plane, sse_ret_t dist) const
+    inline sse_t scaleChromaDist(uint32_t plane, sse_t dist) const
     {
-#if X265_DEPTH <= 10
+#if X265_DEPTH < 10
         X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
                    "scaleChromaDist wrap detected dist: %u, lambda: %u\n",
                    dist, m_chromaDistWeight[plane - 1]);
@@ -138,11 +147,13 @@
                    "scaleChromaDist wrap detected dist: " X265_LL " lambda: %u\n",
                    dist, m_chromaDistWeight[plane - 1]);
 #endif
-        return (sse_ret_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
+        return (sse_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
     }
 
     inline uint32_t getCost(uint32_t bits) const
     {
+        X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda,
+                   "getCost wrap detected bits: %u, lambda: " X265_LL "\n", bits, m_lambda);
         return (uint32_t)((bits * m_lambda + 128) >> 8);
     }
 };

x265_1.8.tar.gz/source/encoder/reference.cpp -> x265_1.9.tar.gz/source/encoder/reference.cpp Changed

x265_1.8.tar.gz/source/encoder/sao.cpp -> x265_1.9.tar.gz/source/encoder/sao.cpp Changed

@@ -73,9 +73,6 @@
 
 SAO::SAO()
 {
-    m_count = NULL;
-    m_offset = NULL;
-    m_offsetOrg = NULL;
     m_countPreDblk = NULL;
     m_offsetOrgPreDblk = NULL;
     m_refDepth = 0;
@@ -84,28 +81,22 @@
     m_param = NULL;
     m_clipTable = NULL;
     m_clipTableBase = NULL;
-    m_tmpU1[0] = NULL;
-    m_tmpU1[1] = NULL;
-    m_tmpU1[2] = NULL;
-    m_tmpU2[0] = NULL;
-    m_tmpU2[1] = NULL;
-    m_tmpU2[2] = NULL;
-    m_tmpL1 = NULL;
-    m_tmpL2 = NULL;
-
-    m_depthSaoRate[0][0] = 0;
-    m_depthSaoRate[0][1] = 0;
-    m_depthSaoRate[0][2] = 0;
-    m_depthSaoRate[0][3] = 0;
-    m_depthSaoRate[1][0] = 0;
-    m_depthSaoRate[1][1] = 0;
-    m_depthSaoRate[1][2] = 0;
-    m_depthSaoRate[1][3] = 0;
+    m_tmpU[0] = NULL;
+    m_tmpU[1] = NULL;
+    m_tmpU[2] = NULL;
+    m_tmpL1[0] = NULL;
+    m_tmpL1[1] = NULL;
+    m_tmpL1[2] = NULL;
+    m_tmpL2[0] = NULL;
+    m_tmpL2[1] = NULL;
+    m_tmpL2[2] = NULL;
+    m_depthSaoRate = NULL;
 }
 
-bool SAO::create(x265_param* param)
+bool SAO::create(x265_param* param, int initCommon)
 {
     m_param = param;
+    m_chromaFormat = param->internalCsp;
     m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp);
     m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp);
 
@@ -116,37 +107,56 @@
     const pixel rangeExt = maxY >> 1;
     int numCtu = m_numCuInWidth * m_numCuInHeight;
 
-    CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
-
-    CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1);
-    CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1);
-
-    for (int i = 0; i < 3; i++)
+    for (int i = 0; i < (param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
     {
+        CHECKED_MALLOC(m_tmpL1[i], pixel, g_maxCUSize + 1);
+        CHECKED_MALLOC(m_tmpL2[i], pixel, g_maxCUSize + 1);
+
         // SAO asm code will read 1 pixel before and after, so pad by 2
-        CHECKED_MALLOC(m_tmpU1[i], pixel, m_param->sourceWidth + 2);
-        m_tmpU1[i] += 1;
-        CHECKED_MALLOC(m_tmpU2[i], pixel, m_param->sourceWidth + 2);
-        m_tmpU2[i] += 1;
+        // NOTE: m_param->sourceWidth+2 enough, to avoid condition check in copySaoAboveRef(), I alloc more up to 63 bytes in here
+        CHECKED_MALLOC(m_tmpU[i], pixel, m_numCuInWidth * g_maxCUSize + 2 + 32);
+        m_tmpU[i] += 1;
     }
 
-    CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
-    CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
-    CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);
-
-    CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
-    CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
-
-    m_clipTable = &(m_clipTableBase[rangeExt]);
-
-    for (int i = 0; i < rangeExt; i++)
-        m_clipTableBase[i] = 0;
+    if (initCommon)
+    {
+        CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
+        CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
+        CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);
+
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 0] = 0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 1] = 0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 2] = 0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 3] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 0] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 1] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 2] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 3] = 0;
+
+        CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
+        m_clipTable = &(m_clipTableBase[rangeExt]);
+
+        // Share with fast clip lookup table
+        if (initCommon)
+        {
+            for (int i = 0; i < rangeExt; i++)
+                m_clipTableBase[i] = 0;
 
-    for (int i = 0; i < maxY; i++)
-        m_clipTable[i] = (pixel)i;
+            for (int i = 0; i < maxY; i++)
+                m_clipTable[i] = (pixel)i;
 
-    for (int i = maxY; i < maxY + rangeExt; i++)
-        m_clipTable[i] = maxY;
+            for (int i = maxY; i < maxY + rangeExt; i++)
+                m_clipTable[i] = maxY;
+        }
+    }
+    else
+    {
+        // must initialize these common pointer outside of function
+        m_countPreDblk = NULL;
+        m_offsetOrgPreDblk = NULL;
+        m_clipTableBase = NULL;
+        m_clipTable = NULL;
+    }
 
     return true;
 
@@ -154,34 +164,61 @@
     return false;
 }
 
-void SAO::destroy()
+void SAO::createFromRootNode(SAO* root)
 {
-    X265_FREE(m_clipTableBase);
-
-    X265_FREE(m_tmpL1);
-    X265_FREE(m_tmpL2);
+    X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk");
+    X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk");
+    X265_CHECK(m_depthSaoRate == NULL, "duplicate initialize on m_depthSaoRate");
+    X265_CHECK(m_clipTableBase == NULL, "duplicate initialize on m_clipTableBase");
+    X265_CHECK(m_clipTable == NULL, "duplicate initialize on m_clipTable");
+
+    m_countPreDblk = root->m_countPreDblk;
+    m_offsetOrgPreDblk = root->m_offsetOrgPreDblk;
+    m_depthSaoRate = root->m_depthSaoRate;
+    m_clipTableBase = root->m_clipTableBase; // Unnecessary
+    m_clipTable = root->m_clipTable;
+}
 
+void SAO::destroy(int destoryCommon)
+{
     for (int i = 0; i < 3; i++)
     {
-        if (m_tmpU1[i]) X265_FREE(m_tmpU1[i] - 1);
-        if (m_tmpU2[i]) X265_FREE(m_tmpU2[i] - 1);
+        if (m_tmpL1[i])
+        {
+            X265_FREE(m_tmpL1[i]);
+            m_tmpL1[i] = NULL;
+        }
+
+        if (m_tmpL2[i])
+        {
+            X265_FREE(m_tmpL2[i]);
+            m_tmpL2[i] = NULL;
+        }
+
+        if (m_tmpU[i])
+        {
+            X265_FREE(m_tmpU[i] - 1);
+            m_tmpU[i] = NULL;
+        }
     }
 
-    X265_FREE(m_count);
-    X265_FREE(m_offset);
-    X265_FREE(m_offsetOrg);
-    X265_FREE(m_countPreDblk);
-    X265_FREE(m_offsetOrgPreDblk);
+    if (destoryCommon)
+    {
+        X265_FREE_ZERO(m_countPreDblk);
+        X265_FREE_ZERO(m_offsetOrgPreDblk);
+        X265_FREE_ZERO(m_depthSaoRate);
+        X265_FREE_ZERO(m_clipTableBase);
+    }
 }
 
 /* allocate memory for SAO parameters */
 void SAO::allocSaoParam(SAOParam* saoParam) const
 {
+    int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;

x265_1.8.tar.gz/source/encoder/sao.h -> x265_1.9.tar.gz/source/encoder/sao.h Changed

@@ -62,6 +62,7 @@
     enum { NUM_EDGETYPE = 5 };
     enum { NUM_PLANE = 3 };
     enum { NUM_MERGE_MODE = 3 };
+    enum { SAO_DEPTHRATE_SIZE = 4 };
 
     static const uint32_t s_eoTable[NUM_EDGETYPE];
 
@@ -71,18 +72,19 @@
 protected:
 
     /* allocated per part */
-    PerClass*   m_count;
-    PerClass*   m_offset;
-    PerClass*   m_offsetOrg;
+    PerPlane    m_count;
+    PerPlane    m_offset;
+    PerPlane    m_offsetOrg;
 
     /* allocated per CTU */
     PerPlane*   m_countPreDblk;
     PerPlane*   m_offsetOrgPreDblk;
 
-    double      m_depthSaoRate[2][4];
-    int8_t      m_offsetBo[SAO_NUM_BO_CLASSES];
-    int8_t      m_offsetEo[NUM_EDGETYPE];
+    double*     m_depthSaoRate;
+    int8_t      m_offsetBo[NUM_PLANE][SAO_NUM_BO_CLASSES];
+    int8_t      m_offsetEo[NUM_PLANE][NUM_EDGETYPE];
 
+    int         m_chromaFormat;
     int         m_numCuInWidth;
     int         m_numCuInHeight;
     int         m_hChromaShift;
@@ -91,10 +93,9 @@
     pixel*      m_clipTable;
     pixel*      m_clipTableBase;
 
-    pixel*      m_tmpU1[3];
-    pixel*      m_tmpU2[3];
-    pixel*      m_tmpL1;
-    pixel*      m_tmpL2;
+    pixel*      m_tmpU[3];
+    pixel*      m_tmpL1[3];
+    pixel*      m_tmpL2[3];
 
 public:
 
@@ -119,8 +120,9 @@
 
     SAO();
 
-    bool create(x265_param* param);
-    void destroy();
+    bool create(x265_param* param, int initCommon);
+    void createFromRootNode(SAO *root);
+    void destroy(int destoryCommon);
 
     void allocSaoParam(SAOParam* saoParam) const;
 
@@ -131,6 +133,8 @@
     // CTU-based SAO process without slice granularity
     void processSaoCu(int addr, int typeIdx, int plane);
     void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
+    void processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX);
+    void processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX);
 
     void copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc);
 
@@ -146,6 +150,9 @@
 
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
     void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
+    void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
+
+    friend class FrameFilter;
 };
 
 }

x265_1.8.tar.gz/source/encoder/search.cpp -> x265_1.9.tar.gz/source/encoder/search.cpp Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -80,7 +81,7 @@
     m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
 
     bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder);
-    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
         ok &= m_quant.allocNoiseReduction(param);
 
     ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
@@ -97,13 +98,27 @@
      * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
      * which are reconstructed at each depth are valid. At the end, the transform depth table
      * is walked and the coeff and recon at the correct depths are collected */
-    for (uint32_t i = 0; i <= m_numLayers; i++)
+
+    if (param.internalCsp != X265_CSP_I400)
+    {
+        for (uint32_t i = 0; i <= m_numLayers; i++)
+        {
+            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
+            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
+            m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
+            ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
+            ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
+        }
+    }
+    else
     {
-        CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
-        m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
-        m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
-        ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
-        ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
+        for (uint32_t i = 0; i <= m_numLayers; i++)
+        {
+            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
+            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
+            ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
+            ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
+        }
     }
 
     /* the rest of these buffers are indexed per-depth */
@@ -116,12 +131,22 @@
         ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
     }
 
-    CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
-    m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
-    m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
-    CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
-    m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
-    m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
+    if (param.internalCsp != X265_CSP_I400)
+    {
+        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
+        m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
+        m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
+        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
+        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
+        m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
+    }
+    else
+    {
+        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
+        m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
+        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
+        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
+    }
 
     CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
     m_fencScaled = m_intraPred + 32 * 32;
@@ -163,12 +188,12 @@
     X265_FREE(m_tsRecon);
 }
 
-int Search::setLambdaFromQP(const CUData& ctu, int qp)
+int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp)
 {
     X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");
 
     m_me.setQP(qp);
-    m_rdCost.setQP(*m_slice, qp);
+    m_rdCost.setQP(*m_slice, lambdaQp < 0 ? qp : lambdaQp);
 
     int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
     m_quant.setQPforQuant(ctu, quantQP);
@@ -446,8 +471,9 @@
     }
 
     // set reconstruction for next intra prediction blocks if full TU prediction won
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
-    intptr_t picStride = m_frame->m_reconPic->m_stride;
+    PicYuv*  reconPic = m_frame->m_reconPic;
+    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+    intptr_t picStride = reconPic->m_stride;
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
 
     outCost.rdcost     += fullCost.rdcost;
@@ -530,7 +556,7 @@
             // no residual coded, recon = pred
             primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
 
-        sse_ret_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
+        sse_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
 
         cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
         cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
@@ -611,8 +637,9 @@
     }
 
     // set reconstruction for next intra prediction blocks
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
-    intptr_t picStride = m_frame->m_reconPic->m_stride;
+    PicYuv*  reconPic = m_frame->m_reconPic;
+    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+    intptr_t picStride = reconPic->m_stride;
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
 
     outCost.rdcost += fullCost.rdcost;
@@ -661,8 +688,9 @@
         uint32_t sizeIdx   = log2TrSize - 2;
         primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
 
-        pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
-        intptr_t picStride = m_frame->m_reconPic->m_stride;
+        PicYuv*  reconPic = m_frame->m_reconPic;
+        pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+        intptr_t picStride = reconPic->m_stride;
 
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
         if (numSig)
@@ -750,7 +778,7 @@
 }
 
 /* returns distortion */
-uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
+void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
 {
     CUData& cu = mode.cu;
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
@@ -758,10 +786,10 @@
     if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
         uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
-        uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
+        uint32_t splitCbfU = 0, splitCbfV = 0;
         for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
+            codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, outCost);
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
@@ -770,8 +798,7 @@
             cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
             cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
         }
-
-        return outDist;
+        return;
     }
 
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
@@ -780,7 +807,7 @@
     {
         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
         if (absPartIdx & 3)
-            return 0;
+            return;
         log2TrSizeC = 2;
         tuDepthC--;
     }
@@ -791,13 +818,15 @@
     bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
     checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
     if (checkTransformSkip)
-        return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
+    {
+        codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost);
+        return;
+    }
 
     ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
     uint32_t qtLayer = log2TrSize - 2;
     uint32_t stride = mode.fencYuv->m_csize;
     const uint32_t sizeIdxC = log2TrSizeC - 2;
-    sse_ret_t outDist = 0;
 
     uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
@@ -821,8 +850,9 @@

x265_1.8.tar.gz/source/encoder/search.h -> x265_1.9.tar.gz/source/encoder/search.h Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -84,8 +85,14 @@
     MV       mvp;
     int      mvpIdx;
     int      ref;
-    uint32_t cost;
     int      bits;
+    uint32_t mvCost;
+    uint32_t cost;
+
+    MotionData()
+    {
+        memset(this, 0, sizeof(MotionData));
+    }
 };
 
 struct Mode
@@ -105,16 +112,17 @@
     // temporal candidate.
     InterNeighbourMV interNeighbours[6];
 
-    uint64_t   rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
-    uint64_t   sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
-    uint32_t   sa8dBits;   // signal bits used in sa8dCost calculation
-    uint32_t   psyEnergy;  // sum of partition psycho-visual energy difference
-    sse_ret_t  lumaDistortion;
-    sse_ret_t  chromaDistortion;
-    sse_ret_t  distortion; // sum of partition SSE distortion
-    uint32_t   totalBits;  // sum of partition bits (mv + coeff)
-    uint32_t   mvBits;     // Mv bits + Ref + block type (or intra mode)
-    uint32_t   coeffBits;  // Texture bits (DCT Coeffs)
+    uint64_t    rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
+    uint64_t    sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
+    uint32_t    sa8dBits;   // signal bits used in sa8dCost calculation
+    uint32_t    psyEnergy;  // sum of partition psycho-visual energy difference
+    sse_t   resEnergy;  // sum of partition residual energy after motion prediction
+    sse_t   lumaDistortion;
+    sse_t   chromaDistortion;
+    sse_t  distortion; // sum of partition SSE distortion
+    uint32_t    totalBits;  // sum of partition bits (mv + coeff)
+    uint32_t    mvBits;     // Mv bits + Ref + block type (or intra mode)
+    uint32_t    coeffBits;  // Texture bits (DCT Coeffs)
 
     void initCosts()
     {
@@ -122,6 +130,7 @@
         sa8dCost = 0;
         sa8dBits = 0;
         psyEnergy = 0;
+        resEnergy = 0;
         lumaDistortion = 0;
         chromaDistortion = 0;
         distortion = 0;
@@ -130,62 +139,13 @@
         coeffBits = 0;
     }
 
-    void invalidate()
-    {
-        /* set costs to invalid data, catch uninitialized re-use */
-        rdCost = UINT64_MAX / 2;
-        sa8dCost = UINT64_MAX / 2;
-        sa8dBits = MAX_UINT / 2;
-        psyEnergy = MAX_UINT / 2;
-#if X265_DEPTH <= 10
-        lumaDistortion = MAX_UINT / 2;
-        chromaDistortion = MAX_UINT / 2;
-        distortion = MAX_UINT / 2;
-#else
-        lumaDistortion = UINT64_MAX / 2;
-        chromaDistortion = UINT64_MAX / 2;
-        distortion = UINT64_MAX / 2;
-#endif
-        totalBits = MAX_UINT / 2;
-        mvBits = MAX_UINT / 2;
-        coeffBits = MAX_UINT / 2;
-    }
-
-    bool ok() const
-    {
-#if X265_DEPTH <= 10
-        return !(rdCost >= UINT64_MAX / 2 ||
-            sa8dCost >= UINT64_MAX / 2 ||
-            sa8dBits >= MAX_UINT / 2 ||
-            psyEnergy >= MAX_UINT / 2 ||
-            lumaDistortion >= MAX_UINT / 2 ||
-            chromaDistortion >= MAX_UINT / 2 ||
-            distortion >= MAX_UINT / 2 ||
-            totalBits >= MAX_UINT / 2 ||
-            mvBits >= MAX_UINT / 2 ||
-            coeffBits >= MAX_UINT / 2);
-#else
-        return !(rdCost >= UINT64_MAX / 2 ||
-                 sa8dCost >= UINT64_MAX / 2 ||
-                 sa8dBits >= MAX_UINT / 2 ||
-                 psyEnergy >= MAX_UINT / 2 ||
-                 lumaDistortion >= UINT64_MAX / 2 ||
-                 chromaDistortion >= UINT64_MAX / 2 ||
-                 distortion >= UINT64_MAX / 2 ||
-                 totalBits >= MAX_UINT / 2 ||
-                 mvBits >= MAX_UINT / 2 ||
-                 coeffBits >= MAX_UINT / 2);
-#endif
-    }
-
     void addSubCosts(const Mode& subMode)
     {
-        X265_CHECK(subMode.ok(), "sub-mode not initialized");
-
         rdCost += subMode.rdCost;
         sa8dCost += subMode.sa8dCost;
         sa8dBits += subMode.sa8dBits;
         psyEnergy += subMode.psyEnergy;
+        resEnergy += subMode.resEnergy;
         lumaDistortion += subMode.lumaDistortion;
         chromaDistortion += subMode.chromaDistortion;
         distortion += subMode.distortion;
@@ -325,13 +285,13 @@
     ~Search();
 
     bool     initSearch(const x265_param& param, ScalingList& scalingList);
-    int      setLambdaFromQP(const CUData& ctu, int qp); /* returns real quant QP in valid spec range */
+    int      setLambdaFromQP(const CUData& ctu, int qp, int lambdaQP = -1); /* returns real quant QP in valid spec range */
 
     // mark temp RD entropy contexts as uninitialized; useful for finding loads without stores
     void     invalidateContexts(int fromDepth);
 
-    // full RD search of intra modes. if sharedModes is not NULL, it directly uses them
-    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes);
+    // full RD search of intra modes
+    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSizes);
 
     // select best intra mode using only sa8d costs, cannot measure NxN intra
     void     checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
@@ -397,10 +357,10 @@
     void     saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth);
 
     // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
-    uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
+    sse_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2]);
 
     // RDO select best chroma mode from luma; result is fully encode chroma. chroma distortion is returned
-    uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes);
+    sse_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom);
 
     void     codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
     void     codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]);
@@ -410,12 +370,12 @@
     {
         uint64_t rdcost;
         uint32_t bits;
-        sse_ret_t distortion;
+        sse_t distortion;
         uint32_t energy;
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
     };
 
-    uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
+    uint64_t estimateNullCbfCost(sse_t dist, uint32_t psyEnergy, uint32_t tuDepth, TextType compId);
     void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
 
     // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
@@ -424,8 +384,8 @@
     void     extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx);
 
     // generate chroma prediction, generate residual and recon
-    uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
-    uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
+    void     codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost);
+    void     codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost);
     void     extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth);
 
     // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks

x265_1.8.tar.gz/source/encoder/sei.h -> x265_1.9.tar.gz/source/encoder/sei.h Changed

@@ -163,12 +163,6 @@
 
     PayloadType payloadType() const { return CONTENT_LIGHT_LEVEL_INFO; }
 
-    bool parse(const char* value)
-    {
-        return sscanf(value, "%hu,%hu",
-                      &max_content_light_level, &max_pic_average_light_level) == 2;
-    }
-
     void write(Bitstream& bs, const SPS&)
     {
         m_bitIf = &bs;
@@ -195,29 +189,31 @@
 
     uint8_t m_digest[3][16];
 
-    void write(Bitstream& bs, const SPS&)
+    void write(Bitstream& bs, const SPS& sps)
     {
         m_bitIf = &bs;
 
+        int planes = (sps.chromaFormatIdc != X265_CSP_I400) ? 3 : 1;
+
         WRITE_CODE(DECODED_PICTURE_HASH, 8, "payload_type");
 
         switch (m_method)
         {
         case MD5:
-            WRITE_CODE(1 + 16 * 3, 8, "payload_size");
+            WRITE_CODE(1 + 16 * planes, 8, "payload_size");
             WRITE_CODE(MD5, 8, "hash_type");
             break;
         case CRC:
-            WRITE_CODE(1 + 2 * 3, 8, "payload_size");
+            WRITE_CODE(1 + 2 * planes, 8, "payload_size");
             WRITE_CODE(CRC, 8, "hash_type");
             break;
         case CHECKSUM:
-            WRITE_CODE(1 + 4 * 3, 8, "payload_size");
+            WRITE_CODE(1 + 4 * planes, 8, "payload_size");
             WRITE_CODE(CHECKSUM, 8, "hash_type");
             break;
         }
 
-        for (int yuvIdx = 0; yuvIdx < 3; yuvIdx++)
+        for (int yuvIdx = 0; yuvIdx < planes; yuvIdx++)
         {
             if (m_method == MD5)
             {

x265_1.8.tar.gz/source/encoder/slicetype.cpp -> x265_1.9.tar.gz/source/encoder/slicetype.cpp Changed

@@ -83,8 +83,11 @@
     uint32_t var;
 
     var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
+    if (csp != X265_CSP_I400)
+    {
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
+    }
     x265_emms();
     return var;
 }
@@ -96,6 +99,7 @@
     int maxRow = curFrame->m_fencPic->m_picHeight;
     int blockCount = curFrame->m_lowres.maxBlocksInRow * curFrame->m_lowres.maxBlocksInCol;
 
+    float* quantOffsets = curFrame->m_quantOffsets;
     for (int y = 0; y < 3; y++)
     {
         curFrame->m_lowres.wp_ssd[y] = 0;
@@ -113,10 +117,21 @@
 
         if (param->rc.aqMode && param->rc.aqStrength == 0)
         {
-            memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
-            memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
-            for (int cuxy = 0; cuxy < cuCount; cuxy++)
-                curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
+            if (quantOffsets)
+            {
+                for (int cuxy = 0; cuxy < cuCount; cuxy++)
+                {
+                    curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
+                    curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
+                }
+            }
+            else
+            {
+                memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
+                memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
+                for (int cuxy = 0; cuxy < cuCount; cuxy++)
+                    curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
+            }
         }
 
         /* Need variance data for weighted prediction */
@@ -135,19 +150,25 @@
         if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
         {
             double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
+            curFrame->m_lowres.frameVariance = 0;
+            uint64_t rowVariance = 0;
             for (blockY = 0; blockY < maxRow; blockY += 16)
             {
+                rowVariance = 0;
                 for (blockX = 0; blockX < maxCol; blockX += 16)
                 {
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+                    curFrame->m_lowres.blockVariance[blockXY] = energy;
+                    rowVariance += energy;
                     qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
                     curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
                     avg_adj += qp_adj;
                     avg_adj_pow2 += qp_adj * qp_adj;
                     blockXY++;
                 }
+                curFrame->m_lowres.frameVariance += (rowVariance / maxCol);
             }
-
+            curFrame->m_lowres.frameVariance /= maxRow;
             avg_adj /= blockCount;
             avg_adj_pow2 /= blockCount;
             strength = param->rc.aqStrength * avg_adj;
@@ -177,6 +198,8 @@
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
                     qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
                 }
+                if (quantOffsets != NULL)
+                    qp_adj += quantOffsets[blockXY];
                 curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
                 curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
                 curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
@@ -328,7 +351,7 @@
 
         primitives.weight_pp(ref.buffer[0], wbuffer[0], stride, widthHeight, paddedLines,
             scale, round << correction, denom + correction, offset);
-        src = weightedRef.fpelPlane[0];
+        src = fenc.weightedRef[fenc.frameNum - ref.frameNum].fpelPlane[0];
     }
 
     uint32_t cost = 0;
@@ -350,7 +373,6 @@
 bool LookaheadTLD::allocWeightedRef(Lowres& fenc)
 {
     intptr_t planesize = fenc.buffer[1] - fenc.buffer[0];
-    intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];
     paddedLines = (int)(planesize / fenc.lumaStride);
 
     wbuffer[0] = X265_MALLOC(pixel, 4 * planesize);
@@ -363,14 +385,6 @@
     else
         return false;
 
-    for (int i = 0; i < 4; i++)
-        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;
-
-    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
-    weightedRef.lumaStride = fenc.lumaStride;
-    weightedRef.isLowres = true;
-    weightedRef.isWeighted = false;
-
     return true;
 }
 
@@ -388,6 +402,16 @@
             return;
     }
 
+    ReferencePlanes& weightedRef = fenc.weightedRef[deltaIndex];
+    intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];
+    for (int i = 0; i < 4; i++)
+        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;
+
+    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
+    weightedRef.lumaStride = fenc.lumaStride;
+    weightedRef.isLowres = true;
+    weightedRef.isWeighted = false;
+
     /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
     float guessScale, fencMean, refMean;
     x265_emms();
@@ -478,7 +502,13 @@
 
     m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_8x8Width - 2) * (m_8x8Height - 2) : m_8x8Width * m_8x8Height;
+    m_cuCount = m_8x8Width * m_8x8Height;
+    m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_cuCount + 4 - 2 * (m_8x8Width + m_8x8Height)) : m_cuCount;
+
+    /* Allow the strength to be adjusted via qcompress, since the two concepts
+     * are very similar. */
+
+    m_cuTreeStrength = 5.0 * (1.0 - m_param->rc.qCompress);
 
     m_lastKeyframe = -m_param->keyframeMax;
     m_sliceTypeBusy = false;
@@ -502,7 +532,16 @@
     m_bBatchFrameCosts = m_bBatchMotionSearch;
 
     if (m_param->lookaheadSlices && !m_pool)
+    {
+        x265_log(param, X265_LOG_WARNING, "No pools found; disabling lookahead-slices\n");
+        m_param->lookaheadSlices = 0;
+    }
+
+    if (m_param->lookaheadSlices && (m_param->sourceHeight < 720))
+    {
+        x265_log(param, X265_LOG_WARNING, "Source height < 720p; disabling lookahead-slices\n");
         m_param->lookaheadSlices = 0;
+    }
 
     if (m_param->lookaheadSlices > 1)
     {
@@ -715,16 +754,16 @@
 
     case P_SLICE:
         b = p1 = poc - l0poc;
-        frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
+        frames[p0] = &slice->m_refFrameList[0][0]->m_lowres;
         frames[b] = &curFrame->m_lowres;
         break;
 
     case B_SLICE:
         b = poc - l0poc;
         p1 = b + l1poc - poc;
-        frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
+        frames[p0] = &slice->m_refFrameList[0][0]->m_lowres;
         frames[b] = &curFrame->m_lowres;
-        frames[p1] = &slice->m_refPicList[1][0]->m_lowres;
+        frames[p1] = &slice->m_refFrameList[1][0]->m_lowres;
         break;
 
     default:
@@ -736,10 +775,13 @@
     if (m_param->rc.cuTree && !m_param->rc.bStatRead)
         /* update row satds based on cutree offsets */
         curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
-    else if (m_param->rc.aqMode)
-        curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
-    else
-        curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
+    else if (m_param->analysisMode != X265_ANALYSIS_LOAD)
+    {
+        if (m_param->rc.aqMode)
+            curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
+        else
+            curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
+    }

x265_1.8.tar.gz/source/encoder/slicetype.h -> x265_1.9.tar.gz/source/encoder/slicetype.h Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -44,7 +45,6 @@
 struct LookaheadTLD
 {
     MotionEstimate  me;
-    ReferencePlanes weightedRef;
     pixel*          wbuffer[4];
     int             widthInCU;
     int             heightInCU;
@@ -103,29 +103,30 @@
     PicList       m_outputQueue;     // pictures to be encoded, in encode order
     Lock          m_inputLock;
     Lock          m_outputLock;
-
-    /* pre-lookahead */
-    int           m_fullQueueSize;
-    bool          m_isActive;
-    bool          m_sliceTypeBusy;
-    bool          m_bAdaptiveQuant;
-    bool          m_outputSignalRequired;
-    bool          m_bBatchMotionSearch;
-    bool          m_bBatchFrameCosts;
     Event         m_outputSignal;
-
     LookaheadTLD* m_tld;
     x265_param*   m_param;
     Lowres*       m_lastNonB;
     int*          m_scratch;         // temp buffer for cutree propagate
-    
+
+    /* pre-lookahead */
+    int           m_fullQueueSize;
     int           m_histogram[X265_BFRAME_MAX + 1];
     int           m_lastKeyframe;
     int           m_8x8Width;
     int           m_8x8Height;
     int           m_8x8Blocks;
+    int           m_cuCount;
     int           m_numCoopSlices;
     int           m_numRowsPerSlice;
+    double        m_cuTreeStrength;
+
+    bool          m_isActive;
+    bool          m_sliceTypeBusy;
+    bool          m_bAdaptiveQuant;
+    bool          m_outputSignalRequired;
+    bool          m_bBatchMotionSearch;
+    bool          m_bBatchFrameCosts;
     bool          m_filled;
     bool          m_isSceneTransition;
     Lookahead(x265_param *param, ThreadPool *pool);

x265_1.8.tar.gz/source/encoder/weightPrediction.cpp -> x265_1.9.tar.gz/source/encoder/weightPrediction.cpp Changed

@@ -4,6 +4,7 @@
  * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
  *         Steve Borho <steve@borho.org>
  *         Kavitha Sampas <kavitha@multicorewareinc.com>
+ *         Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -259,13 +260,13 @@
     for (int list = 0; list < cache.numPredDir; list++)
     {
         WeightParam *weights = wp[list][0];
-        Frame *refFrame = slice.m_refPicList[list][0];
+        Frame *refFrame = slice.m_refFrameList[list][0];
         Lowres& refLowres = refFrame->m_lowres;
         int diffPoc = abs(curPoc - refFrame->m_poc);
 
         /* prepare estimates */
         float guessScale[3], fencMean[3], refMean[3];
-        for (int plane = 0; plane < 3; plane++)
+        for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
         {
             SET_WEIGHT(weights[plane], false, 1, 0, 0);
             uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
@@ -289,7 +290,7 @@
 
         MV *mvs = NULL;
 
-        for (int plane = 0; plane < 3; plane++)
+        for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
         {
             denom = plane ? chromaDenom : lumaDenom;
             if (plane && !weights[0].bPresentFlag)
@@ -328,7 +329,7 @@
                 {
                     /* reference chroma planes must be extended prior to being
                      * used as motion compensation sources */
-                    if (!refFrame->m_bChromaExtended)
+                    if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400)
                     {
                         refFrame->m_bChromaExtended = true;
                         PicYuv *refPic = refFrame->m_fencPic;

x265_1.8.tar.gz/source/output/y4m.cpp -> x265_1.9.tar.gz/source/output/y4m.cpp Changed

x265_1.8.tar.gz/source/output/yuv.cpp -> x265_1.9.tar.gz/source/output/yuv.cpp Changed

x265_1.8.tar.gz/source/profile/vtune/CMakeLists.txt -> x265_1.9.tar.gz/source/profile/vtune/CMakeLists.txt Changed

x265_1.8.tar.gz/source/profile/vtune/vtune.cpp -> x265_1.9.tar.gz/source/profile/vtune/vtune.cpp Changed

x265_1.8.tar.gz/source/test/checkasm-a.asm -> x265_1.9.tar.gz/source/test/checkasm-a.asm Changed

x265_1.8.tar.gz/source/test/intrapredharness.cpp -> x265_1.9.tar.gz/source/test/intrapredharness.cpp Changed

x265_1.8.tar.gz/source/test/ipfilterharness.h -> x265_1.9.tar.gz/source/test/ipfilterharness.h Changed

x265_1.8.tar.gz/source/test/pixelharness.cpp -> x265_1.9.tar.gz/source/test/pixelharness.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -41,6 +42,7 @@
         int_test_buff[0][i]     = rand() % SHORT_MAX;
         ushort_test_buff[0][i]  = rand() % ((1 << 16) - 1);
         uchar_test_buff[0][i]   = rand() % ((1 << 8) - 1);
+        residual_test_buff[0][i] = (rand() % (2 * RMAX + 1)) - RMAX - 1;// For sse_ss only
 
         pixel_test_buff[1][i]   = PIXEL_MIN;
         short_test_buff[1][i]   = SMIN;
@@ -49,6 +51,7 @@
         int_test_buff[1][i]     = SHORT_MIN;
         ushort_test_buff[1][i]  = PIXEL_MIN;
         uchar_test_buff[1][i]   = PIXEL_MIN;
+        residual_test_buff[1][i] = RMIN;
 
         pixel_test_buff[2][i]   = PIXEL_MAX;
         short_test_buff[2][i]   = SMAX;
@@ -57,6 +60,7 @@
         int_test_buff[2][i]     = SHORT_MAX;
         ushort_test_buff[2][i]  = ((1 << 16) - 1);
         uchar_test_buff[2][i]   = 255;
+        residual_test_buff[2][i] = RMAX;
 
         pbuf1[i] = rand() & PIXEL_MAX;
         pbuf2[i] = rand() & PIXEL_MAX;
@@ -103,8 +107,8 @@
     {
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
-        sse_ret_t vres = (sse_ret_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
-        sse_ret_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+        sse_t vres = (sse_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+        sse_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
         if (vres != cres)
             return false;
 
@@ -124,8 +128,8 @@
     {
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
-        sse_ret_t vres = (sse_ret_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
-        sse_ret_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
+        sse_t vres = (sse_t)checked(opt, residual_test_buff[index1], stride, residual_test_buff[index2] + j, stride);
+        sse_t cres = ref(residual_test_buff[index1], stride, residual_test_buff[index2] + j, stride);
         if (vres != cres)
             return false;
 
@@ -227,8 +231,8 @@
     {
         // NOTE: stride must be multiple of 16, because minimum block is 4x4
         int stride = (STRIDE + (rand() % STRIDE)) & ~15;
-        int cres = ref(sbuf1 + j, stride);
-        int vres = (int)checked(opt, sbuf1 + j, (intptr_t)stride);
+        sse_t cres = ref(sbuf1 + j, stride);
+        sse_t vres = (sse_t)checked(opt, sbuf1 + j, (intptr_t)stride);
 
         if (cres != vres)
             return false;
@@ -854,7 +858,7 @@
         int width = (rand() % 4) + 1; // range[1-4]
         float cres = ref(sum0, sum1, width);
         float vres = checked_float(opt, sum0, sum1, width);
-        if (fabs(vres - cres) > 0.00001)
+        if (fabs(vres - cres) > 0.0001)
             return false;
 
         reportfail();
@@ -1061,8 +1065,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5);
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
 
         if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
             return false;
@@ -1097,8 +1101,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
 
         if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
             return false;
@@ -1141,8 +1145,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5);
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
 
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
             || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
@@ -1193,8 +1197,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
 
         // TODO: don't check upBuff*, the latest output pixels different, and can move into stack temporary buffer in future
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
@@ -1244,8 +1248,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+        ref(sbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
 
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
             || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
@@ -1295,8 +1299,8 @@
 
     memset(ref_dest, 0xCD, sizeof(ref_dest));
     memset(opt_dest, 0xCD, sizeof(opt_dest));
-    int width = 32 + rand() % 32;
-    int height = 32 + rand() % 32;
+    int width = 32 + (rand() % 32);
+    int height = 32 + (rand() % 32);
     intptr_t srcStride = 64;
     intptr_t dstStride = width;
     int j = 0;
@@ -1304,11 +1308,23 @@
     for (int i = 0; i < ITERS; i++)
     {
         int index = i % TEST_CASES;
+
         checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
         ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
 
-        if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel)))
+        if (memcmp(ref_dest, opt_dest, dstStride * height * sizeof(pixel)))
+        {
+            memcpy(opt_dest, ref_dest, sizeof(ref_dest));
+            opt(ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
             return false;
+        }
+
+        // check tail memory area
+        for(int x = width; x < dstStride; x++)
+        {
+            if (opt_dest[(height - 1 * dstStride) + x] != 0xCD)
+                return false;
+        }
 
         reportfail();
         j += INCR;
@@ -1340,6 +1356,13 @@
         if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
             return false;
 
+        // check tail memory area
+        for(int x = width; x < dstStride; x++)
+        {
+            if (opt_dest[(height - 1 * dstStride) + x] != 0xCD)
+                return false;
+        }
+
         reportfail();
         j += INCR;
     }
@@ -1356,16 +1379,16 @@
     memset(opt_dest, 0xCD, sizeof(opt_dest));
 
     double fps = 1.0;
-    int width = 16 + rand() % 64;
     int j = 0;
 
     for (int i = 0; i < ITERS; i++)
     {
+        int width = 16 + rand() % 64;
         int index = i % TEST_CASES;
         checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
         ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
 
-        if (memcmp(ref_dest, opt_dest, width * sizeof(pixel)))
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
             return false;
 
         reportfail();
@@ -1397,28 +1420,6 @@
     return true;

x265_1.8.tar.gz/source/test/pixelharness.h -> x265_1.9.tar.gz/source/test/pixelharness.h Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -40,6 +41,8 @@
     enum { TEST_CASES = 3 };
     enum { SMAX = 1 << 12 };
     enum { SMIN = -1 << 12 };
+    enum { RMAX = PIXEL_MAX - PIXEL_MIN }; //The maximum value obtained by subtracting pixel values (residual max)
+    enum { RMIN = PIXEL_MIN - PIXEL_MAX }; //The minimum value obtained by subtracting pixel values (residual min)
 
     ALIGN_VAR_32(pixel, pbuf1[BUFFSIZE]);
     pixel    pbuf2[BUFFSIZE];
@@ -64,6 +67,7 @@
     uint16_t ushort_test_buff[TEST_CASES][BUFFSIZE];
     uint8_t  uchar_test_buff[TEST_CASES][BUFFSIZE];
     double   double_test_buff[TEST_CASES][BUFFSIZE];
+    int16_t  residual_test_buff[TEST_CASES][BUFFSIZE];
 
     bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
     bool check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt);
@@ -110,12 +114,15 @@
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
     bool check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt);
     bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
-    bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
     bool check_calSign(sign_t ref, sign_t opt);
     bool check_scanPosLast(scanPosLast_t ref, scanPosLast_t opt);
     bool check_findPosFirstLast(findPosFirstLast_t ref, findPosFirstLast_t opt);
     bool check_costCoeffNxN(costCoeffNxN_t ref, costCoeffNxN_t opt);
     bool check_costCoeffRemain(costCoeffRemain_t ref, costCoeffRemain_t opt);
+    bool check_costC1C2Flag(costC1C2Flag_t ref, costC1C2Flag_t opt);
+    bool check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt);
+    bool check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
+    bool check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
 
 public:

x265_1.8.tar.gz/source/test/regression-tests.txt -> x265_1.9.tar.gz/source/test/regression-tests.txt Changed

@@ -11,124 +11,132 @@
 # consistent across many machines, you must force a certain -FN so it is
 # not auto-detected.
 
+BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
+BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp --limit-modes
+BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190
 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
+BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 7000 --limit-modes,--preset medium --no-cutree --analysis-mode=load --bitrate 7000 --limit-modes
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0
-BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp
-BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
-BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1
+BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-mode=save --bitrate 7000,--preset slower --no-cutree --analysis-mode=load --bitrate 7000
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 7000 --tskip-fast,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
+Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-mode=save --bitrate 15000,--preset veryfast --no-cutree --analysis-mode=load --bitrate 15000
 Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
-Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slow --no-wpp --tune ssim --transfer smpte240m
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3
-DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
-FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
+FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
+FourPeople_1280x720_60.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+FourPeople_1280x720_60.y4m,--preset veryslow --numa-pools "none"
+Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
 Keiba_832x480_30.y4m,--preset medium --pmode --tune grain
 Keiba_832x480_30.y4m,--preset slower --fast-intra --nr-inter 500 -F4 --limit-refs 0
-Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
-Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
 Kimono1_1920x1080_24_10bit_444.yuv,--preset superfast --weightb
-KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
-KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0
-KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
+Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
 KristenAndSara_1280x720_60.y4m,--preset ultrafast --strong-intra-smoothing
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
+KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
+KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
+KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
-News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000
+News-4k.y4m,--preset ultrafast --no-cutree --analysis-mode=save --bitrate 15000,--preset ultrafast --no-cutree --analysis-mode=load --bitrate 15000
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
+News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
+OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
-OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
+ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
 ParkScene_1920x1080_24.y4m,--preset medium --qp 40 --rdpenalty 2 --tu-intra-depth 3
 ParkScene_1920x1080_24.y4m,--preset slower --no-weightp
-ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
+RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
 RaceHorses_416x240_30.y4m,--preset medium --tskip-fast --tskip
 RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0
-RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
 RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3
-RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra --limit-refs 1
-RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
-RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain
 RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr --limit-refs 1
 RaceHorses_416x240_30_10bit.yuv,--preset veryfast --weightb
+RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
+RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra --limit-refs 1
+RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain  --limit-modes
 RaceHorses_416x240_30_10bit.yuv,--preset placebo --limit-refs 1
 SteamLocomotiveTrain_2560x1600_60_10bit_crop.yuv,--preset medium --dither
-big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
-big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
-big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra
-big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0
 big_buck_bunny_360p24.y4m,--preset ultrafast --deblock=2
+big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0 --aq-mode 3
 big_buck_bunny_360p24.y4m,--preset veryfast --no-deblock
-city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
+big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
+big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
+big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra --aq-mode 3
 city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2
+city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
 city_4cif_60fps.y4m,--preset slower --scaling-list default
 city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra --limit-refs 0
-ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
+ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
+ducks_take_off_444_720p50.y4m,--preset superfast --weightp --limit-refs 2
 ducks_take_off_420_720p50.y4m,--preset faster --qp 24 --deblock -6 --limit-refs 2
+ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
 ducks_take_off_420_720p50.y4m,--preset medium --tskip --tskip-fast --constrained-intra
-ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
-ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
-ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
 ducks_take_off_444_720p50.y4m,--preset medium --qp 38 --no-scenecut
-ducks_take_off_444_720p50.y4m,--preset superfast --weightp --rd 0 --limit-refs 2
+ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
 ducks_take_off_444_720p50.y4m,--preset slower --psy-rd 1 --psy-rdoq 2.0 --rdoq-level 1 --limit-refs 1
+ducks_take_off_420_720p50.y4m,--preset slower --no-wpp
+ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
+mobile_calendar_422_ntsc.y4m,--preset superfast --weightp
 mobile_calendar_422_ntsc.y4m,--preset medium --bitrate 500 -F4
 mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast
-mobile_calendar_422_ntsc.y4m,--preset superfast --weightp --rd 0
 mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip --limit-refs 2
+old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
+old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16 --limit-modes
+old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim
 old_town_cross_444_720p50.y4m,--preset faster --rd 1 --tune zero-latency
+old_town_cross_444_720p50.y4m,--preset fast --no-cutree --analysis-mode=save --bitrate 3000 --early-skip,--preset fast --no-cutree --analysis-mode=load --bitrate 3000 --early-skip
 old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6
 old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid
 old_town_cross_444_720p50.y4m,--preset slower --crf 4 --cu-lossless
-old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16
-old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
-old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim
 parkrun_ter_720p50.y4m,--preset medium --no-open-gop --sao-non-deblock --crf 4 --cu-lossless
 parkrun_ter_720p50.y4m,--preset slower --fast-intra --no-rect --tune grain
-silent_cif_420.y4m,--preset medium --me full --rect --amp
 silent_cif_420.y4m,--preset superfast --weightp --rect
+silent_cif_420.y4m,--preset medium --me full --rect --amp
 silent_cif_420.y4m,--preset placebo --ctu 32 --no-sao --qg-size 16
-vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5
-vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
+washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
 vtc1nw_422_ntsc.y4m,--preset superfast --weightp --nr-intra 100 -F4
-washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
-washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1
-washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
 washdc_422_ntsc.y4m,--preset superfast --psy-rd 1 --tune zerolatency
-washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
 washdc_422_ntsc.y4m,--preset veryfast --tu-inter-depth 4
-washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3
-BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000,--preset medium --no-cutree --analysis-mode=load --bitrate 13000,--preset medium --no-cutree --analysis-mode=load --bitrate 11000,--preset medium --no-cutree --analysis-mode=load --bitrate 9000,--preset medium --no-cutree --analysis-mode=load --bitrate 7000
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 15000,--preset slow --no-cutree --analysis-mode=load --bitrate 13000,--preset slow --no-cutree --analysis-mode=load --bitrate 11000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 7000
-old_town_cross_444_720p50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 15000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 13000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 11000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 9000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000 --early-skip
-Johnny_1280x720_60.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000 --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 13000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 11000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 9000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
-BasketballDrive_1920x1080_50.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
-FourPeople_1280x720_60.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
-FourPeople_1280x720_60.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
+vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5
+washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1 --aq-mode 2
+vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
+washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
+washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3 --limit-modes
+
+# Main12 intraCost overflow bug test
+720p50_parkrun_ter.y4m,--preset medium
 
 # interlace test, even though input YUV is not field seperated
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --interlace tff
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
 
 # vim: tw=200

x265_1.8.tar.gz/source/test/smoke-tests.txt -> x265_1.9.tar.gz/source/test/smoke-tests.txt Changed

x265_1.8.tar.gz/source/test/testbench.cpp -> x265_1.9.tar.gz/source/test/testbench.cpp Changed

x265_1.8.tar.gz/source/test/testharness.h -> x265_1.9.tar.gz/source/test/testharness.h Changed

x265_1.8.tar.gz/source/x265-extras.cpp -> x265_1.9.tar.gz/source/x265-extras.cpp Changed

@@ -36,7 +36,7 @@
     "I count, I ave-QP, I kbps, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
     "P count, P ave-QP, P kbps, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), "
     "B count, B ave-QP, B kbps, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
-    "Version\n";
+    "MaxCLL, MaxFALL, Version\n";
 
 FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* fname, int level)
 {
@@ -61,54 +61,58 @@
         {
             if (level)
             {
-                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, ");
+                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
                 if (param.rc.rateControlMode == X265_RC_CRF)
                     fprintf(csvfp, "RateFactor, ");
-                fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB),  List 0, List 1");
-                /* detailed performance statistics */
-                fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
-                if (level >= 2)
+                if (param.bEnablePsnr)
+                    fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
+                if (param.bEnableSsim)
+                    fprintf(csvfp, "SSIM, SSIM(dB), ");
+                fprintf(csvfp, "Latency, ");
+                fprintf(csvfp, "List 0, List 1");
+                uint32_t size = param.maxCUSize;
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                {
+                    fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
+                    size /= 2;
+                }
+                fprintf(csvfp, ", 4x4");
+                size = param.maxCUSize;
+                if (param.bEnableRectInter)
                 {
-                    uint32_t size = param.maxCUSize;
-                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-                    {
-                        fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
-                        size /= 2;
-                    }
-                    fprintf(csvfp, ", 4x4");
-                    size = param.maxCUSize;
-                    if (param.bEnableRectInter)
-                    {
-                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-                        {
-                            fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
-                            if (param.bEnableAMP)
-                                fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
-                            size /= 2;
-                        }
-                    }
-                    else
-                    {
-                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-                        {
-                            fprintf(csvfp, ", Inter %dx%d", size, size);
-                            size /= 2;
-                        }
-                    }
-                    size = param.maxCUSize;
                     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
                     {
-                        fprintf(csvfp, ", Skip %dx%d", size, size);
+                        fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
+                        if (param.bEnableAMP)
+                            fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
                         size /= 2;
                     }
-                    size = param.maxCUSize;
+                }
+                else
+                {
                     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
                     {
-                        fprintf(csvfp, ", Merge %dx%d", size, size);
+                        fprintf(csvfp, ", Inter %dx%d", size, size);
                         size /= 2;
                     }
-                    fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level");
                 }
+                size = param.maxCUSize;
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                {
+                    fprintf(csvfp, ", Skip %dx%d", size, size);
+                    size /= 2;
+                }
+                size = param.maxCUSize;
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                {
+                    fprintf(csvfp, ", Merge %dx%d", size, size);
+                    size /= 2;
+                }
+                fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level, Avg Residual Energy");
+
+                /* detailed performance statistics */
+                if (level >= 2)
+                    fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
                 fprintf(csvfp, "\n");
             }
             else
@@ -125,17 +129,14 @@
         return;
 
     const x265_frame_stats* frameStats = &pic.frameData;
-    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits);
+    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
     if (param.rc.rateControlMode == X265_RC_CRF)
         fprintf(csvfp, "%.3lf,", frameStats->rateFactor);
     if (param.bEnablePsnr)
         fprintf(csvfp, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
-    else
-        fputs(" -, -, -, -,", csvfp);
     if (param.bEnableSsim)
         fprintf(csvfp, " %.6f, %6.3f,", frameStats->ssim, x265_ssim2dB(frameStats->ssim));
-    else
-        fputs(" -, -,", csvfp);
+    fprintf(csvfp, "%d, ", frameStats->frameLatency);
     if (frameStats->sliceType == 'I')
         fputs(" -, -,", csvfp);
     else
@@ -154,32 +155,33 @@
         else
             fputs(" -,", csvfp);
     }
-    fprintf(csvfp, " %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
-    fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
-    if (level >= 2)
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+        fprintf(csvfp, "%5.2lf%%, %5.2lf%%, %5.2lf%%,", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
+    fprintf(csvfp, "%5.2lf%%", frameStats->cuStats.percentIntraNxN);
+    if (param.bEnableRectInter)
     {
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-            fprintf(csvfp, ", %5.2lf%%, %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
-        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentIntraNxN);
-        if (param.bEnableRectInter)
         {
-            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-            {
-                fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
-                if (param.bEnableAMP)
-                    fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
-            }
+            fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
+            if (param.bEnableAMP)
+                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
         }
-        else
-        {
-            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
-        }
-        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
+    }
+    else
+    {
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
-        fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel);
+            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
+    }
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
+    fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d, %.2lf", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel, frameStats->avgResEnergy);
+
+    if (level >= 2)
+    {
+        fprintf(csvfp, ", %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
+        fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
     }
     fprintf(csvfp, "\n");
     fflush(stderr);
@@ -198,11 +200,13 @@
     }
 
     // CLI arguments or other
+    fputc('"', csvfp);
     for (int i = 1; i < argc; i++)
     {
-        if (i) fputc(' ', csvfp);
+        fputc(' ', csvfp);
         fputs(argv[i], csvfp);
     }
+    fputc('"', csvfp);
 
     // current date and time
     time_t now;
@@ -273,7 +277,7 @@
     else
         fprintf(csvfp, " -, -, -, -, -, -, -,");
 
-    fprintf(csvfp, " %s\n", api.version_str);
+    fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, api.version_str);

x265_1.8.tar.gz/source/x265.cpp -> x265_1.9.tar.gz/source/x265.cpp Changed

x265_1.8.tar.gz/source/x265.def.in -> x265_1.9.tar.gz/source/x265.def.in Changed

x265_1.8.tar.gz/source/x265.h -> x265_1.9.tar.gz/source/x265.h Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -91,13 +92,15 @@
 /* Stores all analysis data for a single frame */
 typedef struct x265_analysis_data
 {
-    void*            interData;
-    void*            intraData;
+    int64_t          satdCost;
     uint32_t         frameRecordSize;
     uint32_t         poc;
     uint32_t         sliceType;
     uint32_t         numCUsInFrame;
     uint32_t         numPartitions;
+    void*            interData;
+    void*            intraData;
+    int              bScenecut;
 } x265_analysis_data;
 
 /* cu statistics */
@@ -132,6 +135,7 @@
     double           avgLumaDistortion;
     double           avgChromaDistortion;
     double           avgPsyEnergy;
+    double           avgResEnergy;
     double           avgLumaLevel;
     uint64_t         bits;
     int              encoderOrder;
@@ -141,6 +145,8 @@
     int              list1POC[16];
     uint16_t         maxLumaLevel;
     char             sliceType;
+    int              bScenecut;
+    int              frameLatency;
     x265_cu_stats    cuStats;
 } x265_frame_stats;
 
@@ -205,6 +211,13 @@
      * this data structure */
     x265_analysis_data analysisData;
 
+    /* An array of quantizer offsets to be applied to this image during encoding.
+     * These are added on top of the decisions made by rateControl.
+     * Adaptive quantization must be enabled to use this feature. These quantizer
+     * offsets should be given for each 16x16 block. Behavior if quant
+     * offsets differ between encoding passes is undefined. */
+    float            *quantOffsets;
+
     /* Frame level statistics */
     x265_frame_stats frameData;
 
@@ -378,6 +391,8 @@
     x265_sliceType_stats  statsI;               /* statistics of I slice */
     x265_sliceType_stats  statsP;               /* statistics of P slice */
     x265_sliceType_stats  statsB;               /* statistics of B slice */
+    uint16_t              maxCLL;               /* maximum content light level */
+    uint16_t              maxFALL;              /* maximum frame average light level */
 } x265_stats;
 
 /* String values accepted by x265_param_parse() (and CLI) for various parameters */
@@ -604,7 +619,7 @@
 
     /* Enables the emission of a user data SEI with the stream headers which
      * describes the encoder version, build info, and parameters. This is
-     * very helpful for debugging, but may interfere with regression tests. 
+     * very helpful for debugging, but may interfere with regression tests.
      * Default enabled */
     int       bEmitInfoSEI;
 
@@ -664,9 +679,9 @@
     int       bBPyramid;
 
     /* A value which is added to the cost estimate of B frames in the lookahead.
-     * It may be a positive value (making B frames appear more expensive, which
-     * causes the lookahead to chose more P frames) or negative, which makes the
-     * lookahead chose more B frames. Default is 0, there are no limits */
+     * It may be a positive value (making B frames appear less expensive, which
+     * biases the lookahead to choose more B frames) or negative, which makes the
+     * lookahead choose more P frames. Default is 0, there are no limits */
     int       bFrameBias;
 
     /* The number of frames that must be queued in the lookahead before it may
@@ -691,6 +706,11 @@
      * should detect scene cuts. The default (40) is recommended. */
     int       scenecutThreshold;
 
+    /* Replace keyframes by using a column of intra blocks that move across the video
+     * from one side to the other, thereby "refreshing" the image. In effect, instead of a
+     * big keyframe, the keyframe is "spread" over many frames. */
+    int       bIntraRefresh;
+
     /*== Coding Unit (CU) definitions ==*/
 
     /* Maximum CU width and height in pixels.  The size must be 64, 32, or 16.
@@ -810,6 +830,9 @@
      * 4 split CUs at the next lower CU depth.  The two flags may be combined */
     uint32_t  limitReferences;
 
+    /* Limit modes analyzed for each CU using cost metrics from the 4 sub-CUs */
+    uint32_t limitModes;
+
     /* ME search method (DIA, HEX, UMH, STAR, FULL). The search patterns
      * (methods) are sorted in increasing complexity, with diamond being the
      * simplest and fastest and full being the slowest.  DIA, HEX, and UMH were
@@ -920,7 +943,7 @@
     /* Psycho-visual rate-distortion strength. Only has an effect in presets
      * which use RDO. It makes mode decision favor options which preserve the
      * energy of the source, at the cost of lost compression. The value must
-     * be between 0 and 2.0, 1.0 is typical. Default 0.3 */
+     * be between 0 and 5.0, 1.0 is typical. Default 2.0 */
     double    psyRd;
 
     /* Strength of psycho-visual optimizations in quantization. Only has an
@@ -1038,7 +1061,7 @@
 
         /* Enable slow and a more detailed first pass encode in multi pass rate control */
         int       bEnableSlowFirstPass;
-        
+
         /* rate-control overrides */
         int        zoneCount;
         x265_zone* zones;
@@ -1051,14 +1074,14 @@
          * values will affect all encoders in the same process */
         const char* lambdaFileName;
 
-        /* Enable stricter conditions to check bitrate deviations in CBR mode. May compromise 
+        /* Enable stricter conditions to check bitrate deviations in CBR mode. May compromise
          * quality to maintain bitrate adherence */
         int bStrictCbr;
 
-        /* Enable adaptive quantization at CU granularity. This parameter specifies 
-         * the minimum CU size at which QP can be adjusted, i.e. Quantization Group 
-         * (QG) size. Allowed values are 64, 32, 16 provided it falls within the 
-         * inclusuve range [maxCUSize, minCUSize]. Experimental, default: maxCUSize*/
+        /* Enable adaptive quantization at CU granularity. This parameter specifies
+         * the minimum CU size at which QP can be adjusted, i.e. Quantization Group
+         * (QG) size. Allowed values are 64, 32, 16 provided it falls within the
+         * inclusuve range [maxCUSize, minCUSize]. Experimental, default: maxCUSize */
         uint32_t qgSize;
     } rc;
 
@@ -1165,12 +1188,27 @@
      * max,min luminance values. */
     const char* masteringDisplayColorVolume;
 
-    /* Content light level info SEI, specified as a string which is parsed when
-     * the stream header SEI are emitted. The string format is "%hu,%hu" where
-     * %hu are unsigned 16bit integers. The first value is the max content light
-     * level (or 0 if no maximum is indicated), the second value is the maximum
-     * picture average light level (or 0). */
-    const char* contentLightLevelInfo;
+    /* Maximum Content light level(MaxCLL), specified as integer that indicates the
+     * maximum pixel intensity level in units of 1 candela per square metre of the
+     * bitstream. x265 will also calculate MaxCLL programmatically from the input
+     * pixel values and set in the Content light level info SEI */
+    uint16_t maxCLL;
+
+    /* Maximum Frame Average Light Level(MaxFALL), specified as integer that indicates
+     * the maximum frame average intensity level in units of 1 candela per square
+     * metre of the bitstream. x265 will also calculate MaxFALL programmatically
+     * from the input pixel values and set in the Content light level info SEI */
+    uint16_t maxFALL;
+
+    /* Minimum luma level of input source picture, specified as a integer which
+     * would automatically increase any luma values below the specified --min-luma
+     * value to that value. */
+    uint16_t minLuma;
+
+    /* Maximum luma level of input source picture, specified as a integer which
+     * would automatically decrease any luma values above the specified --max-luma
+     * value to that value. */
+    uint16_t maxLuma;
 
 } x265_param;
 
@@ -1211,7 +1249,7 @@
     "main422-10", "main422-10-intra",
     "main444-10", "main444-10-intra",
 
-    "main12",     "main12-intra",                  /* Highly Experimental */
+    "main12",     "main12-intra",
     "main422-12", "main422-12-intra",
     "main444-12", "main444-12-intra",
 
@@ -1347,6 +1385,22 @@
  *      close an encoder handler */
 void x265_encoder_close(x265_encoder *);
 
+/* x265_encoder_intra_refresh:
+ *      If an intra refresh is not in progress, begin one with the next P-frame.
+ *      If an intra refresh is in progress, begin one as soon as the current one finishes.
+ *      Requires bIntraRefresh to be set.
+ *

x265_1.8.tar.gz/source/x265cli.h -> x265_1.9.tar.gz/source/x265cli.h Changed

@@ -116,6 +116,7 @@
     { "min-keyint",     required_argument, NULL, 'i' },
     { "scenecut",       required_argument, NULL, 0 },
     { "no-scenecut",          no_argument, NULL, 0 },
+    { "intra-refresh",        no_argument, NULL, 0 },
     { "rc-lookahead",   required_argument, NULL, 0 },
     { "lookahead-slices", required_argument, NULL, 0 },
     { "bframes",        required_argument, NULL, 'b' },
@@ -126,6 +127,8 @@
     { "b-pyramid",            no_argument, NULL, 0 },
     { "ref",            required_argument, NULL, 0 },
     { "limit-refs",     required_argument, NULL, 0 },
+    { "no-limit-modes",       no_argument, NULL, 0 },
+    { "limit-modes",          no_argument, NULL, 0 },
     { "no-weightp",           no_argument, NULL, 0 },
     { "weightp",              no_argument, NULL, 'w' },
     { "no-weightb",           no_argument, NULL, 0 },
@@ -192,6 +195,8 @@
     { "crop-rect",      required_argument, NULL, 0 }, /* DEPRECATED */
     { "master-display", required_argument, NULL, 0 },
     { "max-cll",        required_argument, NULL, 0 },
+    { "min-luma",       required_argument, NULL, 0 },
+    { "max-luma",       required_argument, NULL, 0 },
     { "no-dither",            no_argument, NULL, 0 },
     { "dither",               no_argument, NULL, 0 },
     { "no-repeat-headers",    no_argument, NULL, 0 },
@@ -251,14 +256,18 @@
     H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
     H0("   --no-progress                 Disable CLI progress reports\n");
     H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
-    H0("   --csv-log-level               Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
+    H0("   --csv-log-level <integer>     Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
     H0("\nInput Options:\n");
     H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
     H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
     H0("   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M\n");
     H0("   --input-res WxH               Source picture size [w x h], auto-detected if Y4M\n");
     H1("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
-    H1("   --input-csp <string>          Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n");
+    H1("   --input-csp <string>          Chroma subsampling, auto-detected if Y4M\n");
+    H1("                                 0 - i400 (4:0:0 monochrome)\n");
+    H1("                                 1 - i420 (4:2:0 default)\n");
+    H1("                                 2 - i422 (4:2:2)\n");
+    H1("                                 3 - i444 (4:4:4)\n");
     H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
     H0("   --seek <integer>              First frame to encode\n");
     H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
@@ -292,7 +301,7 @@
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
     H0("\nAnalysis:\n");
     H0("   --rd <0..6>                   Level of RDO in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
-    H0("   --[no-]psy-rd <0..2.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
+    H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
     H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
     H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
@@ -308,12 +317,13 @@
     H0("\nTemporal / motion search options:\n");
     H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
     H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
-    H0("   --limit-refs <0|1|2|3>        limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
+    H0("   --limit-refs <0|1|2|3>        Limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
     H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
     H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
     H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
     H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
     H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
+    H0("   --[no-]limit-modes            Limit rectangular and asymmetric motion predictions. Default %d\n", param->limitModes);
     H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
     H0("\nSpatial / intra options:\n");
     H0("   --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
@@ -327,6 +337,7 @@
     H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
     H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
     H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
+    H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
     H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
     H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
     H0("   --bframes <integer>           Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
@@ -335,7 +346,7 @@
     H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
     H1("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
     H1("                                 Format of each line: framenumber frametype QP\n");
-    H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
+    H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,K,P,B,b.\n");
     H1("                                 QPs are restricted by qpmin/qpmax.\n");
     H0("\nRate control, Adaptive Quantization:\n");
     H0("   --bitrate <integer>           Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate);
@@ -403,6 +414,8 @@
     H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
     H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
     H0("   --max-cll <string>            Emit content light level info SEI as \"cll,fall\" (HDR)\n");
+    H0("   --min-luma <integer>          Minimum luma plane value of input source picture\n");
+    H0("   --max-luma <integer>          Maximum luma plane value of input source picture\n");
     H0("\nBitstream options:\n");
     H0("   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
     H0("   --[no-]info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));

Changes of Revision 12