libx264: Changes of Revision 4
libx264.changes (changed)

@@ -1,4 +1,9 @@
 -------------------------------------------------------------------
+Wed Jul 24 14:11:22 UTC 2013 - i@margueirte.su
+
+- update version 20130723.
+
+-------------------------------------------------------------------
 Thu Mar 7 08:36:00 UTC+0800 2013 - marguerite@opensuse.org
 
 - fallback to 8-bit depth again.
libx264.spec (changed)
@@ -1,5 +1,6 @@ # vim: set ts=4 sw=4 et: # Copyright (c) 2012 Pascal Bleser <pascal.bleser@opensuse.org> +# COpyright (c) 2013 Marguerite Su <marguerite@opensuse.org> # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -10,20 +11,19 @@ # license that conforms to the Open Source Definition (Version 1.9) # published by the Open Source Initiative. -# Please submit bugfixes or comments via http://bugs.opensuse.org/ +# Please submit bugfixes or comments via http://bugs.links2linux.org/ Name: libx264 -%define libname %{name} -%define soname 129 -%define svn 20130224 +%define soname 135 +%define svn 20130723 Version: 0.%{soname}svn%{svn} Release: 1 License: GPL-2.0+ Summary: A free h264/avc encoder - encoder binary Url: http://developers.videolan.org/x264.html Group: Productivity/Multimedia/Video/Editors and Convertors -Source0: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2 -Patch0: x264-use-shared-library.patch +Source: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2 +Patch: x264-use-shared-library.patch BuildRequires: nasm BuildRequires: pkg-config BuildRequires: yasm >= 1.2.0 @@ -59,11 +59,11 @@ moment so please use mencoder or another tool that supports x264 library for all other file types. -%package -n %{libname}-%{soname} +%package %{soname} Summary: A free h264/avc encoder - encoder binary Group: Productivity/Multimedia/Video/Editors and Convertors -%description -n %{libname}-%{soname} +%description %{soname} x264 is a free library for encoding next-generation H264/AVC video streams. The code is written from scratch by Laurent Aimar, Loren Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans @@ -73,15 +73,14 @@ development with libx264. This library is needed to build mplayer/mencoder with H264 encoding support. -%package -n %{libname}-devel +%package devel Summary: Libraries and include file for the %{name} encoder Group: Development/Libraries/C and C++ -Requires: %{buildrequires} -Requires: %{libname}-%{soname} = %{version} -Provides: %{name}-devel = %{version} -Obsoletes: %{name}-devel < %{version} +Requires: %{name}-%{soname} = %{version} +Provides: x264-devel = %{version} +Obsoletes: x264-devel < %{version} -%description -n %{libname}-devel +%description devel x264 is a free library for encoding next-generation H264/AVC video streams. The code is written from scratch by Laurent Aimar, Loren Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans @@ -92,8 +91,8 @@ mplayer/mencoder with H264 encoding support. 
%prep -%setup -q -n "x264-snapshot-%{svn}-2245" -%patch0 -p0 +%setup -q -n x264-snapshot-%{svn}-2245 +%patch -p1 FAKE_BUILDDATE=$(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y') sed -i "s/__DATE__/\"$FAKE_BUILDDATE\"/" x264.c @@ -104,29 +103,26 @@ %install %makeinstall -rm -f "%{buildroot}%{_libdir}/%{libname}.so" -rm -f "%{buildroot}%{_libdir}/%{libname}.a" -ln -s %{libname}.so.%{soname} "%{buildroot}%{_libdir}/%{libname}.so" +rm -f %{buildroot}%{_libdir}/%{name}.so +rm -f %{buildroot}%{_libdir}/%{name}.a +ln -s %{name}.so.%{soname} %{buildroot}%{_libdir}/%{name}.so -rm "%{buildroot}%{_bindir}"/* +rm %{buildroot}%{_bindir}/* -echo "%{libname}-%{soname}" > %{_sourcedir}/baselibs.conf +echo "%{name}-%{soname}" > %{_sourcedir}/baselibs.conf -%clean -%{?buildroot:%__rm -rf "%{buildroot}"} +%post -n %{name}-%{soname} -p /sbin/ldconfig +%postun -n %{name}-%{soname} -p /sbin/ldconfig -%post -n %{libname}-%{soname} -p /sbin/ldconfig -%postun -n %{libname}-%{soname} -p /sbin/ldconfig - -%files -n %{libname}-%{soname} +%files %{soname} %defattr(0644,root,root) -%{_libdir}/%{libname}.so.%{soname} +%{_libdir}/%{name}.so.%{soname} -%files -n %{libname}-devel +%files devel %defattr(0644,root,root) %{_includedir}/x264.h %{_includedir}/x264_config.h %{_libdir}/pkgconfig/x264.pc -%{_libdir}/%{libname}.so +%{_libdir}/%{name}.so %changelog
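For reference, a minimal consumer of the packaged library (illustrative, not part of the spec): it uses only the public x264.h API and links through the x264.pc file shipped in the -devel subpackage listed above, e.g. gcc test.c $(pkg-config --cflags --libs x264).

/* Hypothetical test.c: sanity-check the installed headers and shared library. */
#include <stdio.h>
#include <x264.h>

int main( void )
{
    x264_param_t param;
    x264_param_default( &param );             /* public API from x264.h */
    printf( "x264 build %d\n", X264_BUILD );  /* 135 for this 20130723 snapshot */
    return 0;
}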
x264-use-shared-library.patch (changed)

@@ -1,21 +1,23 @@
---- Makefile.orig	2011-12-26 22:45:03.000000000 +0100
-+++ Makefile	2011-12-27 20:03:46.070404383 +0100
-@@ -152,6 +152,7 @@
+Index: x264-snapshot-20130723-2245/Makefile
+===================================================================
+--- x264-snapshot-20130723-2245.orig/Makefile
++++ x264-snapshot-20130723-2245/Makefile
+@@ -171,6 +171,7 @@ $(LIBX264): $(GENERATED) .depend $(OBJS)
  
-- $(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
-+ $(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
  	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 +	ln -s $(SONAME) libx264.so
  
  ifneq ($(EXE),)
  .PHONY: x264 checkasm
-@@ -159,8 +160,8 @@
+@@ -178,8 +179,8 @@ x264: x264$(EXE)
  checkasm: checkasm$(EXE)
  endif
  
--x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
+-x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
 -	$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
-+x264$(EXE): .depend $(OBJCLI) $(SONAME)
++x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(SONAME)
 +	$(LD)$@ $(OBJCLI) -L. -lx264 $(LDFLAGSCLI) $(LDFLAGS)
  
- checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+ checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
  	$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
x264-snapshot-20130224-2245.tar.bz2/.gitignore -> x264-snapshot-20130723-2245.tar.bz2/.gitignore (changed)

@@ -43,3 +43,5 @@
 .digress_x264
 dataDec.txt
 log.dec
+common/oclobj.h
+x264_lookahead.clbin
x264-snapshot-20130224-2245.tar.bz2/Makefile -> x264-snapshot-20130723-2245.tar.bz2/Makefile (changed)

@@ -8,6 +8,8 @@
 vpath %.asm $(SRCPATH)
 vpath %.rc $(SRCPATH)
 
+GENERATED =
+
 all: default
 default:
 
@@ -145,6 +147,13 @@
 endif
 endif
 
+ifeq ($(HAVE_OPENCL),yes)
+common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
+	cat $^ | perl $(SRCPATH)/tools/cltostr.pl x264_opencl_source > $@
+GENERATED += common/oclobj.h
+SRCS += common/opencl.c encoder/slicetype-cl.c
+endif
+
 OBJS += $(SRCS:%.c=%.o)
 OBJCLI += $(SRCCLI:%.c=%.o)
 OBJSO += $(SRCSO:%.c=%.o)
@@ -155,12 +164,12 @@
 lib-static: $(LIBX264)
 lib-shared: $(SONAME)
 
-$(LIBX264): .depend $(OBJS) $(OBJASM)
+$(LIBX264): $(GENERATED) .depend $(OBJS) $(OBJASM)
 	rm -f $(LIBX264)
 	$(AR)$@ $(OBJS) $(OBJASM)
 	$(if $(RANLIB), $(RANLIB) $@)
 
-$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
+$(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
 	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 
 ifneq ($(EXE),)
@@ -169,10 +178,10 @@
 checkasm: checkasm$(EXE)
 endif
 
-x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
+x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
 	$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
 
-checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
 	$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
 
 $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
@@ -231,7 +240,7 @@
 
 clean:
 	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
-	rm -f checkasm checkasm.exe $(OBJCHK)
+	rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
 	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
 
 distclean: clean
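The new common/oclobj.h target above is generated at build time: tools/cltostr.pl concatenates the OpenCL kernel sources into a byte array named x264_opencl_source (the symbol name passed on the rule's command line), which common/opencl.c later feeds to clCreateProgramWithSource(). A rough sketch of the generated header's shape follows, with placeholder contents only, assuming nothing beyond the symbol names used elsewhere in this revision.

/* Illustrative shape of the generated common/oclobj.h (placeholder bytes, not
 * real build output): x264_opencl_source carries the concatenated .cl text,
 * x264_opencl_source_hash identifies it for the clbin cache check in
 * common/opencl.c. */
static const char x264_opencl_source_hash[] = "0000000000000000";
static const char x264_opencl_source[] = {
    '/', '*', ' ', 'k', 'e', 'r', 'n', 'e', 'l', 's', ' ', '*', '/', '\n',
    /* ...remaining bytes of the concatenated OpenCL kernels... */
    0x00
};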
x264-snapshot-20130224-2245.tar.bz2/common/arm/mc-a.S -> x264-snapshot-20130723-2245.tar.bz2/common/arm/mc-a.S (changed)
@@ -5,6 +5,7 @@ * * Authors: David Conrad <lessen42@gmail.com> * Mans Rullgard <mans@mansr.com> + * Stefan Groenroos <stefan.gronroos@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -813,54 +814,57 @@ // void x264_mc_chroma_neon( uint8_t *dst, intptr_t i_dst_stride, // uint8_t *src, intptr_t i_src_stride, // int dx, int dy, int i_width, int i_height ); + function x264_mc_chroma_neon - push {r4-r6, lr} - ldrd r4, [sp, #16] - ldr r6, [sp, #24] + push {r4-r8, lr} + vpush {d8-d11} + ldrd r4, [sp, #56] + ldrd r6, [sp, #64] - asr lr, r5, #3 - mul lr, r3, lr - add r2, r2, r4, asr #3 - cmp r6, #4 - add r2, r2, lr + asr lr, r6, #3 + mul lr, r4, lr + add r3, r3, r5, asr #2 + cmp r7, #4 - and r4, r4, #7 and r5, r5, #7 - pld [r2] - pld [r2, r3] + and r6, r6, #7 + + add r3, r3, lr + bic r3, r3, #0x1 + + pld [r3] + pld [r3, r4] bgt mc_chroma_w8 beq mc_chroma_w4 -// calculate cA cB cC cD -.macro CHROMA_MC_START r0 r1 - muls lr, r4, r5 - rsb r6, lr, r5, lsl #3 - rsb ip, lr, r4, lsl #3 - sub r4, lr, r4, lsl #3 - sub r4, r4, r5, lsl #3 - add r4, r4, #64 +.macro CHROMA_MC_START r00, r01, r10, r11 + muls lr, r5, r6 + rsb r7, lr, r6, lsl #3 + rsb ip, lr, r5, lsl #3 + sub r5, lr, r5, lsl #3 + sub r5, r5, r6, lsl #3 + add r5, r5, #64 beq 2f + vld2.8 {\r00-\r01}, [r3], r4 - add r5, r2, r3 + vdup.8 d0, r5 + vdup.8 d1, ip - vdup.8 d0, r4 - lsl r3, r3, #1 - vdup.8 d1, ip - vld1.64 {\r0}, [r2], r3 - vdup.8 d2, r6 - vld1.64 {\r1}, [r5], r3 - vdup.8 d3, lr - ldr r4, [sp, #28] - - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 + vdup.8 d2, r7 + vld2.8 {\r10-\r11}, [r3], r4 + vdup.8 d3, lr + ldr r5, [sp, #72] .endm .macro CHROMA_MC width, align mc_chroma_w\width: - CHROMA_MC_START d4, d6 + CHROMA_MC_START d4, d5, d8, d9 + vext.8 d6, d4, d6, #1 + vext.8 d7, d5, d7, #1 + vext.8 d10, d8, d10, #1 + vext.8 d11, d9, d11, #1 // since the element size varies, there's a different index for the 2nd store .if \width == 4 .set st2, 1 @@ -868,187 +872,292 @@ .set st2, 2 .endif - vtrn.32 d4, d5 - vtrn.32 d6, d7 + vtrn.32 d4, d6 + vtrn.32 d5, d7 + vtrn.32 d8, d10 + vtrn.32 d9, d11 - vtrn.32 d0, d1 - vtrn.32 d2, d3 + vtrn.32 d0, d1 + vtrn.32 d2, d3 1: // height loop, interpolate xy - pld [r5] + vmull.u8 q8, d4, d0 - vmlal.u8 q8, d6, d2 - vld1.64 {d4}, [r2], r3 - vext.8 d5, d4, d5, #1 - vtrn.32 d4, d5 - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d4, d2 - vld1.64 {d6}, [r5], r3 + vmlal.u8 q8, d8, d2 + vmull.u8 q9, d5, d0 + vmlal.u8 q9, d9, d2 + + vld2.8 {d4-d5}, [r3], r4 + + vext.8 d6, d4, d6, #1 + vext.8 d7, d5, d7, #1 + vadd.i16 d16, d16, d17 vadd.i16 d17, d18, d19 + + vtrn.32 d4, d6 + vtrn.32 d5, d7 + + vmull.u8 q10, d8, d0 + vmlal.u8 q10, d4, d2 + vmull.u8 q11, d9, d0 + vmlal.u8 q11, d5, d2 + + vld2.8 {d8-d9}, [r3], r4 + vrshrn.u16 d16, q8, #6 - subs r4, r4, #2 - pld [r2] - vext.8 d7, d6, d7, #1 - vtrn.32 d6, d7 - vst1.\align {d16[0]}, [r0,:\align], r1 - vst1.\align {d16[st2]}, [r0,:\align], r1 + + vext.8 d10, d8, d10, #1 + vext.8 d11, d9, d11, #1 + + vadd.i16 d18, d20, d21 + vadd.i16 d19, d22, d23 + + vtrn.32 d8, d10 + vtrn.32 d9, d11 + + vrshrn.u16 d18, q9, #6 + + subs r5, r5, #2 + + pld [r3] + pld [r3, r4] + + vst1.\align {d16[0]}, [r0,:\align], r2 + vst1.\align {d16[st2]}, [r1,:\align], r2 + vst1.\align {d18[0]}, [r0,:\align], r2 + vst1.\align {d18[st2]}, [r1,:\align], r2 bgt 1b - pop {r4-r6, pc} + vpop {d8-d11} + pop {r4-r8, pc} 2: // dx or dy are 0 - tst r6, r6 - add ip, ip, r6 - vdup.8 d0, r4 + tst r7, r7 + add ip, ip, r7 + 
vdup.8 d0, r5 + ldr r5, [sp, #72] vdup.8 d1, ip - vtrn.32 d0, d1 - ldr r4, [sp, #28] beq 4f - vext.32 d1, d0, d1, #1 - add r5, r2, r3 - lsl r3, r3, #1 - vld1.32 {d4[0]}, [r2], r3 - vld1.32 {d4[1]}, [r5], r3 + vld1.64 {d4}, [r3], r4 + vld1.64 {d6}, [r3], r4 3: // vertical interpolation loop - pld [r5] + vmull.u8 q8, d4, d0 - vld1.32 {d4[0]}, [r2], r3 - vmull.u8 q9, d4, d1 - vld1.32 {d4[1]}, [r5], r3 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - vrshrn.u16 d16, q8, #6 - subs r4, r4, #2 - pld [r2] - vst1.\align {d16[0]}, [r0,:\align], r1 - vst1.\align {d16[st2]}, [r0,:\align], r1 + vmlal.u8 q8, d6, d1 + vmull.u8 q9, d6, d0 + vld1.64 {d4}, [r3], r4 + vmlal.u8 q9, d4, d1 + vld1.64 {d6}, [r3], r4 + + vrshrn.u16 d16, q8, #6 // uvuvuvuv + vrshrn.u16 d17, q9, #6 // uvuvuvuv + subs r5, r5, #2 + vuzp.8 d16, d17 // d16=uuuu|uuuu, d17=vvvv|vvvv + + pld [r3] + pld [r3, r4] + + vst1.\align {d16[0]}, [r0,:\align], r2 + vst1.\align {d16[st2]}, [r0,:\align], r2 + vst1.\align {d17[0]}, [r1,:\align], r2 + vst1.\align {d17[st2]}, [r1,:\align], r2 bgt 3b - pop {r4-r6, pc} + vpop {d8-d11} + pop {r4-r8, pc} 4: // dy is 0 - vld1.64 {d4}, [r2], r3 - vld1.64 {d6}, [r2], r3 - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vtrn.32 d4, d5 - vtrn.32 d6, d7 + + vld1.64 {d4-d5}, [r3], r4 + vld1.64 {d6-d7}, [r3], r4 + + vext.8 d5, d4, d5, #2 + vext.8 d7, d6, d7, #2 5: // horizontal interpolation loop + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d5, d1 vmull.u8 q9, d6, d0 - subs r4, r4, #2 - vld1.64 {d4}, [r2], r3 - vext.8 d5, d4, d5, #1 - vtrn.32 d4, d5 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - pld [r2] + vmlal.u8 q9, d7, d1 + + subs r5, r5, #2 + vld1.64 {d4-d5}, [r3], r4 + vld1.64 {d6-d7}, [r3], r4 + vext.8 d5, d4, d5, #2 vrshrn.u16 d16, q8, #6 - vld1.64 {d6}, [r2], r3 - vext.8 d7, d6, d7, #1 - vtrn.32 d6, d7 - pld [r2] - vst1.\align {d16[0]}, [r0,:\align], r1 - vst1.\align {d16[st2]}, [r0,:\align], r1 + vrshrn.u16 d17, q9, #6 + vext.8 d7, d6, d7, #2 + vuzp.8 d16, d17 + + pld [r3] + pld [r3, r4] + + vst1.\align {d16[0]}, [r0,:\align], r2 + vst1.\align {d16[st2]}, [r0,:\align], r2 + vst1.\align {d17[0]}, [r1,:\align], r2 + vst1.\align {d17[st2]}, [r1,:\align], r2 bgt 5b - pop {r4-r6, pc} + vpop {d8-d11} + pop {r4-r8, pc} .endm - CHROMA_MC 2, 16 - CHROMA_MC 4, 32 + CHROMA_MC 2, 16 + CHROMA_MC 4, 32 -// the optimial timing for width 8 is different enough that it's not -// readable to put it in the same macro as width 2/4 mc_chroma_w8: - CHROMA_MC_START d4-d5, d6-d7 + CHROMA_MC_START d4, d7, d8, d11 + vext.8 d5, d4, d5, #1 + vext.8 d9, d8, d9, #1 + vext.8 d7, d6, d7, #1 + vext.8 d11, d10, d11, #1 1: // height loop, interpolate xy - pld [r5] vmull.u8 q8, d4, d0 vmlal.u8 q8, d5, d1 - vld1.64 {d4, d5}, [r2], r3 - vmlal.u8 q8, d6, d2 - vext.8 d5, d4, d5, #1 - vmlal.u8 q8, d7, d3 + vmlal.u8 q8, d8, d2 + vmlal.u8 q8, d9, d3 + vmull.u8 q9, d6, d0 - subs r4, r4, #2 vmlal.u8 q9, d7, d1 - vmlal.u8 q9, d4, d2 - vmlal.u8 q9, d5, d3 + vmlal.u8 q9, d10, d2 + vmlal.u8 q9, d11, d3 + + vld2.8 {d4-d7}, [r3], r4 + + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + + vmull.u8 q10, d8, d0 + vmlal.u8 q10, d9, d1 + vmlal.u8 q10, d4, d2 + vmlal.u8 q10, d5, d3 + + vmull.u8 q11, d10, d0 + vmlal.u8 q11, d11, d1 + vmlal.u8 q11, d6, d2 + vmlal.u8 q11, d7, d3 + + subs r5, r5, #2 + vld2.8 {d8-d11}, [r3], r4 + vrshrn.u16 d16, q8, #6 - vld1.64 {d6, d7}, [r5], r3 - pld [r2] vrshrn.u16 d17, q9, #6 - vext.8 d7, d6, d7, #1 - vst1.64 {d16}, [r0,:64], r1 - vst1.64 {d17}, [r0,:64], r1 + vrshrn.u16 d18, q10, #6 + vext.8 d9, d8, d9, #1 + vrshrn.u16 d19, q11, 
#6 + vext.8 d11, d10, d11, #1 + + pld [r3] + pld [r3, r4] + + vst1.64 {d16}, [r0,:64], r2 + vst1.64 {d17}, [r1,:64], r2 + vst1.64 {d18}, [r0,:64], r2 + vst1.64 {d19}, [r1,:64], r2 + bgt 1b - pop {r4-r6, pc} + vpop {d8-d11} + pop {r4-r8, pc} 2: // dx or dy are 0 - tst r6, r6 - add ip, ip, r6 - vdup.8 d0, r4 + tst r7, r7 + add ip, ip, r7 + vdup.8 d0, r5 + ldr r5, [sp, #72] vdup.8 d1, ip - ldr r4, [sp, #28] beq 4f - add r5, r2, r3 - lsl r3, r3, #1 - vld1.64 {d4}, [r2], r3 - vld1.64 {d6}, [r5], r3 + vld2.8 {d4-d5}, [r3], r4 + vld2.8 {d6-d7}, [r3], r4 3: // vertical interpolation loop - pld [r5] - vmull.u8 q8, d4, d0 + vmull.u8 q8, d4, d0 //U vmlal.u8 q8, d6, d1 - vld1.64 {d4}, [r2], r3 - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d4, d1 - vld1.64 {d6}, [r5], r3 + vmull.u8 q9, d5, d0 //V + vmlal.u8 q9, d7, d1 + + vld2.8 {d4-d5}, [r3], r4 + + vmull.u8 q10, d6, d0 + vmlal.u8 q10, d4, d1 + vmull.u8 q11, d7, d0 + vmlal.u8 q11, d5, d1 + + vld2.8 {d6-d7}, [r3], r4 + vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, #6 - subs r4, r4, #2 - pld [r2] - vst1.64 {d16}, [r0,:64], r1 - vst1.64 {d17}, [r0,:64], r1 + vrshrn.u16 d18, q10, #6 + vrshrn.u16 d19, q11, #6 + subs r5, r5, #2 + + pld [r3] + pld [r3, r4] + + vst1.64 {d16}, [r0,:64], r2 + vst1.64 {d17}, [r1,:64], r2 + vst1.64 {d18}, [r0,:64], r2 + vst1.64 {d19}, [r1,:64], r2 + bgt 3b - pop {r4-r6, pc} + vpop {d8-d11} + pop {r4-r8, pc} 4: // dy is 0 - vld1.64 {d4, d5}, [r2], r3 - vld1.64 {d6, d7}, [r2], r3 + + vld2.8 {d4-d7}, [r3], r4 + vld2.8 {d8-d11}, [r3], r4 vext.8 d5, d4, d5, #1 vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + vext.8 d11, d10, d11, #1 5: // horizontal interpolation loop - pld [r2] - subs r4, r4, #2 - vmull.u8 q8, d4, d0 + subs r5, r5, #2 + vmull.u8 q8, d4, d0 //U vmlal.u8 q8, d5, d1 - vld1.64 {d4, d5}, [r2], r3 - vmull.u8 q9, d6, d0 + vmull.u8 q9, d6, d0 //V vmlal.u8 q9, d7, d1 - pld [r2] + + vld2.8 {d4-d7}, [r3], r4 + + vmull.u8 q10, d8, d0 + vmlal.u8 q10, d9, d1 + vmull.u8 q11, d10, d0 + vmlal.u8 q11, d11, d1 + + vld2.8 {d8-d11}, [r3], r4 + vext.8 d5, d4, d5, #1 vrshrn.u16 d16, q8, #6 - vrshrn.u16 d17, q9, #6 - vld1.64 {d6, d7}, [r2], r3 vext.8 d7, d6, d7, #1 - vst1.64 {d16}, [r0,:64], r1 - vst1.64 {d17}, [r0,:64], r1 + vrshrn.u16 d17, q9, #6 + vext.8 d9, d8, d9, #1 + vrshrn.u16 d18, q10, #6 + vext.8 d11, d10, d11, #1 + vrshrn.u16 d19, q11, #6 + + pld [r3] + pld [r3, r4] + + vst1.64 {d16}, [r0,:64], r2 + vst1.64 {d17}, [r1,:64], r2 + vst1.64 {d18}, [r0,:64], r2 + vst1.64 {d19}, [r1,:64], r2 bgt 5b - pop {r4-r6, pc} + vpop {d8-d11} + pop {r4-r8, pc} + .endfunc
x264-snapshot-20130224-2245.tar.bz2/common/arm/mc-c.c -> x264-snapshot-20130723-2245.tar.bz2/common/arm/mc-c.c (changed)

@@ -238,7 +238,7 @@
         pf->offsetsub = x264_mc_offsetsub_wtab_neon;
         pf->weight_cache = x264_weight_cache_neon;
 
-//      pf->mc_chroma = x264_mc_chroma_neon;
+        pf->mc_chroma = x264_mc_chroma_neon;
         pf->mc_luma = mc_luma_neon;
         pf->get_ref = get_ref_neon;
         pf->hpel_filter = hpel_filter_neon;
x264-snapshot-20130224-2245.tar.bz2/common/arm/quant-a.S -> x264-snapshot-20130723-2245.tar.bz2/common/arm/quant-a.S (changed)
@@ -35,7 +35,7 @@ .text -.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no +.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no vadd.u16 q8, q8, \bias0 vadd.u16 q9, q9, \bias1 .ifc \load_mf, yes @@ -55,7 +55,7 @@ veor q9, q9, q15 vsub.s16 q8, q8, q14 vsub.s16 q9, q9, q15 - vorr \bias0, q8, q9 + vorr \mask, q8, q9 vst1.64 {d16-d19}, [r0,:128]! .endm @@ -89,7 +89,7 @@ vabs.s16 q9, q15 vdup.16 q0, r2 vdup.16 q2, r1 - QUANT_TWO q0, q0, d4, d5, d4, d5 + QUANT_TWO q0, q0, d4, d5, d4, d5, q0 vorr d0, d0, d1 QUANT_END d0 .endfunc @@ -101,11 +101,52 @@ vabs.s16 q9, q15 vld1.64 {d0-d3}, [r2,:128] vld1.64 {d4-d7}, [r1,:128] - QUANT_TWO q0, q1, d4, d5, d6, d7 + QUANT_TWO q0, q1, d4, d5, d6, d7, q0 vorr d0, d0, d1 QUANT_END d0 .endfunc +// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ) +function x264_quant_4x4x4_neon + vpush {d8-d15} + vld1.64 {d28-d31}, [r0,:128] + vabs.s16 q8, q14 + vabs.s16 q9, q15 + vld1.64 {d0-d3}, [r2,:128] + vld1.64 {d4-d7}, [r1,:128] + QUANT_TWO q0, q1, d4, d5, d6, d7, q4 + vld1.64 {d28-d31}, [r0,:128] + vabs.s16 q8, q14 + vabs.s16 q9, q15 + QUANT_TWO q0, q1, d4, d5, d6, d7, q5 + vld1.64 {d28-d31}, [r0,:128] + vabs.s16 q8, q14 + vabs.s16 q9, q15 + QUANT_TWO q0, q1, d4, d5, d6, d7, q6 + vld1.64 {d28-d31}, [r0,:128] + vabs.s16 q8, q14 + vabs.s16 q9, q15 + QUANT_TWO q0, q1, d4, d5, d6, d7, q7 + vorr d8, d8, d9 + vorr d10, d10, d11 + vorr d12, d12, d13 + vorr d14, d14, d15 + vmov r0, r1, d8 + vmov r2, r3, d10 + orrs r0, r1 + movne r0, #1 + orrs r2, r3 + orrne r0, #2 + vmov r1, r2, d12 + vmov r3, ip, d14 + orrs r1, r2 + orrne r0, #4 + orrs r3, ip + orrne r0, #8 + vpop {d8-d15} + bx lr +.endfunc + // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) function x264_quant_8x8_neon vld1.64 {d28-d31}, [r0,:128] @@ -113,13 +154,13 @@ vabs.s16 q9, q15 vld1.64 {d0-d3}, [r2,:128]! vld1.64 {d4-d7}, [r1,:128]! - QUANT_TWO q0, q1, d4, d5, d6, d7 + QUANT_TWO q0, q1, d4, d5, d6, d7, q0 .rept 3 vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 vld1.64 {d2-d5}, [r2,:128]! - QUANT_TWO q1, q2, d4, d5, d6, d7, yes + QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes vorr q0, q0, q1 .endr vorr d0, d0, d1
x264-snapshot-20130224-2245.tar.bz2/common/arm/quant.h -> x264-snapshot-20130723-2245.tar.bz2/common/arm/quant.h (changed)

@@ -31,6 +31,7 @@
 int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
 int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
 int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
 int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
 
 void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
x264-snapshot-20130224-2245.tar.bz2/common/bitstream.c -> x264-snapshot-20130723-2245.tar.bz2/common/bitstream.c (changed)
@@ -39,11 +39,20 @@ return dst; } -#if HAVE_MMX uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end ); uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end ); -uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end ); -#endif +uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end ); +void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); /**************************************************************************** * x264_nal_encode: @@ -88,13 +97,49 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ) { + memset( pf, 0, sizeof(*pf) ); + pf->nal_escape = x264_nal_escape_c; #if HAVE_MMX +#if ARCH_X86_64 + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2; + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2; +#endif + if( cpu&X264_CPU_MMX2 ) pf->nal_escape = x264_nal_escape_mmx2; - if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) ) - pf->nal_escape = x264_nal_escape_sse2; - if( cpu&X264_CPU_AVX ) - pf->nal_escape = x264_nal_escape_avx; + if( cpu&X264_CPU_SSE2 ) + { +#if ARCH_X86_64 + if( cpu&X264_CPU_LZCNT ) + { + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2_lzcnt; + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2_lzcnt; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt; + } +#endif + if( cpu&X264_CPU_SSE2_IS_FAST ) + pf->nal_escape = x264_nal_escape_sse2; + } +#if ARCH_X86_64 + if( cpu&X264_CPU_SSSE3 ) + { + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3; + if( cpu&X264_CPU_LZCNT ) + { + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3_lzcnt; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt; + } + } + + if( cpu&X264_CPU_AVX2 ) + { + pf->nal_escape = x264_nal_escape_avx2; + if( 
cpu&X264_CPU_BMI2 ) + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2; + } +#endif #endif }
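A minimal usage sketch of the dispatch table that x264_bitstream_init() fills above (the wrapper function and include path are illustrative, not part of the patch):

/* Escape one NAL payload through whichever nal_escape implementation the
 * given cpu flags select (C, MMX2, SSE2 or AVX2 in this revision). */
#include <stdint.h>
#include "common/common.h"   /* x264 tree include: x264_bitstream_function_t */

static uint8_t *escape_nal( uint8_t *dst, uint8_t *src, int i_size, int cpu )
{
    x264_bitstream_function_t bs;
    x264_bitstream_init( cpu, &bs );
    return bs.nal_escape( dst, src, src + i_size );
}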
x264-snapshot-20130224-2245.tar.bz2/common/bitstream.h -> x264-snapshot-20130723-2245.tar.bz2/common/bitstream.h (changed)

@@ -55,9 +55,9 @@
 
 typedef struct
 {
-    int last;
-    int mask;
-    dctcoef level[16];
+    int32_t last;
+    int32_t mask;
+    ALIGNED_16( dctcoef level[18] );
 } x264_run_level_t;
 
 extern const vlc_t x264_coeff0_token[6];
@@ -69,6 +69,12 @@
 typedef struct
 {
     uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
+    void (*cabac_block_residual_internal)( dctcoef *l, int b_interlaced,
+                                           intptr_t ctx_block_cat, x264_cabac_t *cb );
+    void (*cabac_block_residual_rd_internal)( dctcoef *l, int b_interlaced,
+                                              intptr_t ctx_block_cat, x264_cabac_t *cb );
+    void (*cabac_block_residual_8x8_rd_internal)( dctcoef *l, int b_interlaced,
+                                                  intptr_t ctx_block_cat, x264_cabac_t *cb );
 } x264_bitstream_function_t;
 
 void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
x264-snapshot-20130224-2245.tar.bz2/common/common.c -> x264-snapshot-20130723-2245.tar.bz2/common/common.c (changed)
@@ -171,6 +171,10 @@ param->b_pic_struct = 0; param->b_fake_interlaced = 0; param->i_frame_packing = -1; + param->b_opencl = 0; + param->i_opencl_device = 0; + param->opencl_device_id = NULL; + param->psz_clbin_file = NULL; } static int x264_param_apply_preset( x264_param_t *param, const char *preset ) @@ -563,6 +567,8 @@ } #define atobool(str) ( name_was_bool = 1, x264_atobool( str, &b_error ) ) +#undef atoi +#undef atof #define atoi(str) x264_atoi( str, &b_error ) #define atof(str) x264_atof( str, &b_error ) @@ -620,10 +626,8 @@ b_error = 1; } free( buf ); - if( p->cpu & X264_CPU_SSSE3 ) + if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) ) p->cpu |= X264_CPU_SSE2_IS_FAST; - if( p->cpu & X264_CPU_SSE4 ) - p->cpu |= X264_CPU_SHUFFLE_IS_FAST; } } OPT("threads") @@ -778,8 +782,12 @@ p->i_slice_max_size = atoi(value); OPT("slice-max-mbs") p->i_slice_max_mbs = atoi(value); + OPT("slice-min-mbs") + p->i_slice_min_mbs = atoi(value); OPT("slices") p->i_slice_count = atoi(value); + OPT("slices-max") + p->i_slice_count_max = atoi(value); OPT("cabac") p->b_cabac = atobool(value); OPT("cabac-idc") @@ -1029,6 +1037,14 @@ p->b_fake_interlaced = atobool(value); OPT("frame-packing") p->i_frame_packing = atoi(value); + OPT("stitchable") + p->b_stitchable = atobool(value); + OPT("opencl") + p->b_opencl = atobool( value ); + OPT("opencl-clbin") + p->psz_clbin_file = strdup( value ); + OPT("opencl-device") + p->i_opencl_device = atoi( value ); else return X264_PARAM_BAD_NAME; #undef OPT @@ -1166,17 +1182,14 @@ void *x264_malloc( int i_size ) { uint8_t *align_buf = NULL; -#if SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64) - /* Mac OS X and Win x64 always returns 16 byte aligned memory */ - align_buf = malloc( i_size ); -#elif HAVE_MALLOC_H - align_buf = memalign( 16, i_size ); +#if HAVE_MALLOC_H + align_buf = memalign( NATIVE_ALIGN, i_size ); #else - uint8_t *buf = malloc( i_size + 15 + sizeof(void **) ); + uint8_t *buf = malloc( i_size + (NATIVE_ALIGN-1) + sizeof(void **) ); if( buf ) { - align_buf = buf + 15 + sizeof(void **); - align_buf -= (intptr_t) align_buf & 15; + align_buf = buf + (NATIVE_ALIGN-1) + sizeof(void **); + align_buf -= (intptr_t) align_buf & (NATIVE_ALIGN-1); *( (void **) ( align_buf - sizeof(void **) ) ) = buf; } #endif @@ -1192,7 +1205,7 @@ { if( p ) { -#if HAVE_MALLOC_H || SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64) +#if HAVE_MALLOC_H free( p ); #else free( *( ( ( void **) p ) - 1 ) ); @@ -1281,6 +1294,8 @@ s += sprintf( s, "bitdepth=%d ", BIT_DEPTH ); } + if( p->b_opencl ) + s += sprintf( s, "opencl=%d ", p->b_opencl ); s += sprintf( s, "cabac=%d", p->b_cabac ); s += sprintf( s, " ref=%d", p->i_frame_reference ); s += sprintf( s, " deblock=%d:%d:%d", p->b_deblocking_filter, @@ -1305,14 +1320,20 @@ s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads ); if( p->i_slice_count ) s += sprintf( s, " slices=%d", p->i_slice_count ); + if( p->i_slice_count_max ) + s += sprintf( s, " slices_max=%d", p->i_slice_count_max ); if( p->i_slice_max_size ) s += sprintf( s, " slice_max_size=%d", p->i_slice_max_size ); if( p->i_slice_max_mbs ) s += sprintf( s, " slice_max_mbs=%d", p->i_slice_max_mbs ); + if( p->i_slice_min_mbs ) + s += sprintf( s, " slice_min_mbs=%d", p->i_slice_min_mbs ); s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction ); s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate ); s += sprintf( s, " interlaced=%s", p->b_interlaced ? p->b_tff ? "tff" : "bff" : p->b_fake_interlaced ? 
"fake" : "0" ); s += sprintf( s, " bluray_compat=%d", p->b_bluray_compat ); + if( p->b_stitchable ) + s += sprintf( s, " stitchable=%d", p->b_stitchable ); s += sprintf( s, " constrained_intra=%d", p->b_constrained_intra );
x264-snapshot-20130224-2245.tar.bz2/common/common.h -> x264-snapshot-20130723-2245.tar.bz2/common/common.h (changed)
@@ -40,6 +40,7 @@ #define IS_DISPOSABLE(type) ( type == X264_TYPE_B ) #define FIX8(f) ((int)(f*(1<<8)+.5)) #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1)) +#define ARRAY_ELEMS(a) ((sizeof(a))/(sizeof(a[0]))) #define CHECKED_MALLOC( var, size )\ do {\ @@ -53,6 +54,8 @@ memset( var, 0, size );\ } while( 0 ) +#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) + #define X264_BFRAME_MAX 16 #define X264_REF_MAX 16 #define X264_THREAD_MAX 128 @@ -202,6 +205,10 @@ }; #include "x264.h" +#if HAVE_OPENCL +#include "opencl.h" +#endif +#include "cabac.h" #include "bitstream.h" #include "set.h" #include "predict.h" @@ -209,7 +216,6 @@ #include "mc.h" #include "frame.h" #include "dct.h" -#include "cabac.h" #include "quant.h" #include "cpu.h" #include "threadpool.h" @@ -291,17 +297,6 @@ return amvd0 + (amvd1<<8); } -static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max ) -{ - for( int i = 0; i < i_mvc; i++ ) - { - int mx = (mvc[i][0] + 2) >> 2; - int my = (mvc[i][1] + 2) >> 2; - dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max ); - dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max ); - } -} - extern const uint8_t x264_exp2_lut[64]; extern const float x264_log2_lut[128]; extern const float x264_log2_lz_lut[32]; @@ -614,11 +609,11 @@ /* Current MB DCT coeffs */ struct { - ALIGNED_16( dctcoef luma16x16_dc[3][16] ); + ALIGNED_N( dctcoef luma16x16_dc[3][16] ); ALIGNED_16( dctcoef chroma_dc[2][8] ); // FIXME share memory? - ALIGNED_16( dctcoef luma8x8[12][64] ); - ALIGNED_16( dctcoef luma4x4[16*3][16] ); + ALIGNED_N( dctcoef luma8x8[12][64] ); + ALIGNED_N( dctcoef luma4x4[16*3][16] ); } dct; /* MB table and cache for current frame/mb */ @@ -671,8 +666,7 @@ int mv_miny_spel_row[3]; int mv_maxy_spel_row[3]; /* Fullpel MV range for motion search */ - int mv_min_fpel[2]; - int mv_max_fpel[2]; + ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */ int mv_miny_fpel_row[3]; int mv_maxy_fpel_row[3]; @@ -758,7 +752,7 @@ #define FENC_STRIDE 16 #define FDEC_STRIDE 32 ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] ); - ALIGNED_16( pixel fdec_buf[52*FDEC_STRIDE] ); + ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] ); /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */ ALIGNED_16( pixel i4x4_fdec_buf[16*16] ); @@ -775,8 +769,8 @@ ALIGNED_16( dctcoef fenc_dct4[16][16] ); /* Psy RD SATD/SA8D scores cache */ - ALIGNED_16( uint64_t fenc_hadamard_cache[9] ); - ALIGNED_16( uint32_t fenc_satd_cache[32] ); + ALIGNED_N( uint64_t fenc_hadamard_cache[9] ); + ALIGNED_N( uint32_t fenc_satd_cache[32] ); /* pointer over mb of the frame to be compressed */ pixel *p_fenc[3]; /* y,u,v */ @@ -910,8 +904,8 @@ uint32_t (*nr_residual_sum)[64]; uint32_t *nr_count; - ALIGNED_16( udctcoef nr_offset_denoise[4][64] ); - ALIGNED_16( uint32_t nr_residual_sum_buf[2][4][64] ); + ALIGNED_N( udctcoef nr_offset_denoise[4][64] ); + ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] ); uint32_t nr_count_buf[2][4]; uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */ @@ -947,11 +941,48 @@ struct visualize_t *visualize; #endif x264_lookahead_t *lookahead; + +#if HAVE_OPENCL + x264_opencl_t opencl; +#endif }; // included at the end because it needs x264_t #include "macroblock.h" +static int ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) +{ + int cnt = 0; + for( int i = 0; i < i_mvc; i++ ) + { + int mx = (mvc[i][0] + 2) >> 2; + int my = 
(mvc[i][1] + 2) >> 2; + uint32_t mv = pack16to32_mask(mx, my); + if( !mv || mv == pmv ) continue; + dst[cnt][0] = x264_clip3( mx, mv_limit[0][0], mv_limit[1][0] ); + dst[cnt][1] = x264_clip3( my, mv_limit[0][1], mv_limit[1][1] ); + cnt++; + } + return cnt; +} + +static int ALWAYS_INLINE x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) +{ + int cnt = 0; + int qpel_limit[4] = {mv_limit[0][0] << 2, mv_limit[0][1] << 2, mv_limit[1][0] << 2, mv_limit[1][1] << 2}; + for( int i = 0; i < i_mvc; i++ ) + { + uint32_t mv = M32( mvc[i] ); + int mx = mvc[i][0]; + int my = mvc[i][1]; + if( !mv || mv == pmv ) continue; + dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] ); + dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] ); + cnt++; + } + return cnt; +} + #if ARCH_X86 || ARCH_X86_64 #include "x86/util.h" #endif
x264-snapshot-20130224-2245.tar.bz2/common/cpu.c -> x264-snapshot-20130723-2245.tar.bz2/common/cpu.c (changed)
@@ -47,18 +47,19 @@ const x264_cpu_name_t x264_cpu_names[] = { - {"Altivec", X264_CPU_ALTIVEC}, -// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore - {"MMX2", X264_CPU_MMX|X264_CPU_MMX2}, - {"MMXEXT", X264_CPU_MMX|X264_CPU_MMX2}, -// {"SSE", X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE}, // there are no sse1 functions in x264 -#define SSE2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE|X264_CPU_SSE2 +#if HAVE_MMX +// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore +// {"CMOV", X264_CPU_CMOV}, // we require this unconditionally, so don't print it +#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV + {"MMX2", MMX2}, + {"MMXEXT", MMX2}, + {"SSE", MMX2|X264_CPU_SSE}, +#define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2 {"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW}, {"SSE2", SSE2}, {"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST}, {"SSE3", SSE2|X264_CPU_SSE3}, {"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, - {"FastShuffle", SSE2|X264_CPU_SHUFFLE_IS_FAST}, {"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, {"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, {"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42}, @@ -70,19 +71,26 @@ {"FMA3", AVX|X264_CPU_FMA3}, #undef AVX #undef SSE2 +#undef MMX2 {"Cache32", X264_CPU_CACHELINE_32}, {"Cache64", X264_CPU_CACHELINE_64}, {"SSEMisalign", X264_CPU_SSE_MISALIGN}, {"LZCNT", X264_CPU_LZCNT}, {"BMI1", X264_CPU_BMI1}, {"BMI2", X264_CPU_BMI1|X264_CPU_BMI2}, - {"TBM", X264_CPU_TBM}, - {"Slow_mod4_stack", X264_CPU_STACK_MOD4}, - {"ARMv6", X264_CPU_ARMV6}, - {"NEON", X264_CPU_NEON}, - {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC}, {"SlowCTZ", X264_CPU_SLOW_CTZ}, {"SlowAtom", X264_CPU_SLOW_ATOM}, + {"SlowPshufb", X264_CPU_SLOW_PSHUFB}, + {"SlowPalignr", X264_CPU_SLOW_PALIGNR}, + {"SlowShuffle", X264_CPU_SLOW_SHUFFLE}, + {"UnalignedStack", X264_CPU_STACK_MOD4}, +#elif ARCH_PPC + {"Altivec", X264_CPU_ALTIVEC}, +#elif ARCH_ARM + {"ARMv6", X264_CPU_ARMV6}, + {"NEON", X264_CPU_NEON}, + {"FastNeonMRC", X264_CPU_FAST_NEON_MRC}, +#endif {"", 0}, }; @@ -131,9 +139,13 @@ if( edx&0x00800000 ) cpu |= X264_CPU_MMX; else - return 0; + return cpu; if( edx&0x02000000 ) cpu |= X264_CPU_MMX2|X264_CPU_SSE; + if( edx&0x00008000 ) + cpu |= X264_CPU_CMOV; + else + return cpu; if( edx&0x04000000 ) cpu |= X264_CPU_SSE2; if( ecx&0x00000001 ) @@ -170,46 +182,56 @@ if( cpu & X264_CPU_SSSE3 ) cpu |= X264_CPU_SSE2_IS_FAST; - if( cpu & X264_CPU_SSE4 ) - cpu |= X264_CPU_SHUFFLE_IS_FAST; x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx ); max_extended_cap = eax; - if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 ) + if( max_extended_cap >= 0x80000001 ) { - cpu |= X264_CPU_SLOW_CTZ; x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ); - if( edx&0x00400000 ) - cpu |= X264_CPU_MMX2; - if( cpu & X264_CPU_SSE2 ) + + if( ecx&0x00000020 ) + cpu |= X264_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */ + if( ecx&0x00000040 ) /* SSE4a, AMD only */ { - if( ecx&0x00000040 ) /* SSE4a */ + int family = ((eax>>8)&0xf) + ((eax>>20)&0xff); + cpu |= X264_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */ + if( family == 0x14 ) { - cpu |= X264_CPU_SSE2_IS_FAST; - cpu |= X264_CPU_LZCNT; - cpu |= X264_CPU_SHUFFLE_IS_FAST; - cpu &= ~X264_CPU_SLOW_CTZ; + cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... 
*/ + cpu |= X264_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */ + cpu |= X264_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */ } - else - cpu |= X264_CPU_SSE2_IS_SLOW; - - if( ecx&0x00000080 ) /* Misalign SSE */ + if( family == 0x16 ) { - cpu |= X264_CPU_SSE_MISALIGN; - x264_cpu_mask_misalign_sse(); + cpu |= X264_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough + * compared to alternate instruction sequences that this + * is equal or faster on almost all such functions. */ } + } - if( cpu & X264_CPU_AVX ) - { - if( ecx&0x00000800 ) /* XOP */ - cpu |= X264_CPU_XOP; - if( ecx&0x00010000 ) /* FMA4 */ - cpu |= X264_CPU_FMA4; - } + if( ecx&0x00000080 ) /* Misalign SSE */ + { + cpu |= X264_CPU_SSE_MISALIGN; + x264_cpu_mask_misalign_sse(); + } - if( ecx&0x00200000 ) - cpu |= X264_CPU_TBM; + if( cpu & X264_CPU_AVX ) + { + if( ecx&0x00000800 ) /* XOP */ + cpu |= X264_CPU_XOP; + if( ecx&0x00010000 ) /* FMA4 */ + cpu |= X264_CPU_FMA4; + } + + if( !strcmp((char*)vendor, "AuthenticAMD") ) + { + if( edx&0x00400000 ) + cpu |= X264_CPU_MMX2; + if( !(cpu&X264_CPU_LZCNT) ) + cpu |= X264_CPU_SLOW_CTZ; + if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) ) + cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ } } @@ -233,11 +255,12 @@ { cpu |= X264_CPU_SLOW_ATOM; cpu |= X264_CPU_SLOW_CTZ; + cpu |= X264_CPU_SLOW_PSHUFB; } - /* Some Penryns and Nehalems are pointlessly crippled (SSE4 disabled), so - * detect them here. */ - else if( model >= 23 ) - cpu |= X264_CPU_SHUFFLE_IS_FAST; + /* Conroe has a slow shuffle unit. Check the model number to make sure not + * to include crippled low-end Penryns and Nehalems that don't have SSE4. */ + else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 ) + cpu |= X264_CPU_SLOW_SHUFFLE; } }
x264-snapshot-20130224-2245.tar.bz2/common/cpu.h -> x264-snapshot-20130723-2245.tar.bz2/common/cpu.h (changed)

@@ -48,15 +48,17 @@
 void x264_cpu_mask_misalign_sse( void );
 void x264_safe_intel_cpu_indicator_init( void );
 
-/* kluge:
+/* kludge:
  * gcc can't give variables any greater alignment than the stack frame has.
- * We need 16 byte alignment for SSE2, so here we make sure that the stack is
- * aligned to 16 bytes.
+ * We need 32 byte alignment for AVX2, so here we make sure that the stack is
+ * aligned to 32 bytes.
  * gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this
  * problem, but I don't want to require such a new version.
- * This applies only to x86_32, since other architectures that need alignment
- * either have ABIs that ensure aligned stack, or don't support it at all. */
-#if ARCH_X86 && HAVE_MMX
+ * aligning to 32 bytes only works if the compiler supports keeping that
+ * alignment between functions (osdep.h handles manual alignment of arrays
+ * if it doesn't).
+ */
+#if (ARCH_X86 || HAVE_32B_STACK_ALIGNMENT) && HAVE_MMX
 int x264_stack_align( void (*func)(), ... );
 #define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
 #else
x264-snapshot-20130224-2245.tar.bz2/common/dct.c -> x264-snapshot-20130723-2245.tar.bz2/common/dct.c (changed)
@@ -640,23 +640,32 @@ dctf->add8x8_idct8 = x264_add8x8_idct8_sse2; dctf->add16x16_idct8= x264_add16x16_idct8_sse2; - dctf->sub8x8_dct = x264_sub8x8_dct_sse2; - dctf->sub16x16_dct = x264_sub16x16_dct_sse2; - dctf->add8x8_idct = x264_add8x8_idct_sse2; - dctf->add16x16_idct = x264_add16x16_idct_sse2; - dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2; + if( !(cpu&X264_CPU_SSE2_IS_SLOW) ) + { + dctf->sub8x8_dct = x264_sub8x8_dct_sse2; + dctf->sub16x16_dct = x264_sub16x16_dct_sse2; + dctf->add8x8_idct = x264_add8x8_idct_sse2; + dctf->add16x16_idct = x264_add16x16_idct_sse2; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2; + } } - if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) ) + if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) { - dctf->sub4x4_dct = x264_sub4x4_dct_ssse3; - dctf->sub8x8_dct = x264_sub8x8_dct_ssse3; - dctf->sub16x16_dct = x264_sub16x16_dct_ssse3; - dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3; - dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3; dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3; - dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3; - dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3; + if( !(cpu&X264_CPU_SLOW_ATOM) ) + { + dctf->sub4x4_dct = x264_sub4x4_dct_ssse3; + dctf->sub8x8_dct = x264_sub8x8_dct_ssse3; + dctf->sub16x16_dct = x264_sub16x16_dct_ssse3; + dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3; + dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3; + if( !(cpu&X264_CPU_SLOW_PSHUFB) ) + { + dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3; + } + } } if( cpu&X264_CPU_SSE4 ) @@ -681,6 +690,18 @@ dctf->sub8x8_dct = x264_sub8x8_dct_xop; dctf->sub16x16_dct = x264_sub16x16_dct_xop; } + + if( cpu&X264_CPU_AVX2 ) + { + dctf->add8x8_idct = x264_add8x8_idct_avx2; + dctf->add16x16_idct = x264_add16x16_idct_avx2; + dctf->sub8x8_dct = x264_sub8x8_dct_avx2; + dctf->sub16x16_dct = x264_sub16x16_dct_avx2; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2; +#if ARCH_X86_64 + dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2; +#endif + } #endif //HAVE_MMX #if HAVE_ALTIVEC @@ -951,7 +972,7 @@ pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3; pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3; - if( cpu&X264_CPU_SHUFFLE_IS_FAST ) + if( !(cpu&X264_CPU_SLOW_SHUFFLE) ) pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3; } if( cpu&X264_CPU_AVX ) @@ -962,8 +983,7 @@ pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx; pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx; #endif - if( cpu&X264_CPU_SHUFFLE_IS_FAST ) - pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; } if( cpu&X264_CPU_XOP ) { @@ -1005,7 +1025,7 @@ pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx; } - if( cpu&X264_CPU_SHUFFLE_IS_FAST ) + if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) ) { pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; @@ -1016,6 +1036,12 @@ pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; } + + if( cpu&X264_CPU_AVX2 ) + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2; + } #endif // HIGH_BIT_DEPTH #endif }
x264-snapshot-20130224-2245.tar.bz2/common/deblock.c -> x264-snapshot-20130723-2245.tar.bz2/common/deblock.c (changed)

@@ -686,6 +686,9 @@
 void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
+void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                  int mvy_limit, int bframe );
 
 void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
@@ -816,6 +819,10 @@
 #endif
         }
     }
+    if( cpu&X264_CPU_AVX2 )
+    {
+        pf->deblock_strength = x264_deblock_strength_avx2;
+    }
 }
 
 #endif
x264-snapshot-20130224-2245.tar.bz2/common/display-x11.c -> x264-snapshot-20130723-2245.tar.bz2/common/display-x11.c (changed)

@@ -49,7 +49,7 @@
         abort();
 }
 
-static void disp_init_display()
+static void disp_init_display( void )
 {
     Visual *visual;
     int dpy_class;
@@ -130,7 +130,7 @@
         XFree( shint );
 }
 
-void disp_sync()
+void disp_sync( void )
 {
     XSync( disp_display, 1 );
 }
x264-snapshot-20130224-2245.tar.bz2/common/frame.c -> x264-snapshot-20130723-2245.tar.bz2/common/frame.c (changed)
@@ -72,8 +72,18 @@ int i_mb_count = h->mb.i_mb_count; int i_stride, i_width, i_lines, luma_plane_count; int i_padv = PADV << PARAM_INTERLACED; - int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16; - int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10; + int align = 16; +#if ARCH_X86 || ARCH_X86_64 + if( h->param.cpu&X264_CPU_CACHELINE_64 ) + align = 64; + else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX2 ) + align = 32; +#endif +#if ARCH_PPC + int disalign = 1<<9; +#else + int disalign = 1<<10; +#endif CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) ); @@ -251,6 +261,10 @@ if( x264_pthread_cond_init( &frame->cv, NULL ) ) goto fail; +#if HAVE_OPENCL + frame->opencl.ocl = h->opencl.ocl; +#endif + return frame; fail: @@ -312,6 +326,9 @@ } x264_pthread_mutex_destroy( &frame->mutex ); x264_pthread_cond_destroy( &frame->cv ); +#if HAVE_OPENCL + x264_opencl_frame_delete( frame ); +#endif } x264_free( frame ); } @@ -655,6 +672,21 @@ x264_pthread_mutex_unlock( &h->mutex ); } +int x264_frame_new_slice( x264_t *h, x264_frame_t *frame ) +{ + if( h->param.i_slice_count_max ) + { + int slice_count; + if( h->param.b_sliced_threads ) + slice_count = x264_pthread_fetch_and_add( &frame->i_slice_count, 1, &frame->mutex ); + else + slice_count = frame->i_slice_count++; + if( slice_count >= h->param.i_slice_count_max ) + return -1; + } + return 0; +} + /* list operators */ void x264_frame_push( x264_frame_t **list, x264_frame_t *frame ) @@ -717,6 +749,7 @@ frame->b_scenecut = 1; frame->b_keyframe = 0; frame->b_corrupt = 0; + frame->i_slice_count = h->param.b_sliced_threads ? h->param.i_threads : 1; memset( frame->weight, 0, sizeof(frame->weight) ); memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
x264-snapshot-20130224-2245.tar.bz2/common/frame.h -> x264-snapshot-20130723-2245.tar.bz2/common/frame.h (changed)

@@ -152,6 +152,7 @@
     int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
     x264_pthread_mutex_t mutex;
     x264_pthread_cond_t cv;
+    int i_slice_count; /* Atomically written to/read from with slice threads */
 
     /* periodic intra refresh */
     float f_pir_position;
@@ -171,6 +172,10 @@
     /* user frame properties */
     uint8_t *mb_info;
     void (*mb_info_free)( void* );
+
+#if HAVE_OPENCL
+    x264_frame_opencl_t opencl;
+#endif
 } x264_frame_t;
 
 /* synchronized frame list */
@@ -230,6 +235,7 @@
 
 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed );
 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed );
+int x264_frame_new_slice( x264_t *h, x264_frame_t *frame );
 
 void x264_threadslice_cond_broadcast( x264_t *h, int pass );
 void x264_threadslice_cond_wait( x264_t *h, int pass );
x264-snapshot-20130224-2245.tar.bz2/common/macroblock.c -> x264-snapshot-20130723-2245.tar.bz2/common/macroblock.c (changed)

@@ -122,8 +122,8 @@
         int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
         int i_mode = x264_size2pixel[height][width];
         intptr_t i_stride0 = 16, i_stride1 = 16;
-        ALIGNED_ARRAY_16( pixel, tmp0,[16*16] );
-        ALIGNED_ARRAY_16( pixel, tmp1,[16*16] );
+        ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
+        ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
         pixel *src0, *src1;
 
         MC_LUMA_BI( 0 );
@@ -387,7 +387,7 @@
     int scratch_size = 0;
     if( !b_lookahead )
     {
-        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
+        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48+32) * sizeof(int16_t);
         int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
         int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
         int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
x264-snapshot-20130224-2245.tar.bz2/common/mc.c -> x264-snapshot-20130723-2245.tar.bz2/common/mc.c (changed)

@@ -469,7 +469,7 @@
     }
 }
 
-void x264_mc_init( int cpu, x264_mc_functions_t *pf )
+void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
 {
     pf->mc_luma = mc_luma;
     pf->get_ref = get_ref;
@@ -534,6 +534,9 @@
 #if HAVE_ARMV6
     x264_mc_init_arm( cpu, pf );
 #endif
+
+    if( cpu_independent )
+        pf->mbtree_propagate_cost = mbtree_propagate_cost;
 }
 
 void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
x264-snapshot-20130224-2245.tar.bz2/common/mc.h -> x264-snapshot-20130723-2245.tar.bz2/common/mc.h (changed)

@@ -123,6 +123,6 @@
                                     uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 } x264_mc_functions_t;
 
-void x264_mc_init( int cpu, x264_mc_functions_t *pf );
+void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
 
 #endif
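Usage sketch for the widened x264_mc_init() signature (the caller below is illustrative): per the mc.c hunk above, a non-zero cpu_independent keeps mbtree_propagate_cost at the plain C implementation so lookahead cost propagation does not vary with the detected SIMD level.

#include "common/common.h"   /* x264 tree include: x264_mc_functions_t */

/* Fill the motion-compensation dispatch table; cpu_independent != 0 forces
 * the C mbtree_propagate_cost. */
static void setup_mc( int cpu, x264_mc_functions_t *mc, int cpu_independent )
{
    x264_mc_init( cpu, mc, cpu_independent );
}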
x264-snapshot-20130723-2245.tar.bz2/common/opencl (added: new directory)
x264-snapshot-20130723-2245.tar.bz2/common/opencl.c (added)
@@ -0,0 +1,718 @@ +/***************************************************************************** + * opencl.c: OpenCL initialization and kernel compilation + ***************************************************************************** + * Copyright (C) 2012-2013 x264 project + * + * Authors: Steve Borho <sborho@multicorewareinc.com> + * Anton Mitrofanov <BugMaster@narod.ru> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common.h" + +#ifdef _WIN32 +#include <windows.h> +#define ocl_open LoadLibrary( "OpenCL" ) +#define ocl_close FreeLibrary +#define ocl_address GetProcAddress +#else +#include <dlfcn.h> //dlopen, dlsym, dlclose +#if SYS_MACOSX +#define ocl_open dlopen( "/System/Library/Frameworks/OpenCL.framework/OpenCL", RTLD_NOW ) +#else +#define ocl_open dlopen( "libOpenCL.so", RTLD_NOW ) +#endif +#define ocl_close dlclose +#define ocl_address dlsym +#endif + +#define LOAD_OCL_FUNC(name, continue_on_fail)\ +{\ + ocl->name = (void*)ocl_address( ocl->library, #name );\ + if( !continue_on_fail && !ocl->name )\ + goto fail;\ +} + +/* load the library and functions we require from it */ +x264_opencl_function_t *x264_opencl_load_library( void ) +{ + x264_opencl_function_t *ocl; +#undef fail +#define fail fail0 + CHECKED_MALLOCZERO( ocl, sizeof(x264_opencl_function_t) ); +#undef fail +#define fail fail1 + ocl->library = ocl_open; + if( !ocl->library ) + goto fail; +#undef fail +#define fail fail2 + LOAD_OCL_FUNC( clBuildProgram, 0 ); + LOAD_OCL_FUNC( clCreateBuffer, 0 ); + LOAD_OCL_FUNC( clCreateCommandQueue, 0 ); + LOAD_OCL_FUNC( clCreateContext, 0 ); + LOAD_OCL_FUNC( clCreateImage2D, 0 ); + LOAD_OCL_FUNC( clCreateKernel, 0 ); + LOAD_OCL_FUNC( clCreateProgramWithBinary, 0 ); + LOAD_OCL_FUNC( clCreateProgramWithSource, 0 ); + LOAD_OCL_FUNC( clEnqueueCopyBuffer, 0 ); + LOAD_OCL_FUNC( clEnqueueMapBuffer, 0 ); + LOAD_OCL_FUNC( clEnqueueNDRangeKernel, 0 ); + LOAD_OCL_FUNC( clEnqueueReadBuffer, 0 ); + LOAD_OCL_FUNC( clEnqueueWriteBuffer, 0 ); + LOAD_OCL_FUNC( clFinish, 0 ); + LOAD_OCL_FUNC( clGetCommandQueueInfo, 0 ); + LOAD_OCL_FUNC( clGetDeviceIDs, 0 ); + LOAD_OCL_FUNC( clGetDeviceInfo, 0 ); + LOAD_OCL_FUNC( clGetKernelWorkGroupInfo, 0 ); + LOAD_OCL_FUNC( clGetPlatformIDs, 0 ); + LOAD_OCL_FUNC( clGetProgramBuildInfo, 0 ); + LOAD_OCL_FUNC( clGetProgramInfo, 0 ); + LOAD_OCL_FUNC( clGetSupportedImageFormats, 0 ); + LOAD_OCL_FUNC( clReleaseCommandQueue, 0 ); + LOAD_OCL_FUNC( clReleaseContext, 0 ); + LOAD_OCL_FUNC( clReleaseKernel, 0 ); + LOAD_OCL_FUNC( clReleaseMemObject, 0 ); + LOAD_OCL_FUNC( clReleaseProgram, 0 ); + LOAD_OCL_FUNC( clSetKernelArg, 0 ); + return ocl; +#undef fail +fail2: + ocl_close( 
ocl->library ); +fail1: + x264_free( ocl ); +fail0: + return NULL; +} + +void x264_opencl_close_library( x264_opencl_function_t *ocl ) +{ + if( !ocl ) + return; + ocl_close( ocl->library ); + x264_free( ocl ); +} + +/* define from recent cl_ext.h, copied here in case headers are old */ +#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042 + +/* Requires full include path in case of out-of-tree builds */ +#include "common/oclobj.h" + +static int x264_detect_switchable_graphics( void ); + +/* Try to load the cached compiled program binary, verify the device context is + * still valid before reuse */ +static cl_program x264_opencl_cache_load( x264_t *h, char *dev_name, char *dev_vendor, char *driver_version ) +{ + /* try to load cached program binary */ + FILE *fp = fopen( h->param.psz_clbin_file, "rb" ); + if( !fp ) + return NULL; + + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_program program = NULL; + uint8_t *binary = NULL; + + fseek( fp, 0, SEEK_END ); + size_t size = ftell( fp ); + rewind( fp ); + CHECKED_MALLOC( binary, size ); + + fread( binary, 1, size, fp ); + const uint8_t *ptr = (const uint8_t*)binary; + +#define CHECK_STRING( STR )\ + do {\ + size_t len = strlen( STR );\ + if( size <= len || strncmp( (char*)ptr, STR, len ) )\ + goto fail;\ + else {\ + size -= (len+1); ptr += (len+1);\ + }\ + } while( 0 ) + + CHECK_STRING( dev_name ); + CHECK_STRING( dev_vendor ); + CHECK_STRING( driver_version ); + CHECK_STRING( x264_opencl_source_hash ); +#undef CHECK_STRING + + cl_int status; + program = ocl->clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status ); + if( status != CL_SUCCESS ) + program = NULL; + +fail: + fclose( fp ); + x264_free( binary ); + return program; +} + +/* Save the compiled program binary to a file for later reuse. Device context + * is also saved in the cache file so we do not reuse stale binaries */ +static void x264_opencl_cache_save( x264_t *h, cl_program program, char *dev_name, char *dev_vendor, char *driver_version ) +{ + FILE *fp = fopen( h->param.psz_clbin_file, "wb" ); + if( !fp ) + { + x264_log( h, X264_LOG_INFO, "OpenCL: unable to open clbin file for write\n" ); + return; + } + + x264_opencl_function_t *ocl = h->opencl.ocl; + uint8_t *binary = NULL; + + size_t size = 0; + cl_int status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL ); + if( status != CL_SUCCESS || !size ) + { + x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary size, no cache file generated\n" ); + goto fail; + } + + CHECKED_MALLOC( binary, size ); + status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &binary, NULL ); + if( status != CL_SUCCESS ) + { + x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary, no cache file generated\n" ); + goto fail; + } + + fputs( dev_name, fp ); + fputc( '\n', fp ); + fputs( dev_vendor, fp ); + fputc( '\n', fp ); + fputs( driver_version, fp ); + fputc( '\n', fp ); + fputs( x264_opencl_source_hash, fp ); + fputc( '\n', fp ); + fwrite( binary, 1, size, fp ); + +fail: + fclose( fp ); + x264_free( binary ); + return; +} + +/* The OpenCL source under common/opencl will be merged into common/oclobj.h by + * the Makefile. It defines a x264_opencl_source byte array which we will pass + * to clCreateProgramWithSource(). We also attempt to use a cache file for the + * compiled binary, stored in the current working folder. 
*/ +static cl_program x264_opencl_compile( x264_t *h ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_program program = NULL; + char *build_log = NULL; + + char dev_name[64]; + char dev_vendor[64]; + char driver_version[64]; + cl_int status; + status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL ); + status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR, sizeof(dev_vendor), dev_vendor, NULL ); + status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driver_version), driver_version, NULL ); + if( status != CL_SUCCESS ) + return NULL; + + // Most AMD GPUs have vector registers + int vectorize = !strcmp( dev_vendor, "Advanced Micro Devices, Inc." ); + h->opencl.b_device_AMD_SI = 0; + + if( vectorize ) + { + /* Disable OpenCL on Intel/AMD switchable graphics devices */ + if( x264_detect_switchable_graphics() ) + { + x264_log( h, X264_LOG_INFO, "OpenCL acceleration disabled, switchable graphics detected\n" ); + return NULL; + } + + /* Detect AMD SouthernIsland or newer device (single-width registers) */ + cl_uint simdwidth = 4; + status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, sizeof(cl_uint), &simdwidth, NULL ); + if( status == CL_SUCCESS && simdwidth == 1 ) + { + vectorize = 0; + h->opencl.b_device_AMD_SI = 1; + } + } + + x264_log( h, X264_LOG_INFO, "OpenCL acceleration enabled with %s %s %s\n", dev_vendor, dev_name, h->opencl.b_device_AMD_SI ? "(SI)" : "" ); + + program = x264_opencl_cache_load( h, dev_name, dev_vendor, driver_version ); + if( !program ) + { + /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */ + x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" ); + const char *strptr = (const char*)x264_opencl_source; + size_t size = sizeof(x264_opencl_source); + program = ocl->clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status ); + if( status != CL_SUCCESS || !program ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: unable to create program\n" ); + return NULL; + } + } + + /* Build the program binary for the OpenCL device */ + const char *buildopts = vectorize ? "-DVECTORIZE=1" : ""; + status = ocl->clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL ); + if( status == CL_SUCCESS ) + { + x264_opencl_cache_save( h, program, dev_name, dev_vendor, driver_version ); + return program; + } + + /* Compile failure, should not happen with production code. 
*/ + + size_t build_log_len = 0; + status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_len ); + if( status != CL_SUCCESS || !build_log_len ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to query build log\n" ); + goto fail; + } + + build_log = x264_malloc( build_log_len ); + if( !build_log ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to alloc build log\n" ); + goto fail; + } + + status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL ); + if( status != CL_SUCCESS ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to get build log\n" ); + goto fail; + } + + FILE *log_file = fopen( "x264_kernel_build_log.txt", "w" ); + if( !log_file ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to create file x264_kernel_build_log.txt\n" ); + goto fail; + } + fwrite( build_log, 1, build_log_len, log_file ); + fclose( log_file ); + x264_log( h, X264_LOG_WARNING, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" ); + +fail: + x264_free( build_log ); + if( program ) + ocl->clReleaseProgram( program ); + return NULL; +} + +static int x264_opencl_lookahead_alloc( x264_t *h ) +{ + if( !h->param.rc.i_lookahead ) + return -1; + + static const char *kernelnames[] = { + "mb_intra_cost_satd_8x8", + "sum_intra_cost", + "downscale_hpel", + "downscale1", + "downscale2", + "memset_int16", + "weightp_scaled_images", + "weightp_hpel", + "hierarchical_motion", + "subpel_refine", + "mode_selection", + "sum_inter_cost" + }; + + cl_kernel *kernels[] = { + &h->opencl.intra_kernel, + &h->opencl.rowsum_intra_kernel, + &h->opencl.downscale_hpel_kernel, + &h->opencl.downscale_kernel1, + &h->opencl.downscale_kernel2, + &h->opencl.memset_kernel, + &h->opencl.weightp_scaled_images_kernel, + &h->opencl.weightp_hpel_kernel, + &h->opencl.hme_kernel, + &h->opencl.subpel_refine_kernel, + &h->opencl.mode_select_kernel, + &h->opencl.rowsum_inter_kernel + }; + + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_int status; + + h->opencl.lookahead_program = x264_opencl_compile( h ); + if( !h->opencl.lookahead_program ) + goto fail; + + for( int i = 0; i < ARRAY_SIZE(kernelnames); i++ ) + { + *kernels[i] = ocl->clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status ); + if( status != CL_SUCCESS ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to compile kernel '%s' (%d)\n", kernelnames[i], status ); + goto fail; + } + } + + h->opencl.page_locked_buffer = ocl->clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status ); + if( status != CL_SUCCESS ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate page-locked buffer, error '%d'\n", status ); + goto fail; + } + h->opencl.page_locked_ptr = ocl->clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, + 0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status ); + if( status != CL_SUCCESS ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to map page-locked buffer, error '%d'\n", status ); + goto fail; + } + + return 0; +fail: + x264_opencl_lookahead_delete( h ); + return -1; +} + +static void CL_CALLBACK x264_opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data ) +{ + /* Any error notification can be assumed to be fatal to the OpenCL context. 
+ * We need to stop using it immediately to prevent further damage. */ + x264_t *h = (x264_t*)user_data; + h->param.b_opencl = 0; + h->opencl.b_fatal_error = 1; + x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo ); + x264_log( h, X264_LOG_ERROR, "OpenCL: fatal error, aborting encode\n" ); +} + +int x264_opencl_lookahead_init( x264_t *h ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_platform_id *platforms = NULL; + cl_device_id *devices = NULL; + cl_image_format *imageType = NULL; + cl_context context = NULL; + int ret = -1; + + cl_uint numPlatforms = 0; + cl_int status = ocl->clGetPlatformIDs( 0, NULL, &numPlatforms ); + if( status != CL_SUCCESS || !numPlatforms ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" ); + goto fail; + } + platforms = (cl_platform_id*)x264_malloc( sizeof(cl_platform_id) * numPlatforms ); + if( !platforms ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: malloc of installed platforms buffer failed\n" ); + goto fail; + } + status = ocl->clGetPlatformIDs( numPlatforms, platforms, NULL ); + if( status != CL_SUCCESS ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" ); + goto fail; + } + + /* Select the first OpenCL platform with a GPU device that supports our + * required image (texture) formats */ + for( cl_uint i = 0; i < numPlatforms; i++ ) + { + cl_uint gpu_count = 0; + status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count ); + if( status != CL_SUCCESS || !gpu_count ) + continue; + + x264_free( devices ); + devices = x264_malloc( sizeof(cl_device_id) * gpu_count ); + if( !devices ) + continue; + + status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, gpu_count, devices, NULL ); + if( status != CL_SUCCESS ) + continue; + + /* Find a GPU device that supports our image formats */ + for( cl_uint gpu = 0; gpu < gpu_count; gpu++ ) + { + h->opencl.device = devices[gpu]; + + /* if the user has specified an exact device ID, skip all other + * GPUs. If this device matches, allow it to continue through the + * checks for supported images, etc. 
*/ + if( h->param.opencl_device_id && devices[gpu] != (cl_device_id)h->param.opencl_device_id ) + continue; + + cl_bool image_support = 0; + status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL ); + if( status != CL_SUCCESS || !image_support ) + continue; + + if( context ) + ocl->clReleaseContext( context ); + context = ocl->clCreateContext( NULL, 1, &h->opencl.device, (void*)x264_opencl_error_notify, (void*)h, &status ); + if( status != CL_SUCCESS || !context ) + continue; + + cl_uint imagecount = 0; + status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 0, NULL, &imagecount ); + if( status != CL_SUCCESS || !imagecount ) + continue; + + x264_free( imageType ); + imageType = x264_malloc( sizeof(cl_image_format) * imagecount ); + if( !imageType ) + continue; + + status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, imagecount, imageType, NULL ); + if( status != CL_SUCCESS ) + continue; + + int b_has_r = 0; + int b_has_rgba = 0; + for( cl_uint j = 0; j < imagecount; j++ ) + { + if( imageType[j].image_channel_order == CL_R && + imageType[j].image_channel_data_type == CL_UNSIGNED_INT32 ) + b_has_r = 1; + else if( imageType[j].image_channel_order == CL_RGBA && + imageType[j].image_channel_data_type == CL_UNSIGNED_INT8 ) + b_has_rgba = 1; + } + if( !b_has_r || !b_has_rgba ) + { + char dev_name[64]; + status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL ); + if( status == CL_SUCCESS ) + { + /* emit warning if we are discarding the user's explicit choice */ + int level = h->param.opencl_device_id ? X264_LOG_WARNING : X264_LOG_DEBUG; + x264_log( h, level, "OpenCL: %s does not support required image formats\n", dev_name ); + } + continue; + } + + /* user selection of GPU device, skip N first matches */ + if( h->param.i_opencl_device ) + { + h->param.i_opencl_device--; + continue; + } + + h->opencl.queue = ocl->clCreateCommandQueue( context, h->opencl.device, 0, &status ); + if( status != CL_SUCCESS || !h->opencl.queue ) + continue; + + h->opencl.context = context; + context = NULL; + + ret = 0; + break; + } + + if( !ret ) + break; + } + + if( !h->param.psz_clbin_file ) + h->param.psz_clbin_file = "x264_lookahead.clbin"; + + if( ret ) + x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to find a compatible device\n" ); + else + ret = x264_opencl_lookahead_alloc( h ); + +fail: + if( context ) + ocl->clReleaseContext( context ); + x264_free( imageType ); + x264_free( devices ); + x264_free( platforms ); + return ret; +} + +static void x264_opencl_lookahead_free( x264_t *h ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + +#define RELEASE( a, f ) do { if( a ) { ocl->f( a ); a = NULL; } } while( 0 ) + RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel ); + RELEASE( h->opencl.downscale_kernel1, clReleaseKernel ); + RELEASE( h->opencl.downscale_kernel2, clReleaseKernel ); + RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel ); + RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel ); + RELEASE( h->opencl.memset_kernel, clReleaseKernel ); + RELEASE( h->opencl.intra_kernel, clReleaseKernel ); + RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel ); + RELEASE( h->opencl.hme_kernel, clReleaseKernel ); + RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel ); + RELEASE( h->opencl.mode_select_kernel, clReleaseKernel ); + RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel ); + + RELEASE( 
h->opencl.lookahead_program, clReleaseProgram ); + + RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject ); + RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject ); + RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject ); + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject ); + RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject ); + RELEASE( h->opencl.row_satds[0], clReleaseMemObject ); + RELEASE( h->opencl.row_satds[1], clReleaseMemObject ); + RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject ); + RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject ); + RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject ); + RELEASE( h->opencl.mvp_buffer, clReleaseMemObject ); + RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject ); + RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject ); + RELEASE( h->opencl.frame_stats[0], clReleaseMemObject ); + RELEASE( h->opencl.frame_stats[1], clReleaseMemObject ); +#undef RELEASE +} + +void x264_opencl_lookahead_delete( x264_t *h ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + + if( !ocl ) + return; + + if( h->opencl.queue ) + ocl->clFinish( h->opencl.queue ); + + x264_opencl_lookahead_free( h ); + + if( h->opencl.queue ) + { + ocl->clReleaseCommandQueue( h->opencl.queue ); + h->opencl.queue = NULL; + } + if( h->opencl.context ) + { + ocl->clReleaseContext( h->opencl.context ); + h->opencl.context = NULL; + } +} + +void x264_opencl_frame_delete( x264_frame_t *frame ) +{ + x264_opencl_function_t *ocl = frame->opencl.ocl; + + if( !ocl ) + return; + +#define RELEASEBUF(mem) do { if( mem ) { ocl->clReleaseMemObject( mem ); mem = NULL; } } while( 0 ) + for( int j = 0; j < NUM_IMAGE_SCALES; j++ ) + RELEASEBUF( frame->opencl.scaled_image2Ds[j] ); + RELEASEBUF( frame->opencl.luma_hpel ); + RELEASEBUF( frame->opencl.inv_qscale_factor ); + RELEASEBUF( frame->opencl.intra_cost ); + RELEASEBUF( frame->opencl.lowres_mvs0 ); + RELEASEBUF( frame->opencl.lowres_mvs1 ); + RELEASEBUF( frame->opencl.lowres_mv_costs0 ); + RELEASEBUF( frame->opencl.lowres_mv_costs1 ); +#undef RELEASEBUF +} + +/* OpenCL misbehaves on hybrid laptops with Intel iGPU and AMD dGPU, so + * we consult AMD's ADL interface to detect this situation and disable + * OpenCL on these machines (Linux and Windows) */ +#ifdef _WIN32 +#define ADL_API_CALL +#define ADL_CALLBACK __stdcall +#define adl_close FreeLibrary +#define adl_address GetProcAddress +#else +#define ADL_API_CALL +#define ADL_CALLBACK +#define adl_close dlclose +#define adl_address dlsym +#endif + +typedef void* ( ADL_CALLBACK *ADL_MAIN_MALLOC_CALLBACK )( int ); +typedef int ( ADL_API_CALL *ADL_MAIN_CONTROL_CREATE )( ADL_MAIN_MALLOC_CALLBACK, int ); +typedef int ( ADL_API_CALL *ADL_ADAPTER_NUMBEROFADAPTERS_GET )( int * ); +typedef int ( ADL_API_CALL *ADL_POWERXPRESS_SCHEME_GET )( int, int *, int *, int * ); +typedef int ( ADL_API_CALL *ADL_MAIN_CONTROL_DESTROY )( void ); + +#define ADL_OK 0 +#define ADL_PX_SCHEME_DYNAMIC 2 + +static void* ADL_CALLBACK adl_malloc_wrapper( int iSize ) +{ + return x264_malloc( iSize ); +} + +static int x264_detect_switchable_graphics( void ) +{ + void *hDLL; + ADL_MAIN_CONTROL_CREATE ADL_Main_Control_Create; + ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get; + ADL_POWERXPRESS_SCHEME_GET ADL_PowerXpress_Scheme_Get; + ADL_MAIN_CONTROL_DESTROY ADL_Main_Control_Destroy; + int ret = 0; + +#ifdef _WIN32 + hDLL = LoadLibrary( "atiadlxx.dll" ); + if( !hDLL ) + hDLL = LoadLibrary( "atiadlxy.dll" ); +#else + 
hDLL = dlopen( "libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL ); +#endif + if( !hDLL ) + goto fail0; + + ADL_Main_Control_Create = (ADL_MAIN_CONTROL_CREATE)adl_address(hDLL, "ADL_Main_Control_Create"); + ADL_Main_Control_Destroy = (ADL_MAIN_CONTROL_DESTROY)adl_address(hDLL, "ADL_Main_Control_Destroy"); + ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET)adl_address(hDLL, "ADL_Adapter_NumberOfAdapters_Get"); + ADL_PowerXpress_Scheme_Get = (ADL_POWERXPRESS_SCHEME_GET)adl_address(hDLL, "ADL_PowerXpress_Scheme_Get"); + if( !ADL_Main_Control_Destroy || !ADL_Main_Control_Destroy || !ADL_Adapter_NumberOfAdapters_Get || + !ADL_PowerXpress_Scheme_Get ) + goto fail1; + + if( ADL_OK != ADL_Main_Control_Create( adl_malloc_wrapper, 1 ) ) + goto fail1; + + int numAdapters = 0; + if( ADL_OK != ADL_Adapter_NumberOfAdapters_Get( &numAdapters ) ) + goto fail2; + + for( int i = 0; i < numAdapters; i++ ) + { + int PXSchemeRange, PXSchemeCurrentState, PXSchemeDefaultState; + if( ADL_OK != ADL_PowerXpress_Scheme_Get( i, &PXSchemeRange, &PXSchemeCurrentState, &PXSchemeDefaultState) ) + break; + + if( PXSchemeRange >= ADL_PX_SCHEME_DYNAMIC ) + { + ret = 1; + break; + } + } + +fail2: + ADL_Main_Control_Destroy(); +fail1: + adl_close( hDLL ); +fail0: + return ret; +}
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl.h
Added
@@ -0,0 +1,804 @@ +/***************************************************************************** + * opencl.h: OpenCL structures and defines + ***************************************************************************** + * Copyright (C) 2012-2013 x264 project + * + * Authors: Steve Borho <sborho@multicorewareinc.com> + * Anton Mitrofanov <BugMaster@narod.ru> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_OPENCL_H +#define X264_OPENCL_H + +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS +#include "extras/cl.h" + +#define OCL_API(ret, attr, name) typedef ret (attr *name##_func) + +/* Platform API */ +OCL_API(cl_int, CL_API_CALL, clGetPlatformIDs) +( cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + +OCL_API(cl_int, CL_API_CALL, clGetPlatformInfo) +( cl_platform_id /* platform */, + cl_platform_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Device APIs */ +OCL_API(cl_int, CL_API_CALL, clGetDeviceIDs) +( cl_platform_id /* platform */, + cl_device_type /* device_type */, + cl_uint /* num_entries */, + cl_device_id * /* devices */, + cl_uint * /* num_devices */); + +OCL_API(cl_int, CL_API_CALL, clGetDeviceInfo) +( cl_device_id /* device */, + cl_device_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_int, CL_API_CALL, clCreateSubDevices) +( cl_device_id /* in_device */, + const cl_device_partition_property * /* properties */, + cl_uint /* num_devices */, + cl_device_id * /* out_devices */, + cl_uint * /* num_devices_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainDevice) +( cl_device_id /* device */); + +OCL_API(cl_int, CL_API_CALL, clReleaseDevice) +( cl_device_id /* device */); + +/* Context APIs */ +OCL_API(cl_context, CL_API_CALL, clCreateContext) +( const cl_context_properties * /* properties */, + cl_uint /* num_devices */, + const cl_device_id * /* devices */, + void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */); + +OCL_API(cl_context, CL_API_CALL, clCreateContextFromType) +( const cl_context_properties * /* properties */, + cl_device_type /* device_type */, + void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainContext) +( cl_context /* context */); + +OCL_API(cl_int, CL_API_CALL, clReleaseContext) +( cl_context /* context */); + +OCL_API(cl_int, CL_API_CALL, 
clGetContextInfo) +( cl_context /* context */, + cl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Command Queue APIs */ +OCL_API(cl_command_queue, CL_API_CALL, clCreateCommandQueue) +( cl_context /* context */, + cl_device_id /* device */, + cl_command_queue_properties /* properties */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainCommandQueue) +( cl_command_queue /* command_queue */); + +OCL_API(cl_int, CL_API_CALL, clReleaseCommandQueue) +( cl_command_queue /* command_queue */); + +OCL_API(cl_int, CL_API_CALL, clGetCommandQueueInfo) +( cl_command_queue /* command_queue */, + cl_command_queue_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Memory Object APIs */ +OCL_API(cl_mem, CL_API_CALL, clCreateBuffer) +( cl_context /* context */, + cl_mem_flags /* flags */, + size_t /* size */, + void * /* host_ptr */, + cl_int * /* errcode_ret */); + +OCL_API(cl_mem, CL_API_CALL, clCreateSubBuffer) +( cl_mem /* buffer */, + cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */); + +OCL_API(cl_mem, CL_API_CALL, clCreateImage) +( cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + const cl_image_desc * /* image_desc */, + void * /* host_ptr */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainMemObject) +( cl_mem /* memobj */); + +OCL_API(cl_int, CL_API_CALL, clReleaseMemObject) +( cl_mem /* memobj */); + +OCL_API(cl_int, CL_API_CALL, clGetSupportedImageFormats) +( cl_context /* context */, + cl_mem_flags /* flags */, + cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, + cl_image_format * /* image_formats */, + cl_uint * /* num_image_formats */); + +OCL_API(cl_int, CL_API_CALL, clGetMemObjectInfo) +( cl_mem /* memobj */, + cl_mem_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_int, CL_API_CALL, clGetImageInfo) +( cl_mem /* image */, + cl_image_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_int, CL_API_CALL, clSetMemObjectDestructorCallback) +( cl_mem /* memobj */, + void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ); + +/* Sampler APIs */ +OCL_API(cl_sampler, CL_API_CALL, clCreateSampler) +( cl_context /* context */, + cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, + cl_filter_mode /* filter_mode */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainSampler) +( cl_sampler /* sampler */); + +OCL_API(cl_int, CL_API_CALL, clReleaseSampler) +( cl_sampler /* sampler */); + +OCL_API(cl_int, CL_API_CALL, clGetSamplerInfo) +( cl_sampler /* sampler */, + cl_sampler_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Program Object APIs */ +OCL_API(cl_program, CL_API_CALL, clCreateProgramWithSource) +( cl_context /* context */, + cl_uint /* count */, + const char ** /* strings */, + const size_t * /* lengths */, + cl_int * /* errcode_ret */); + +OCL_API(cl_program, CL_API_CALL, clCreateProgramWithBinary) +( cl_context /* context */, + cl_uint /* num_devices */, + const 
cl_device_id * /* device_list */, + const size_t * /* lengths */, + const unsigned char ** /* binaries */, + cl_int * /* binary_status */, + cl_int * /* errcode_ret */); + +OCL_API(cl_program, CL_API_CALL, clCreateProgramWithBuiltInKernels) +( cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* kernel_names */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainProgram) +( cl_program /* program */); + +OCL_API(cl_int, CL_API_CALL, clReleaseProgram) +( cl_program /* program */); + +OCL_API(cl_int, CL_API_CALL, clBuildProgram) +( cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */); + +OCL_API(cl_int, CL_API_CALL, clCompileProgram) +( cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_headers */, + const cl_program * /* input_headers */, + const char ** /* header_include_names */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */); + +OCL_API(cl_program, CL_API_CALL, clLinkProgram) +( cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_programs */, + const cl_program * /* input_programs */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */, + cl_int * /* errcode_ret */ ); + + +OCL_API(cl_int, CL_API_CALL, clUnloadPlatformCompiler) +( cl_platform_id /* platform */); + +OCL_API(cl_int, CL_API_CALL, clGetProgramInfo) +( cl_program /* program */, + cl_program_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_int, CL_API_CALL, clGetProgramBuildInfo) +( cl_program /* program */, + cl_device_id /* device */, + cl_program_build_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Kernel Object APIs */ +OCL_API(cl_kernel, CL_API_CALL, clCreateKernel) +( cl_program /* program */, + const char * /* kernel_name */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clCreateKernelsInProgram) +( cl_program /* program */, + cl_uint /* num_kernels */, + cl_kernel * /* kernels */, + cl_uint * /* num_kernels_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainKernel) +( cl_kernel /* kernel */); + +OCL_API(cl_int, CL_API_CALL, clReleaseKernel) +( cl_kernel /* kernel */); + +OCL_API(cl_int, CL_API_CALL, clSetKernelArg) +( cl_kernel /* kernel */, + cl_uint /* arg_index */, + size_t /* arg_size */, + const void * /* arg_value */); + +OCL_API(cl_int, CL_API_CALL, clGetKernelInfo) +( cl_kernel /* kernel */, + cl_kernel_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_int, CL_API_CALL, clGetKernelArgInfo) +( cl_kernel /* kernel */, + cl_uint /* arg_indx */, + cl_kernel_arg_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_int, CL_API_CALL, clGetKernelWorkGroupInfo) +( cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_work_group_info /* param_name */, + size_t /* 
param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Event Object APIs */ +OCL_API(cl_int, CL_API_CALL, clWaitForEvents) +( cl_uint /* num_events */, + const cl_event * /* event_list */); + +OCL_API(cl_int, CL_API_CALL, clGetEventInfo) +( cl_event /* event */, + cl_event_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_event, CL_API_CALL, clCreateUserEvent) +( cl_context /* context */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainEvent) +( cl_event /* event */); + +OCL_API(cl_int, CL_API_CALL, clReleaseEvent) +( cl_event /* event */); + +OCL_API(cl_int, CL_API_CALL, clSetUserEventStatus) +( cl_event /* event */, + cl_int /* execution_status */); + +OCL_API(cl_int, CL_API_CALL, clSetEventCallback) +( cl_event /* event */, + cl_int /* command_exec_callback_type */, + void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */); + +/* Profiling APIs */ +OCL_API(cl_int, CL_API_CALL, clGetEventProfilingInfo) +( cl_event /* event */, + cl_profiling_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Flush and Finish APIs */ +OCL_API(cl_int, CL_API_CALL, clFlush) +( cl_command_queue /* command_queue */); + +OCL_API(cl_int, CL_API_CALL, clFinish) +( cl_command_queue /* command_queue */); + +/* Enqueued Commands APIs */ +OCL_API(cl_int, CL_API_CALL, clEnqueueReadBuffer) +( cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + size_t /* offset */, + size_t /* size */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueReadBufferRect) +( cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueWriteBuffer) +( cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + size_t /* offset */, + size_t /* size */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueWriteBufferRect) +( cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueFillBuffer) +( cl_command_queue /* command_queue */, + cl_mem /* buffer */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, 
clEnqueueCopyBuffer) +( cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + size_t /* src_offset */, + size_t /* dst_offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueCopyBufferRect) +( cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin */, + const size_t * /* dst_origin */, + const size_t * /* region */, + size_t /* src_row_pitch */, + size_t /* src_slice_pitch */, + size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueReadImage) +( cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_read */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* row_pitch */, + size_t /* slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueWriteImage) +( cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_write */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueFillImage) +( cl_command_queue /* command_queue */, + cl_mem /* image */, + const void * /* fill_color */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueCopyImage) +( cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_image */, + const size_t * /* src_origin[3] */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueCopyImageToBuffer) +( cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin[3] */, + const size_t * /* region[3] */, + size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueCopyBufferToImage) +( cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_image */, + size_t /* src_offset */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(void *, CL_API_CALL, clEnqueueMapBuffer) +( cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */); + +OCL_API(void *, CL_API_CALL, clEnqueueMapImage) +( cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags 
*/, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t * /* image_row_pitch */, + size_t * /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueUnmapMemObject) +( cl_command_queue /* command_queue */, + cl_mem /* memobj */, + void * /* mapped_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueMigrateMemObjects) +( cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueNDRangeKernel) +( cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueTask) +( cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueNativeKernel) +( cl_command_queue /* command_queue */, + void (CL_CALLBACK * /*user_func*/)(void *), + void * /* args */, + size_t /* cb_args */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_list */, + const void ** /* args_mem_loc */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueMarkerWithWaitList) +( cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueBarrierWithWaitList) +( cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + + +/* Extension function access +* +* Returns the extension function address for the given function name, +* or NULL if a valid function can not be found. The client must +* check to make sure the address is not NULL, before using or +* calling the returned function address. 
+*/ +OCL_API(void *, CL_API_CALL, clGetExtensionFunctionAddressForPlatform) +( cl_platform_id /* platform */, + const char * /* func_name */); + + +// Deprecated OpenCL 1.1 APIs +OCL_API(cl_mem, CL_API_CALL, clCreateImage2D) +( cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_row_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */); + +OCL_API(cl_mem, CL_API_CALL, clCreateImage3D) +( cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_depth */, + size_t /* image_row_pitch */, + size_t /* image_slice_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueMarker) +( cl_command_queue /* command_queue */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueWaitForEvents) +( cl_command_queue /* command_queue */, + cl_uint /* num_events */, + const cl_event * /* event_list */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueBarrier) +( cl_command_queue /* command_queue */); + +OCL_API(cl_int, CL_API_CALL, clUnloadCompiler) +( void); + +OCL_API(void *, CL_API_CALL, clGetExtensionFunctionAddress) +( const char * /* func_name */); + +#define OCL_DECLARE_FUNC(name) name##_func name + +typedef struct +{ + void *library; + + OCL_DECLARE_FUNC( clBuildProgram ); + OCL_DECLARE_FUNC( clCreateBuffer ); + OCL_DECLARE_FUNC( clCreateCommandQueue ); + OCL_DECLARE_FUNC( clCreateContext ); + OCL_DECLARE_FUNC( clCreateImage2D ); + OCL_DECLARE_FUNC( clCreateKernel ); + OCL_DECLARE_FUNC( clCreateProgramWithBinary ); + OCL_DECLARE_FUNC( clCreateProgramWithSource ); + OCL_DECLARE_FUNC( clEnqueueCopyBuffer ); + OCL_DECLARE_FUNC( clEnqueueMapBuffer ); + OCL_DECLARE_FUNC( clEnqueueNDRangeKernel ); + OCL_DECLARE_FUNC( clEnqueueReadBuffer ); + OCL_DECLARE_FUNC( clEnqueueWriteBuffer ); + OCL_DECLARE_FUNC( clFinish ); + OCL_DECLARE_FUNC( clGetCommandQueueInfo ); + OCL_DECLARE_FUNC( clGetDeviceIDs ); + OCL_DECLARE_FUNC( clGetDeviceInfo ); + OCL_DECLARE_FUNC( clGetKernelWorkGroupInfo ); + OCL_DECLARE_FUNC( clGetPlatformIDs ); + OCL_DECLARE_FUNC( clGetProgramBuildInfo ); + OCL_DECLARE_FUNC( clGetProgramInfo ); + OCL_DECLARE_FUNC( clGetSupportedImageFormats ); + OCL_DECLARE_FUNC( clReleaseCommandQueue ); + OCL_DECLARE_FUNC( clReleaseContext ); + OCL_DECLARE_FUNC( clReleaseKernel ); + OCL_DECLARE_FUNC( clReleaseMemObject ); + OCL_DECLARE_FUNC( clReleaseProgram ); + OCL_DECLARE_FUNC( clSetKernelArg ); +} x264_opencl_function_t; + +/* Number of downscale resolutions to use for motion search */ +#define NUM_IMAGE_SCALES 4 + +/* Number of PCIe copies that can be queued before requiring a flush */ +#define MAX_FINISH_COPIES 1024 + +/* Size (in bytes) of the page-locked buffer used for PCIe xfers */ +#define PAGE_LOCKED_BUF_SIZE 32 * 1024 * 1024 + +typedef struct +{ + x264_opencl_function_t *ocl; + + cl_context context; + cl_device_id device; + cl_command_queue queue; + + cl_program lookahead_program; + cl_int last_buf; + + cl_mem page_locked_buffer; + char *page_locked_ptr; + int pl_occupancy; + + struct + { + void *src; + void *dest; + int bytes; + } copies[MAX_FINISH_COPIES]; + int num_copies; + + int b_device_AMD_SI; + int b_fatal_error; + int lookahead_thread_pri; + int opencl_thread_pri; + + /* downscale lowres luma */ + cl_kernel downscale_hpel_kernel; + cl_kernel downscale_kernel1; + cl_kernel 
downscale_kernel2; + cl_mem luma_16x16_image[2]; + + /* weightp filtering */ + cl_kernel weightp_hpel_kernel; + cl_kernel weightp_scaled_images_kernel; + cl_mem weighted_scaled_images[NUM_IMAGE_SCALES]; + cl_mem weighted_luma_hpel; + + /* intra */ + cl_kernel memset_kernel; + cl_kernel intra_kernel; + cl_kernel rowsum_intra_kernel; + cl_mem row_satds[2]; + + /* hierarchical motion estimation */ + cl_kernel hme_kernel; + cl_kernel subpel_refine_kernel; + cl_mem mv_buffers[2]; + cl_mem lowres_mv_costs; + cl_mem mvp_buffer; + + /* bidir */ + cl_kernel mode_select_kernel; + cl_kernel rowsum_inter_kernel; + cl_mem lowres_costs[2]; + cl_mem frame_stats[2]; /* cost_est, cost_est_aq, intra_mbs */ +} x264_opencl_t; + +typedef struct +{ + x264_opencl_function_t *ocl; + + cl_mem scaled_image2Ds[NUM_IMAGE_SCALES]; + cl_mem luma_hpel; + cl_mem inv_qscale_factor; + cl_mem intra_cost; + cl_mem lowres_mvs0; + cl_mem lowres_mvs1; + cl_mem lowres_mv_costs0; + cl_mem lowres_mv_costs1; +} x264_frame_opencl_t; + +typedef struct x264_frame x264_frame; + +x264_opencl_function_t *x264_opencl_load_library( void ); +void x264_opencl_close_library( x264_opencl_function_t *ocl ); + +int x264_opencl_lookahead_init( x264_t *h ); +void x264_opencl_lookahead_delete( x264_t *h ); + +void x264_opencl_frame_delete( x264_frame *frame ); + +#endif
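The header above avoids a hard link against libOpenCL: OCL_API() declares a <name>_func pointer typedef for every entry point, and OCL_DECLARE_FUNC() adds a member of that type, named after the symbol, to x264_opencl_function_t, which x264_opencl_load_library() then fills through dlsym()/GetProcAddress(). A reduced illustration of what the two macros expand to, assuming the Khronos CL/cl.h header is available (x264 bundles its own copy as extras/cl.h) and with the struct name shortened:

#include <stddef.h>
#include <CL/cl.h>

/* OCL_API(cl_int, CL_API_CALL, clGetPlatformIDs)( ... ); expands to: */
typedef cl_int (CL_API_CALL *clGetPlatformIDs_func)( cl_uint num_entries,
                                                     cl_platform_id *platforms,
                                                     cl_uint *num_platforms );

/* OCL_DECLARE_FUNC( clGetPlatformIDs ); inside the struct expands to a member
 * named after the symbol, so call sites read like the ordinary OpenCL API: */
typedef struct
{
    void *library;                           /* handle from dlopen()/LoadLibrary() */
    clGetPlatformIDs_func clGetPlatformIDs;  /* used as ocl->clGetPlatformIDs( ... ) */
} mini_opencl_function_t;

/* example call through the table; the pointer stays NULL until a loader fills it in */
static cl_uint count_platforms( const mini_opencl_function_t *ocl )
{
    cl_uint n = 0;
    if( !ocl->clGetPlatformIDs || ocl->clGetPlatformIDs( 0, NULL, &n ) != CL_SUCCESS )
        n = 0;
    return n;
}

int main( void )
{
    mini_opencl_function_t ocl = { 0 };   /* no runtime loaded: count is 0 */
    return (int)count_platforms( &ocl );
}

Because each member keeps the name of the real symbol, the lookahead code reads like ordinary OpenCL calls while the encoder still starts cleanly on machines that have no OpenCL runtime installed.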
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/bidir.cl
Added
@@ -0,0 +1,265 @@ +/* Mode selection routines, select the least SATD cost mode for each lowres + * macroblock. When measuring B slices, this includes measuring the cost of + * three bidir modes. */ + +/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */ +int bidir_satd_8x8_ii_coop4( read_only image2d_t fenc_lowres, + int2 fencpos, + read_only image2d_t fref0_planes, + int2 qpos0, + read_only image2d_t fref1_planes, + int2 qpos1, + int weight, + local sum2_t *tmpp, + int idx ) +{ + volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp; + sum2_t b0, b1, b2, b3; + sum2_t sum = 0; + + // fencpos is full-pel position of original MB + // qpos0 is qpel position within reference frame 0 + // qpos1 is qpel position within reference frame 1 + + int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2); + int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2); + + int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1)); + int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2); + int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2); + + int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2); + int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2); + + int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1)); + int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2); + int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2); + + uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B; + uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B; + + uint vA, vB; + uint enc, ref0, ref1; + uint a0, a1; + const int weight2 = 64 - weight; + +#define READ_BIDIR_DIFF( OUT, X )\ + enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\ + vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\ + vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\ + ref0 = rhadd( vA, vB );\ + vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\ + vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\ + ref1 = rhadd( vA, vB );\ + OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6); + +#define READ_DIFF_EX( OUT, a, b )\ + READ_BIDIR_DIFF( a0, a );\ + READ_BIDIR_DIFF( a1, b );\ + OUT = a0 + (a1<<BITS_PER_SUM); + +#define ROW_8x4_SATD( a, b, c )\ + fencpos.y += a;\ + fref0Apos.y += b;\ + fref0Bpos.y += b;\ + fref1Apos.y += c;\ + fref1Bpos.y += c;\ + READ_DIFF_EX( b0, 0, 4 );\ + READ_DIFF_EX( b1, 1, 5 );\ + READ_DIFF_EX( b2, 2, 6 );\ + READ_DIFF_EX( b3, 3, 7 );\ + HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\ + HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\ + sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 ); + + ROW_8x4_SATD( 0, 0, 0 ); + ROW_8x4_SATD( 4, 4, 4 ); + +#undef READ_BIDIR_DIFF +#undef READ_DIFF_EX +#undef ROW_8x4_SATD + + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1; +} + +/* + * mode selection - pick the least cost partition type for each 8x8 macroblock. + * Intra, list0 or list1. When measuring a B slice, also test three bidir + * possibilities. + * + * fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that + * hold many frames worth of motion vectors. 
We must offset into the correct + * location for this frame's vectors: + * + * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1] + * GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count] + * + * global launch dimensions for P slice estimate: [mb_width, mb_height] + * global launch dimensions for B slice estimate: [mb_width * 4, mb_height] + */ +kernel void mode_selection( read_only image2d_t fenc_lowres, + read_only image2d_t fref0_planes, + read_only image2d_t fref1_planes, + const global short2 *fenc_lowres_mvs0, + const global short2 *fenc_lowres_mvs1, + const global short2 *fref1_lowres_mvs0, + const global int16_t *fenc_lowres_mv_costs0, + const global int16_t *fenc_lowres_mv_costs1, + const global uint16_t *fenc_intra_cost, + global uint16_t *lowres_costs, + global int *frame_stats, + local int16_t *cost_local, + local sum2_t *satd_local, + int mb_width, + int bipred_weight, + int dist_scale_factor, + int b, + int p0, + int p1, + int lambda ) +{ + int mb_x = get_global_id( 0 ); + int b_bidir = b < p1; + if( b_bidir ) + { + /* when mode_selection is run for B frames, it must perform BIDIR SATD + * measurements, so it is launched with four times as many threads in + * order to spread the work around more of the GPU. And it can add + * padding threads in the X direction. */ + mb_x >>= 2; + if( mb_x >= mb_width ) + return; + } + int mb_y = get_global_id( 1 ); + int mb_height = get_global_size( 1 ); + int mb_count = mb_width * mb_height; + int mb_xy = mb_x + mb_y * mb_width; + + /* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */ + if( mb_x < 4 && mb_y == 0 ) + frame_stats[mb_x] = 0; + + int bcost = COST_MAX; + int list_used = 0; + + if( !b_bidir ) + { + int icost = fenc_intra_cost[mb_xy]; + COPY2_IF_LT( bcost, icost, list_used, 0 ); + } + if( b != p0 ) + { + int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy]; + COPY2_IF_LT( bcost, mv_cost0, list_used, 1 ); + } + if( b != p1 ) + { + int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy]; + COPY2_IF_LT( bcost, mv_cost1, list_used, 2 ); + } + + if( b_bidir ) + { + int2 coord = (int2)(mb_x, mb_y) << 3; + int mb_i = get_global_id( 0 ) & 3; + int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); + cost_local += mb_in_group * 4; + satd_local += mb_in_group * 16; + +#define TRY_BIDIR( mv0, mv1, penalty )\ +{\ + int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\ + int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\ + cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\ + int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\ + COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\ +} + + /* temporal prediction */ + short2 dmv0, dmv1; + short2 mvr = fref1_lowres_mvs0[mb_xy]; + dmv0 = (mvr * (short) dist_scale_factor + (short) 128) >> (short) 8; + dmv1 = dmv0 - mvr; + TRY_BIDIR( dmv0, dmv1, 0 ) + + if( as_uint( dmv0 ) || as_uint( dmv1 ) ) + { + /* B-direct prediction */ + dmv0 = 0; dmv1 = 0; + TRY_BIDIR( dmv0, dmv1, 0 ); + } + + /* L0+L1 prediction */ + dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy]; + dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy]; + TRY_BIDIR( dmv0, dmv1, 5 ); +#undef TRY_BIDIR + } + + lowres_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); +} + +/* + * parallel sum inter costs + * + * global launch dimensions: [256, mb_height] + */ +kernel void 
sum_inter_cost( const global uint16_t *fenc_lowres_costs, + const global uint16_t *inv_qscale_factor, + global int *fenc_row_satds, + global int *frame_stats, + int mb_width, + int bframe_bias, + int b, + int p0, + int p1 ) +{ + int y = get_global_id( 1 ); + int mb_height = get_global_size( 1 ); + + int row_satds = 0; + int cost_est = 0; + int cost_est_aq = 0; + int intra_mbs = 0; + + for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 )) + { + int mb_xy = x + y * mb_width; + int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK; + int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT; + int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2; + + if( list == 0 && b_frame_score_mb ) + intra_mbs++; + + int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8; + + row_satds += cost_aq; + + if( b_frame_score_mb ) + { + cost_est += cost; + cost_est_aq += cost_aq; + } + } + + local int buffer[256]; + int x = get_global_id( 0 ); + + row_satds = parallel_sum( row_satds, x, buffer ); + cost_est = parallel_sum( cost_est, x, buffer ); + cost_est_aq = parallel_sum( cost_est_aq, x, buffer ); + intra_mbs = parallel_sum( intra_mbs, x, buffer ); + + if( b != p1 ) + // Use floating point math to avoid 32bit integer overflow conditions + cost_est = (int)((float)cost_est * 100.0f / (120.0f + (float)bframe_bias)); + + if( get_global_id( 0 ) == 0 ) + { + fenc_row_satds[y] = row_satds; + atomic_add( frame_stats + COST_EST, cost_est ); + atomic_add( frame_stats + COST_EST_AQ, cost_est_aq ); + atomic_add( frame_stats + INTRA_MBS, intra_mbs ); + } +}
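Two scalar operations underpin the kernels above: the weighted bidirectional blend inside READ_BIDIR_DIFF, which mixes the list0 and list1 references with a 6-bit weight and rounds, and the COPY2_IF_LT idiom that mode_selection() uses to keep the cheapest cost together with the prediction list that produced it. Below is a small host-side C model of both; the macro definition and the sample costs are illustrative stand-ins consistent with how the kernels use them, not code taken from x264.

#include <stdio.h>

/* weight is the bipred weight in 0..64; 64-weight goes to the list1 reference */
static int bidir_blend( int ref0, int ref1, int weight )
{
    return ( ref0 * weight + ref1 * (64 - weight) + (1 << 5) ) >> 6;
}

/* keep the smaller cost and remember which list produced it */
#define COPY2_IF_LT( cost, new_cost, list, new_list )\
    if( (new_cost) < (cost) )\
    {\
        (cost) = (new_cost);\
        (list) = (new_list);\
    }

int main( void )
{
    int bcost = 1 << 28, list_used = 0;        /* stand-in for COST_MAX */
    COPY2_IF_LT( bcost, 900, list_used, 0 );   /* intra cost */
    COPY2_IF_LT( bcost, 750, list_used, 1 );   /* list0 cost */
    COPY2_IF_LT( bcost, 820, list_used, 2 );   /* list1 cost */
    printf( "best cost %d from list %d, blended sample %d\n",
            bcost, list_used, bidir_blend( 200, 100, 32 ) );
    return 0;
}

The run keeps cost 750 from list 1 and prints 150 for the blended pixel, i.e. (200*32 + 100*32 + 32) >> 6, matching the rounding the kernel applies before computing the SATD residual.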
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/downscale.cl
Added
@@ -0,0 +1,135 @@ +/* + * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image + * + * -- + * + * fenc_img is an output image (area of memory referenced through a texture + * cache). A read of any pixel location (x,y) returns four pixel values: + * + * val.s0 = P(x,y) + * val.s1 = P(x+1,y) + * val.s2 = P(x+2,y) + * val.s3 = P(x+3,y) + * + * This is a 4x replication of the lowres pixels, a trade-off between memory + * size and read latency. + * + * -- + * + * hpel_planes is an output image that contains the four HPEL planes used for + * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with + * the four planar values C | V | H | F + * + * launch dimensions: [lowres-width, lowres-height] + */ +kernel void downscale_hpel( const global pixel *fenc, + write_only image2d_t fenc_img, + write_only image2d_t hpel_planes, + int stride ) +{ + int x = get_global_id( 0 ); + int y = get_global_id( 1 ); + uint4 values; + + fenc += y * stride * 2; + const global pixel *src1 = fenc + stride; + const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride; + int2 pos = (int2)(x, y); + pixel right, left; + + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s0 = rhadd( right, left ); // F + + right = rhadd( fenc[2*x+1], src1[2*x+1] ); + left = rhadd( fenc[2*x+2], src1[2*x+2] ); + values.s1 = rhadd( right, left ); // H + + right = rhadd( src1[2*x], src2[2*x] ); + left = rhadd( src1[2*x+1], src2[2*x+1] ); + values.s2 = rhadd( right, left ); // V + + right = rhadd( src1[2*x+1], src2[2*x+1] ); + left = rhadd( src1[2*x+2], src2[2*x+2] ); + values.s3 = rhadd( right, left ); // C + + uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff); + write_imageui( hpel_planes, pos, val ); + + x = select( x, x+1, x+1 < get_global_size( 0 ) ); + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s1 = rhadd( right, left ); + + x = select( x, x+1, x+1 < get_global_size( 0 ) ); + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s2 = rhadd( right, left ); + + x = select( x, x+1, x+1 < get_global_size( 0 ) ); + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s3 = rhadd( right, left ); + + write_imageui( fenc_img, pos, values ); +} + +/* + * downscale lowres hierarchical motion search image, copy from one image to + * another decimated image. This kernel is called iteratively to generate all + * of the downscales. + * + * launch dimensions: [lower_res width, lower_res height] + */ +kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res ) +{ + int x = get_global_id( 0 ); + int y = get_global_id( 1 ); + int2 pos = (int2)(x, y); + int gs = get_global_size( 0 ); + uint4 top, bot, values; + top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) ); + values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ); + + /* these select statements appear redundant, and they should be, but tests break when + * they are not here. 
I believe this was caused by a driver bug + */ + values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) ); + top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) ); + values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) ); + values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) ); + write_imageui( lower_res, pos, (uint4)(values) ); +} + +/* + * Second copy of downscale kernel, no differences. This is a (no perf loss) + * workaround for a scheduling bug in current Tahiti drivers. This bug has + * theoretically been fixed in the July 2012 driver release from AMD. + */ +kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res ) +{ + int x = get_global_id( 0 ); + int y = get_global_id( 1 ); + int2 pos = (int2)(x, y); + int gs = get_global_size( 0 ); + uint4 top, bot, values; + top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) ); + values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ); + + // see comment in above function copy + values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) ); + top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) ); + values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) ); + values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) ); + write_imageui( lower_res, pos, (uint4)(values) ); +} + +/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */ +kernel void memset_int16( global int16_t *buf, int16_t value ) +{ + buf[get_global_id( 0 )] = value; +}
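downscale_hpel and downscale1/downscale2 above all build each lowres pixel as the rounded average of a 2x2 block, composed from two rhadd() (rounded halving add) steps. A plain C reference of that decimation, handy for checking kernel output on the host, is sketched below; buffer sizes and names are illustrative.

#include <stdint.h>
#include <stdio.h>

static uint8_t rhadd_u8( uint8_t a, uint8_t b )
{
    return (uint8_t)( ( a + b + 1 ) >> 1 );   /* same rounding as OpenCL rhadd() */
}

/* each destination pixel is the rhadd of two vertical rhadd pairs, as in the kernels */
static void downscale_2x2( const uint8_t *src, int src_stride,
                           uint8_t *dst, int dst_stride,
                           int dst_w, int dst_h )
{
    for( int y = 0; y < dst_h; y++ )
        for( int x = 0; x < dst_w; x++ )
        {
            const uint8_t *p = src + 2*y*src_stride + 2*x;
            uint8_t c0 = rhadd_u8( p[0], p[src_stride] );     /* left column  */
            uint8_t c1 = rhadd_u8( p[1], p[src_stride+1] );   /* right column */
            dst[y*dst_stride + x] = rhadd_u8( c0, c1 );
        }
}

int main( void )
{
    uint8_t src[4*4], dst[2*2];
    for( int i = 0; i < 16; i++ )
        src[i] = (uint8_t)( i * 10 );
    downscale_2x2( src, 4, dst, 2, 2, 2 );
    printf( "%d %d / %d %d\n", dst[0], dst[1], dst[2], dst[3] );   /* 25 45 / 105 125 */
    return 0;
}

Chaining two rhadd() calls rounds twice, exactly as the kernels do; the result can differ by one from a single (a+b+c+d+2)>>2 average, so any host-side verification has to mirror the same order of operations.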
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/intra.cl
Added
@@ -0,0 +1,1072 @@ +/* Lookahead lowres intra analysis + * + * Each intra analysis function has been implemented twice, once for scalar GPUs + * (NV) and once for vectorized GPUs (AMD pre-Southern Islands). x264 detects + * the GPU type and sets the -DVECTORIZE compile flag accordingly. + * + * All the intra analysis functions were based on their C versions in pixel.c + * and produce the exact same results. + */ + +/* force all clamp arguments and return value to int, prevent ambiguous types */ +#define clamp_int( X, MIN, MAX ) (int) clamp( (int)(X), (int)(MIN), (int)(MAX) ) + +#if VECTORIZE +int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 ) +{ + int8 a_v, d_v; + int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13; + int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33; + + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr0).s04152637; + HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + data += data_stride; + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr1).s04152637; + HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + data += data_stride; + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr2).s04152637; + HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + data += data_stride; + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr3).s04152637; + HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + uint8 sum_v; + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 ); + sum_v = abs( a_v ); + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 ); + sum_v += abs( a_v ); + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 ); + sum_v += abs( a_v ); + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 ); + sum_v += abs( a_v ); + + uint4 sum2 = sum_v.hi + sum_v.lo; + uint2 sum3 = sum2.hi + sum2.lo; + return ( sum3.hi + sum3.lo ) >> 1; +} +#else +SATD_C_8x4_Q( satd_8x4_lp, const local, private ) +#endif + +/**************************************************************************** + * 8x8 prediction for intra luma block + ****************************************************************************/ + +#define F1 rhadd +#define F2( a, b, c ) ( a+2*b+c+2 )>>2 + +#if VECTORIZE +int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2; + pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2; + pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2; + pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + + pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2; + pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2; + pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + + pr2.s0 = ( 2 + top[2] + 
2*top[3] + top[4] ) >> 2; + pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + + pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + + pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; + + pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; + pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2; + + pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; + pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2; + pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2; + + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr3.s0 = F2( left[1], left[2], left[3] ); + pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] ); + pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top ); + pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] ); + pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] ); + pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] ); + pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] ); + pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] ); + pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] ); + pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] ); + pr0.s7 = F2( 
top[5], top[6], top[7] ); + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr3.s0 = F2( left[5], left[6], left[7] ); + pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] ); + pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] ); + pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] ); + pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] ); + pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] ); + pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top ); + pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] ); + pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] ); + pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] ); + pr0.s7 = F2( top[1], top[2], top[3] ); + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr2.s0 = F2( left[1], left[0], left_top ); + pr3.s0 = F2( left[2], left[1], left[0] ); + pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] ); + pr0.s0 = pr2.s1 = F1( left_top, top[0] ); + pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] ); + pr0.s1 = pr2.s2 = F1( top[0], top[1] ); + pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] ); + pr0.s2 = pr2.s3 = F1( top[1], top[2] ); + pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] ); + pr0.s3 = pr2.s4 = F1( top[2], top[3] ); + pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] ); + pr0.s4 = pr2.s5 = F1( top[3], top[4] ); + pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] ); + pr0.s5 = pr2.s6 = F1( top[4], top[5] ); + pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] ); + pr0.s6 = pr2.s7 = F1( top[5], top[6] ); + pr1.s7 = F2( top[5], top[6], top[7] ); + pr0.s7 = F1( top[6], top[7] ); + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr2.s0 = F2( left[5], left[4], left[3] ); + pr3.s0 = F2( left[6], left[5], left[4] ); + pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] ); + pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] ); + pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top ); + pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] ); + pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] ); + pr0.s2 = pr2.s3 = F1( left_top, top[0] ); + pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] ); + pr0.s3 = pr2.s4 = F1( top[0], top[1] ); + pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] ); + pr0.s4 = pr2.s5 = F1( top[1], top[2] ); + pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] ); + pr0.s5 = pr2.s6 = F1( top[2], top[3] ); + pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] ); + pr0.s6 = pr2.s7 = F1( top[3], top[4] ); + pr1.s7 = F2( top[3], top[4], top[5] ); + pr0.s7 = F1( top[4], top[5] ); + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); +#undef PRED +} + +int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2; + pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] ); + pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] ); + pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] ); + + pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) 
>> 2; + pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2; + pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] ); + pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] ); + + pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; + pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2; + pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2; + pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] ); + + pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; + pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; + pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2; + pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2; + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; + pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; + pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2; + + pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; + pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; + + pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; + pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; + + pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; + pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; + pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr0.s0 = F1( top[0], top[1] ); + pr1.s0 = F2( top[0], top[1], top[2] ); + pr2.s0 = pr0.s1 = F1( top[1], top[2] ); + pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] ); + pr2.s1 = pr0.s2 = F1( top[2], top[3] ); + pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] ); + pr2.s2 = pr0.s3 = F1( top[3], top[4] ); + pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] ); + pr2.s3 = pr0.s4 = F1( top[4], top[5] ); + pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] ); + pr2.s4 = pr0.s5 = F1( top[5], top[6] ); + pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] ); + pr2.s5 = pr0.s6 = F1( top[6], top[7] ); + pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] ); + pr2.s6 = pr0.s7 = F1( top[7], top[8] ); + pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] ); + pr2.s7 = F1( top[8], top[9] ); + pr3.s7 = F2( top[8], top[9], top[10] ); + int satd = satd_8x4_intra_lr( src, 
src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr0.s0 = F1( top[2], top[3] ); + pr1.s0 = F2( top[2], top[3], top[4] ); + pr2.s0 = pr0.s1 = F1( top[3], top[4] ); + pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] ); + pr2.s1 = pr0.s2 = F1( top[4], top[5] ); + pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] ); + pr2.s2 = pr0.s3 = F1( top[5], top[6] ); + pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] ); + pr2.s3 = pr0.s4 = F1( top[6], top[7] ); + pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] ); + pr2.s4 = pr0.s5 = F1( top[7], top[8] ); + pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] ); + pr2.s5 = pr0.s6 = F1( top[8], top[9] ); + pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] ); + pr2.s6 = pr0.s7 = F1( top[9], top[10] ); + pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] ); + pr2.s7 = F1( top[10], top[11] ); + pr3.s7 = F2( top[10], top[11], top[12] ); + return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; + pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; + pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + + pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; + pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; + + pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; + pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; + + pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; + pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; + pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2; + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; + pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; + pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2; + pr0.s6 = left[7]; pr0.s7 = left[7]; + + pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; + pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2; + pr1.s4 = left[7]; pr1.s5 = left[7]; + pr1.s6 = left[7]; pr1.s7 = left[7]; + + pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2; + pr2.s2 = left[7]; pr2.s3 = left[7]; + pr2.s4 = left[7]; pr2.s5 = left[7]; + pr2.s6 = left[7]; pr2.s7 = left[7]; + + pr3 = (int8)left[7]; + + return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8c_h( const local 
pixel *src, int src_stride ) +{ + const local pixel *src_l = src; + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr0 = (int8)src[-1]; src += src_stride; + pr1 = (int8)src[-1]; src += src_stride; + pr2 = (int8)src[-1]; src += src_stride; + pr3 = (int8)src[-1]; src += src_stride; + int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 ); + + //Lower half of pred[] + pr0 = (int8)src[-1]; src += src_stride; + pr1 = (int8)src[-1]; src += src_stride; + pr2 = (int8)src[-1]; src += src_stride; + pr3 = (int8)src[-1]; + return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8c_v( const local pixel *src, int src_stride ) +{ + int8 pred = convert_int8( vload8( 0, &src[-src_stride] )); + return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) + + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred ); +} + +int x264_predict_8x8c_p( const local pixel *src, int src_stride ) +{ + int H = 0, V = 0; + for( int i = 0; i < 4; i++ ) + { + H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]); + V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]); + } + + int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]); + int b = (17 * H + 16) >> 5; + int c = (17 * V + 16) >> 5; + int i00 = a - 3 * b - 3 * c + 16; + + // Upper half of pred[] + int pix = i00; + int8 pr0, pr1, pr2, pr3; + pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + + pix = i00; + pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + + pix = i00; + pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + + pix = i00; + pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + //Lower half of pred[] + pix = i00; + pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s6 = x264_clip_pixel( pix >> 5 ); pix 
+= b; + pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + + pix = i00; + pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + + pix = i00; + pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + + pix = i00; + pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8c_dc( const local pixel *src, int src_stride ) +{ + int s0 = 0, s1 = 0, s2 = 0, s3 = 0; + for( int i = 0; i < 4; i++ ) + { + s0 += src[i - src_stride]; + s1 += src[i + 4 - src_stride]; + s2 += src[-1 + i * src_stride]; + s3 += src[-1 + (i+4)*src_stride]; + } + + // Upper half of pred[] + int8 dc0; + dc0.lo = (int4)( (s0 + s2 + 4) >> 3 ); + dc0.hi = (int4)( (s1 + 2) >> 2 ); + int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 ); + + // Lower half of pred[] + dc0.lo = (int4)( (s3 + 2) >> 2 ); + dc0.hi = (int4)( (s1 + s3 + 4) >> 3 ); + return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 ); +} + +#else /* not vectorized: private is cheap registers are scarce */ + +int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top ) +{ + private pixel pred[32]; + + // Upper half of pred[] + for( int y = 0; y < 4; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 ); + pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2; + } + } + int satd = satd_8x4_lp( src, src_stride, pred, 8 ); + //Lower half of pred[] + for( int y = 4; y < 8; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 ); + pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2; + } + } + pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2; + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +} + +int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + private pixel pred[32]; +#define PRED( x, y ) pred[(x) + (y)*8] + // Upper half of pred[] + PRED( 0, 3 ) = F2( left[1], left[2], left[3] ); + PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] ); + PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top ); + PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] ); + PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( 
left_top, top[0], top[1] ); + PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] ); + PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] ); + PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] ); + PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] ); + PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] ); + PRED( 7, 0 ) = F2( top[5], top[6], top[7] ); + int satd = satd_8x4_lp( src, src_stride, pred, 8 ); + + // Lower half of pred[] + PRED( 0, 3 ) = F2( left[5], left[6], left[7] ); + PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] ); + PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] ); + PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] ); + PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] ); + PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] ); + PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top ); + PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] ); + PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] ); + PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] ); + PRED( 7, 0 ) = F2( top[1], top[2], top[3] ); + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +#undef PRED +} + +int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + private pixel pred[32]; +#define PRED( x, y ) pred[(x) + (y)*8] + // Upper half of pred[] + PRED( 0, 2 ) = F2( left[1], left[0], left_top ); + PRED( 0, 3 ) = F2( left[2], left[1], left[0] ); + PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] ); + PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] ); + PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] ); + PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] ); + PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] ); + PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] ); + PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] ); + PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] ); + PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] ); + PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] ); + PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] ); + PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] ); + PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] ); + PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] ); + PRED( 7, 1 ) = F2( top[5], top[6], top[7] ); + PRED( 7, 0 ) = F1( top[6], top[7] ); + int satd = satd_8x4_lp( src, src_stride, pred, 8 ); + + //Lower half of pred[] + PRED( 0, 2 ) = F2( left[5], left[4], left[3] ); + PRED( 0, 3 ) = F2( left[6], left[5], left[4] ); + PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] ); + PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] ); + PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top ); + PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] ); + PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] ); + PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] ); + PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] ); + PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] ); + PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] ); + PRED( 4, 0 ) = PRED( 5, 2 ) = F1( 
top[1], top[2] ); + PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] ); + PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] ); + PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] ); + PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] ); + PRED( 7, 1 ) = F2( top[3], top[4], top[5] ); + PRED( 7, 0 ) = F1( top[4], top[5] ); + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +#undef PRED +} + +inline uint32_t pack16to32( uint32_t a, uint32_t b ) +{ + return a + (b << 16); +} + +inline uint32_t pack8to16( uint32_t a, uint32_t b ) +{ + return a + (b << 8); +} + +int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + private pixel pred[32]; + int satd; + int p1 = pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) ); + int p2 = pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) ); + int p3 = pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) ); + int p4 = pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) ); + int p5 = pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) ); + int p6 = pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) ); + int p7 = pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) ); + int p8 = pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) ); + int p9 = pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) ); + int p10 = pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) ); + int p11 = pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) ); + // Upper half of pred[] + vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] ); + vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] ); + vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] ); + vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] ); + vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] ); + vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] ); + vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] ); + vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] ); + satd = satd_8x4_lp( src, src_stride, pred, 8 ); + // Lower half of pred[] + vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] ); + vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] ); + vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] ); + vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] ); + vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] ); + vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] ); + vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] ); + vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] ); + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +} + +int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top ) +{ + private pixel pred[32]; + int satd; +#define PRED( x, y ) pred[(x) + (y)*8] + // Upper half of pred[] + PRED( 0, 0 ) = F1( top[0], top[1] ); + PRED( 0, 1 ) = F2( top[0], top[1], top[2] ); + PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] ); + PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] ); + PRED( 1, 2 ) = PRED( 2, 0 ) 
= F1( top[2], top[3] ); + PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] ); + PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] ); + PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] ); + PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] ); + PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] ); + PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] ); + PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] ); + PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] ); + PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] ); + PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] ); + PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] ); + PRED( 7, 2 ) = F1( top[8], top[9] ); + PRED( 7, 3 ) = F2( top[8], top[9], top[10] ); + satd = satd_8x4_lp( src, src_stride, pred, 8 ); + // Lower half of pred[] + PRED( 0, 0 ) = F1( top[2], top[3] ); + PRED( 0, 1 ) = F2( top[2], top[3], top[4] ); + PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] ); + PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] ); + PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] ); + PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] ); + PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] ); + PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] ); + PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] ); + PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] ); + PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] ); + PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] ); + PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] ); + PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] ); + PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] ); + PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] ); + PRED( 7, 2 ) = F1( top[10], top[11] ); + PRED( 7, 3 ) = F2( top[10], top[11], top[12] ); + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +#undef PRED +} + +int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left ) +{ + private pixel pred[32]; + int satd; + int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) ); + int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) ); + int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) ); + int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) ); + int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) ); + int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) ); + int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) ); + int p8 = pack8to16( left[7], left[7] ); + // Upper half of pred[] + vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] ); + vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] ); + vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] ); + vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] ); + vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] ); + vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] ); + vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] ); + vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] ); + satd = satd_8x4_lp( src, src_stride, pred, 8 ); + // Lower half of pred[] + vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) 
+ ( 0 ) * 8] ); + vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] ); + vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] ); + vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] ); + vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] ); + vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] ); + vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] ); + vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] ); + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +} + +int x264_predict_8x8c_h( const local pixel *src, int src_stride ) +{ + private pixel pred[32]; + const local pixel *src_l = src; + + // Upper half of pred[] + vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride; + vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride; + vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride; + vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride; + int satd = satd_8x4_lp( src_l, src_stride, pred, 8 ); + + // Lower half of pred[] + vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride; + vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride; + vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride; + vstore8( (uchar8)(src[-1]), 3, pred ); + return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 ); +} + +int x264_predict_8x8c_v( const local pixel *src, int src_stride ) +{ + private pixel pred[32]; + uchar16 v16; + v16.lo = vload8( 0, &src[-src_stride] ); + v16.hi = vload8( 0, &src[-src_stride] ); + + vstore16( v16, 0, pred ); + vstore16( v16, 1, pred ); + + return satd_8x4_lp( src, src_stride, pred, 8 ) + + satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 ); +} + +int x264_predict_8x8c_p( const local pixel *src, int src_stride ) +{ + int H = 0, V = 0; + private pixel pred[32]; + int satd; + + for( int i = 0; i < 4; i++ ) + { + H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]); + V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]); + } + + int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]); + int b = (17 * H + 16) >> 5; + int c = (17 * V + 16) >> 5; + int i00 = a - 3 * b - 3 * c + 16; + + // Upper half of pred[] + for( int y = 0; y < 4; y++ ) + { + int pix = i00; + for( int x = 0; x < 8; x++ ) + { + pred[x + y*8] = x264_clip_pixel( pix >> 5 ); + pix += b; + } + i00 += c; + } + satd = satd_8x4_lp( src, src_stride, pred, 8 ); + // Lower half of pred[] + for( int y = 0; y < 4; y++ ) + { + int pix = i00; + for( int x = 0; x < 8; x++ ) + { + pred[x + y*8] = x264_clip_pixel( pix >> 5 ); + pix += b; + } + i00 += c; + } + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +} + +int x264_predict_8x8c_dc( const local pixel *src, int src_stride ) +{ + private pixel pred[32]; + int s0 = 0, s1 = 0, s2 = 0, s3 = 0; + for( int i = 0; i < 4; i++ ) + { + s0 += src[i - src_stride]; + s1 += src[i + 4 - src_stride]; + s2 += src[-1 + i * src_stride]; + s3 += src[-1 + (i+4)*src_stride]; + } + + // Upper half of pred[] + uchar8 dc0; + dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 ); + dc0.hi = (uchar4)( (s1 + 2) >> 2 ); + vstore8( dc0, 0, pred ); + vstore8( dc0, 1, pred ); + vstore8( dc0, 2, pred ); + vstore8( dc0, 3, pred ); + int satd = satd_8x4_lp( src, src_stride, pred, 8 ); + + // Lower half of pred[] + dc0.lo = (uchar4)( (s3 + 2) >> 2 ); + dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 ); + vstore8( dc0, 0, pred ); 
+ vstore8( dc0, 1, pred ); + vstore8( dc0, 2, pred ); + vstore8( dc0, 3, pred ); + return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); +} +#endif + +/* Find the least cost intra mode for 32 8x8 macroblocks per workgroup + * + * Loads 33 macroblocks plus the pixels directly above them into local memory, + * padding where necessary with edge pixels. It then cooperatively calculates + * smoothed top and left pixels for use in some of the analysis. + * + * Then groups of 32 threads each calculate a single intra mode for each 8x8 + * block. Since consecutive threads are calculating the same intra mode there + * is no code-path divergence. 8 intra costs are calculated simultaneously. If + * the "slow" argument is not zero, the final two (least likely) intra modes are + * tested in a second pass. The slow mode is only enabled for presets slow, + * slower, and placebo. + * + * This allows all of the pixels functions to read pixels from local memory, and + * avoids re-fetching edge pixels from global memory. And it allows us to + * calculate all of the intra mode costs simultaneously without branch divergence. + * + * Local dimension: [ 32, 8 ] + * Global dimensions: [ paddedWidth, height ] */ +kernel void mb_intra_cost_satd_8x8( read_only image2d_t fenc, + global uint16_t *fenc_intra_cost, + global int *frame_stats, + int lambda, + int mb_width, + int slow ) +{ +#define CACHE_STRIDE 265 +#define BLOCK_OFFSET 266 + local pixel cache[2385]; + local int cost_buf[32]; + local pixel top[32 * 16]; + local pixel left[32 * 8]; + local pixel left_top[32]; + + int lx = get_local_id( 0 ); + int ly = get_local_id( 1 ); + int gx = get_global_id( 0 ); + int gy = get_global_id( 1 ); + int gidx = get_group_id( 0 ); + int gidy = get_group_id( 1 ); + int linear_id = ly * get_local_size( 0 ) + lx; + int satd = COST_MAX; + int basex = gidx << 8; + int basey = (gidy << 3) - 1; + + /* Load 33 8x8 macroblocks and the pixels above them into local cache */ + for( int y = 0; y < 9 && linear_id < (33<<3)>>2; y++ ) + { + int x = linear_id << 2; + uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) ); + cache[y * CACHE_STRIDE + 1 + x] = data.s0; + cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1; + cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2; + cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3; + } + /* load pixels on left edge */ + if( linear_id < 9 ) + cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0; + + barrier( CLK_LOCAL_MEM_FENCE ); + + // Cooperatively build the top edge for the macroblock using lowpass filter + int j = ly; + top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] + + 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] + + cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2; + j += 8; + top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] + + 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] + + cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2; + // Cooperatively build the left edge for the macroblock using lowpass filter + left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] + + 2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] + + cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp((ly + 1), 0, 7 )] + 2 ) >> 2; + // One left_top per macroblock + if( 0 == ly ) + { + left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 
2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] + + cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2; + cost_buf[lx] = COST_MAX; + } + barrier( CLK_LOCAL_MEM_FENCE ); + + // each warp/wavefront generates a different prediction type; no divergence + switch( ly ) + { + case 0: + satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE ); + break; + case 1: + satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE ); + break; + case 2: + satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE ); + break; + case 3: + satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE ); + break; + case 4: + satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] ); + break; + case 5: + satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] ); + break; + case 6: + satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] ); + break; + case 7: + satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] ); + break; + default: + break; + } + atom_min( &cost_buf[lx], satd ); + if( slow ) + { + // Do the remaining two (least likely) prediction modes + switch( ly ) + { + case 0: // DDL + satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] ); + atom_min( &cost_buf[lx], satd ); + break; + case 1: // VL + satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] ); + atom_min( &cost_buf[lx], satd ); + break; + default: + break; + } + } + barrier( CLK_LOCAL_MEM_FENCE ); + + if( (0 == ly) && (gx < mb_width) ) + fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda; + + // initialize the frame_stats[2] buffer for kernel sum_intra_cost(). + if( gx < 2 && gy == 0 ) + frame_stats[gx] = 0; +#undef CACHE_STRIDE +#undef BLOCK_OFFSET +} + +/* + * parallel sum intra costs + * + * global launch dimensions: [256, mb_height] + */ +kernel void sum_intra_cost( const global uint16_t *fenc_intra_cost, + const global uint16_t *inv_qscale_factor, + global int *fenc_row_satds, + global int *frame_stats, + int mb_width ) +{ + int y = get_global_id( 1 ); + int mb_height = get_global_size( 1 ); + + int row_satds = 0; + int cost_est = 0; + int cost_est_aq = 0; + + for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 )) + { + int mb_xy = x + y * mb_width; + int cost = fenc_intra_cost[mb_xy]; + int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8; + int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2; + + row_satds += cost_aq; + if( b_frame_score_mb ) + { + cost_est += cost; + cost_est_aq += cost_aq; + } + } + + local int buffer[256]; + int x = get_global_id( 0 ); + + row_satds = parallel_sum( row_satds, x, buffer ); + cost_est = parallel_sum( cost_est, x, buffer ); + cost_est_aq = parallel_sum( cost_est_aq, x, buffer ); + + if( get_global_id( 0 ) == 0 ) + { + fenc_row_satds[y] = row_satds; + atomic_add( frame_stats + COST_EST, cost_est ); + atomic_add( frame_stats + COST_EST_AQ, cost_est_aq ); + } +}
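The header comment above notes that every lowres intra function is compiled twice and that the host selects the scalar or vectorized path by defining VECTORIZE when the OpenCL program is built. The sketch below shows one hedged way a host could pass that flag through clBuildProgram; the helper name and the preferred-vector-width heuristic are illustrative assumptions, not the detection logic x264 itself uses.

/* Hypothetical host-side sketch: choose -DVECTORIZE from the device's
 * preferred int vector width (assumed heuristic, for illustration only). */
#include <CL/cl.h>

cl_int build_lookahead_program( cl_program prog, cl_device_id dev )
{
    cl_uint width = 1;
    clGetDeviceInfo( dev, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
                     sizeof(width), &width, NULL );
    /* wide vector ALUs (e.g. AMD pre-Southern Islands) favor the int8 paths */
    const char *opts = (width > 1) ? "-DVECTORIZE=1" : "";
    return clBuildProgram( prog, 1, &dev, opts, NULL, NULL );
}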
x264-snapshot-20130723-2245.tar.bz2/common/opencl/motionsearch.cl
Added
@@ -0,0 +1,249 @@ +/* Hierarchical (iterative) OpenCL lowres motion search */ + +inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height ) +{ + /* edge macroblocks might not have a direct descendant, use nearest */ + x = select( x >> 1, (x - (mb_width&1)) >> 1, x == mb_width-1 ); + y = select( y >> 1, (y - (mb_height&1)) >> 1, y == mb_height-1 ); + return (mb_width>>1) * y + x; +} + +/* Four threads calculate an 8x8 SAD. Each does two rows */ +int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs ) +{ + frefpos.y += idx << 1; + fencpos.y += idx << 1; + int cost = 0; + if( frefpos.x < 0 ) + { + /* slow path when MV goes past left edge. The GPU clamps reads from + * (-1, 0) to (0,0), so you get pixels [0, 1, 2, 3] when what you really + * want are [0, 0, 1, 2] + */ + for( int y = 0; y < 2; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0; + pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0; + cost += abs_diff( enc, ref ); + } + } + } + else + { + uint4 enc, ref, costs = 0; + enc = read_imageui( fenc, sampler, fencpos ); + ref = read_imageui( fref, sampler, frefpos ); + costs += abs_diff( enc, ref ); + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) ); + costs += abs_diff( enc, ref ); + enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) ); + costs += abs_diff( enc, ref ); + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) ); + costs += abs_diff( enc, ref ); + cost = costs.s0 + costs.s1 + costs.s2 + costs.s3; + } + costs[idx] = cost; + return costs[0] + costs[1] + costs[2] + costs[3]; +} + +/* One thread performs 8x8 SAD */ +int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos ) +{ + if( frefpos.x < 0 ) + { + /* slow path when MV goes past left edge */ + int cost = 0; + for( int y = 0; y < 8; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0; + uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0; + cost += abs_diff( enc, ref ); + } + } + return cost; + } + else + { + uint4 enc, ref, cost = 0; + for( int y = 0; y < 8; y++ ) + { + for( int x = 0; x < 8; x += 4 ) + { + enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ); + cost += abs_diff( enc, ref ); + } + } + return cost.s0 + cost.s1 + cost.s2 + cost.s3; + } +} +/* + * hierarchical motion estimation + * + * Each kernel launch is a single iteration + * + * MB per work group is determined by lclx / 4 * lcly + * + * global launch dimensions: [mb_width * 4, mb_height] + */ +kernel void hierarchical_motion( read_only image2d_t fenc, + read_only image2d_t fref, + const global short2 *in_mvs, + global short2 *out_mvs, + global int16_t *out_mv_costs, + global short2 *mvp_buffer, + local int16_t *cost_local, + local short2 *mvc_local, + int mb_width, + int lambda, + int me_range, + int scale, + int b_shift_index, + int b_first_iteration, + int b_reverse_references ) +{ + int mb_x = get_global_id( 0 ) >> 2; + if( mb_x >= mb_width ) + return; + int mb_height = get_global_size( 1 ); + int mb_i = get_global_id( 0 ) & 3; + int mb_y = get_global_id( 1 ); + int mb_xy = mb_y * 
mb_width + mb_x; + const int mb_size = 8; + int2 coord = (int2)(mb_x, mb_y) * mb_size; + + const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); + cost_local += 4 * mb_in_group; + + int i_mvc = 0; + mvc_local += 4 * mb_in_group; + mvc_local[mb_i] = 0; + int2 mvp =0; + + if( !b_first_iteration ) + { +#define MVC( DX, DY )\ + {\ + int px = mb_x + DX;\ + int py = mb_y + DY;\ + mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \ + in_mvs[mb_width * py + px];\ + mvc_local[i_mvc] >>= (short) scale;\ + i_mvc++;\ + } + /* Find MVP from median of MVCs */ + if( b_reverse_references ) + { + /* odd iterations: derive MVP from down and right */ + if( mb_x < mb_width - 1 ) + MVC( 1, 0 ); + if( mb_y < mb_height - 1 ) + { + MVC( 0, 1 ); + if( mb_x > b_shift_index ) + MVC( -1, 1 ); + if( mb_x < mb_width - 1 ) + MVC( 1, 1 ); + } + } + else + { + /* even iterations: derive MVP from up and left */ + if( mb_x > 0 ) + MVC( -1, 0 ); + if( mb_y > 0 ) + { + MVC( 0, -1 ); + if( mb_x < mb_width - 1 ) + MVC( 1, -1 ); + if( mb_x > b_shift_index ) + MVC( -1, -1 ); + } + } +#undef MVC + mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] ); + } + /* current mvp matches the previous mvp and we have not changed scale. We know + * we're going to arrive at the same MV again, so just copy the previous + * result to our output. */ + if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y ) + { + out_mvs[mb_xy] = in_mvs[mb_xy]; + return; + } + mvp_buffer[mb_xy] = convert_short2_sat(mvp); + int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4; + int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4; + + int2 bestmv = clamp(mvp, mv_min, mv_max); + int2 refcrd = coord + bestmv; + + /* measure cost at bestmv */ + int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) + + lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) ); + + do + { + /* measure costs at offsets from bestmv */ + refcrd = coord + bestmv + dia_offs[mb_i]; + int2 trymv = bestmv + dia_offs[mb_i]; + int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) + + lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) ); + + cost_local[mb_i] = (cost<<2) | mb_i; + cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) ); + + if( (cost >> 2) >= bcost ) + break; + + bestmv += dia_offs[cost&3]; + bcost = cost>>2; + + if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y ) + break; + } + while( --me_range > 0 ); + + int2 trymv = 0, diff = 0; + +#define COST_MV_NO_PAD( L )\ + trymv = clamp( trymv, mv_min, mv_max );\ + diff = convert_int2_sat(abs_diff( mvp, trymv ));\ + if( diff.x > 1 || diff.y > 1 ) {\ + int2 refcrd = coord + trymv;\ + int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\ + L * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );\ + if( cost < bcost ) { bcost = cost; bestmv = trymv; } } + + COST_MV_NO_PAD( 0 ); + + if( !b_first_iteration ) + { + /* try cost at previous iteration's MV, if MVP was too far away */ + int2 prevmv = b_shift_index ? 
convert_int2_sat(in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )]) : convert_int2_sat(in_mvs[mb_xy]); + prevmv >>= scale; + trymv = prevmv; + COST_MV_NO_PAD( lambda ); + } + + for( int i = 0; i < i_mvc; i++ ) + { + /* try cost at each candidate MV, if MVP was too far away */ + trymv = convert_int2_sat( mvc_local[i] ); + COST_MV_NO_PAD( lambda ); + } + + if( mb_i == 0 ) + { + bestmv <<= scale; + out_mvs[mb_xy] = convert_short2_sat(bestmv); + out_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ); + } +}
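Inside the refinement loop of hierarchical_motion, each of the four cooperating threads packs its diamond-candidate index into the two low bits of its SAD cost, so a single min() across cost_local recovers both the best cost and the winning offset. A standalone scalar sketch of that packed-min trick, with made-up cost values, is shown below.

/* Scalar sketch of the (cost << 2) | idx packing used by the diamond search;
 * the SAD values here are hypothetical. */
#include <stdio.h>

int main( void )
{
    static const int dia_dx[4] = {  0, -1, 1, 0 };
    static const int dia_dy[4] = { -1,  0, 0, 1 };   /* same order as dia_offs[] */
    int costs[4] = { 412, 397, 455, 403 };
    int best = (costs[0] << 2) | 0;
    for( int i = 1; i < 4; i++ )
        if( ((costs[i] << 2) | i) < best )
            best = (costs[i] << 2) | i;
    printf( "best cost %d at offset (%d,%d)\n",
            best >> 2, dia_dx[best & 3], dia_dy[best & 3] );
    return 0;
}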
x264-snapshot-20130723-2245.tar.bz2/common/opencl/subpel.cl
Added
@@ -0,0 +1,242 @@ +/* OpenCL lowres subpel Refine */ + +/* Each thread performs 8x8 SAD. 4 threads per MB, so the 4 DIA HPEL offsets are + * calculated simultaneously */ +int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos ) +{ + int2 frefpos = qpos >> 2; + int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2); + uint mask_shift = 8 * hpel_idx; + + uint4 cost4 = 0; + + for( int y = 0; y < 8; y++ ) + { + uint4 enc, val4; + enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y)); + val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF; + val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF; + val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF; + val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF; + cost4 += abs_diff( enc, val4 ); + + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y)); + val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF; + val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF; + val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF; + val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF; + cost4 += abs_diff( enc, val4 ); + } + + return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3; +} + +/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */ +int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos ) +{ + int2 frefApos = qpos >> 2; + int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2); + + int2 qposB = qpos + ((qpos & 1) << 1); + int2 frefBpos = qposB >> 2; + int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2); + + uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB; + + int cost = 0; + + for( int y = 0; y < 8; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0; + uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF; + uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF; + cost += abs_diff( enc, rhadd( vA, vB ) ); + } + } + + return cost; +} + +/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane + * + * Each thread collects 1/4 of the rows of diffs and processes one quarter of + * the transforms + */ +int satd_8x8_ii_qpel_coop4( read_only image2d_t fenc, + int2 fencpos, + read_only image2d_t fref_planes, + int2 qpos, + local sum2_t *tmpp, + int idx ) +{ + volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp; + sum2_t b0, b1, b2, b3; + + // fencpos is full-pel position of original MB + // qpos is qpel position within reference frame + int2 frefApos = qpos >> 2; + int hpelA = ((qpos.x&2)>>1) + (qpos.y&2); + + int2 qposB = qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1)); + int2 frefBpos = qposB >> 2; + int hpelB = ((qposB.x&2)>>1) + (qposB.y&2); + + uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB; + + uint vA, vB; + uint a0, a1; + uint enc; + sum2_t sum = 0; + +#define READ_DIFF( OUT, X )\ + enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\ + vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\ + vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) 
).s0 >> mask_shift1) & 0xFF;\ + OUT = enc - rhadd( vA, vB ); + +#define READ_DIFF_EX( OUT, a, b )\ + {\ + READ_DIFF( a0, a );\ + READ_DIFF( a1, b );\ + OUT = a0 + (a1<<BITS_PER_SUM);\ + } +#define ROW_8x4_SATD( a, b )\ + {\ + fencpos.y += a;\ + frefApos.y += b;\ + frefBpos.y += b;\ + READ_DIFF_EX( b0, 0, 4 );\ + READ_DIFF_EX( b1, 1, 5 );\ + READ_DIFF_EX( b2, 2, 6 );\ + READ_DIFF_EX( b3, 3, 7 );\ + HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\ + HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\ + sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\ + } + ROW_8x4_SATD( 0, 0 ); + ROW_8x4_SATD( 4, 4 ); + +#undef READ_DIFF +#undef READ_DIFF_EX +#undef ROW_8x4_SATD + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1; +} + +constant int2 hpoffs[4] = +{ + {0, -2}, {-2, 0}, {2, 0}, {0, 2} +}; + +/* sub pixel refinement of motion vectors, output MVs and costs are moved from + * temporary buffers into final per-frame buffer + * + * global launch dimensions: [mb_width * 4, mb_height] + * + * With X being the source 16x16 pixels, F is the lowres pixel used by the + * motion search. We will now utilize the H V and C pixels (stored in separate + * planes) to search at half-pel increments. + * + * X X X X X X + * F H F H F + * X X X X X X + * V C V C V + * X X X X X X + * F H F H F + * X X X X X X + * + * The YX HPEL bits of the motion vector selects the plane we search in. The + * four planes are packed in the fref_planes 2D image buffer. Each sample + * returns: s0 = F, s1 = H, s2 = V, s3 = C */ +kernel void subpel_refine( read_only image2d_t fenc, + read_only image2d_t fref_planes, + const global short2 *in_mvs, + const global int16_t *in_sad_mv_costs, + local int16_t *cost_local, + local sum2_t *satd_local, + local short2 *mvc_local, + global short2 *fenc_lowres_mv, + global int16_t *fenc_lowres_mv_costs, + int mb_width, + int lambda, + int b, + int ref, + int b_islist1 ) +{ + int mb_x = get_global_id( 0 ) >> 2; + if( mb_x >= mb_width ) + return; + int mb_height = get_global_size( 1 ); + + int mb_i = get_global_id( 0 ) & 3; + int mb_y = get_global_id( 1 ); + int mb_xy = mb_y * mb_width + mb_x; + + /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that + * hold many frames worth of motion vectors. We must offset into the correct + * location for this frame's vectors. The kernel will be passed the correct + * directional buffer for the direction of the search: list1 or list0 + * + * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1] + * GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count] */ + fenc_lowres_mv += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height; + fenc_lowres_mv_costs += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height; + + /* Adjust pointers into local memory buffers for this thread's data */ + int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); + cost_local += mb_in_group * 4; + satd_local += mb_in_group * 16; + mvc_local += mb_in_group * 4; + + int i_mvc = 0; + + mvc_local[0] = mvc_local[1] = mvc_local[2] = mvc_local[3] = 0; + +#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)]; + if( mb_x > 0 ) + MVC( -1, 0 ); + if( mb_y > 0 ) + { + MVC( 0, -1 ); + if( mb_x < mb_width - 1 ) + MVC( 1, -1 ); + if( mb_x > 0 ) + MVC( -1, -1 ); + } +#undef MVC + int2 mvp = (i_mvc <= 1) ? 
convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] ); + + int bcost = in_sad_mv_costs[mb_xy]; + int2 coord = (int2)(mb_x, mb_y) << 3; + int2 bmv = convert_int2_sat( in_mvs[mb_xy] ); + + /* Make mvp and bmv QPEL MV */ + mvp <<= 2; bmv <<= 2; + +#define HPEL_QPEL( ARR, FUNC )\ + {\ + int2 trymv = bmv + ARR[mb_i];\ + int2 qpos = (coord << 2) + trymv;\ + int cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * mv_cost( abs_diff( trymv, mvp ) );\ + cost_local[mb_i] = (cost<<2) + mb_i;\ + cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\ + if( (cost>>2) < bcost )\ + {\ + bmv += ARR[cost&3];\ + bcost = cost>>2;\ + }\ + } + + HPEL_QPEL( hpoffs, sad_8x8_ii_hpel ); + HPEL_QPEL( dia_offs, sad_8x8_ii_qpel ); + fenc_lowres_mv[mb_xy] = convert_short2_sat( bmv ); + + /* remeasure cost of bmv using SATD */ + int2 qpos = (coord << 2) + bmv; + cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i ); + bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3]; + bcost += lambda * mv_cost( abs_diff( bmv, mvp ) ); + + fenc_lowres_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ); +}
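As the comment block above subpel_refine explains, the F, H, V and C lowres planes are packed one byte each into every 32-bit sample of fref_planes, and the half-pel bits of the qpel motion vector select which byte to read. The following standalone C sketch, using a hypothetical packed sample, reproduces the same plane-selection arithmetic as sad_8x8_ii_hpel.

/* Plane selection for a packed F/H/V/C sample; the sample value is made up. */
#include <stdint.h>
#include <stdio.h>

static uint8_t read_hpel( uint32_t packed_sample, int qpel_x, int qpel_y )
{
    int plane = ((qpel_x & 2) >> 1) + (qpel_y & 2);   /* 0=F 1=H 2=V 3=C */
    return (packed_sample >> (8 * plane)) & 0xFF;
}

int main( void )
{
    uint32_t sample = 0x64503C28;   /* C=0x64 V=0x50 H=0x3C F=0x28 */
    printf( "qpel (6,4) selects the H plane: byte %u\n",
            (unsigned)read_hpel( sample, 6, 4 ) );
    return 0;
}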
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/weightp.cl
Added
@@ -0,0 +1,48 @@ +/* Weightp filter a downscaled image into a temporary output buffer. + * This kernel is launched once for each scale. + * + * Launch dimensions: width x height (in pixels) + */ +kernel void weightp_scaled_images( read_only image2d_t in_plane, + write_only image2d_t out_plane, + uint offset, + uint scale, + uint denom ) +{ + int gx = get_global_id( 0 ); + int gy = get_global_id( 1 ); + uint4 input_val; + uint4 output_val; + + input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)); + output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) ); + write_imageui( out_plane, (int2)(gx, gy), output_val ); +} + +/* Weightp filter for the half-pel interpolated image + * + * Launch dimensions: width x height (in pixels) + */ +kernel void weightp_hpel( read_only image2d_t in_plane, + write_only image2d_t out_plane, + uint offset, + uint scale, + uint denom ) +{ + int gx = get_global_id( 0 ); + int gy = get_global_id( 1 ); + uint input_val; + uint output_val; + + input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0; + //Unpack + uint4 temp; + temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff; + temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff; + + temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) ); + + //Pack + output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24); + write_imageui( out_plane, (int2)(gx, gy), output_val ); +}
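Both kernels above apply the same per-sample weighting, out = offset + ((scale * in) >> denom); the hpel variant merely packs four 8-bit samples per 32-bit word. A scalar C sketch of that formula, assuming plain 8-bit buffers (clipping to the pixel range is omitted here):

#include <stddef.h>
#include <stdint.h>

static void weightp_scalar( uint8_t *dst, const uint8_t *src, size_t n,
                            unsigned offset, unsigned scale, unsigned denom )
{
    for( size_t i = 0; i < n; i++ )
        dst[i] = (uint8_t)( offset + ((scale * src[i]) >> denom) );
}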
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/x264-cl.h
Added
@@ -0,0 +1,132 @@ +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable + +constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; + +/* 7.18.1.1 Exact-width integer types */ +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef short int16_t; +typedef unsigned short uint16_t; +typedef int int32_t; +typedef unsigned uint32_t; + +typedef uint8_t pixel; +typedef uint16_t sum_t; +typedef uint32_t sum2_t; + +#define LOWRES_COST_MASK ((1<<14)-1) +#define LOWRES_COST_SHIFT 14 +#define COST_MAX (1<<28) + +#define PIXEL_MAX 255 +#define BITS_PER_SUM (8 * sizeof(sum_t)) + +/* Constants for offsets into frame statistics buffer */ +#define COST_EST 0 +#define COST_EST_AQ 1 +#define INTRA_MBS 2 + +#define COPY2_IF_LT( x, y, a, b )\ + if((y)<(x))\ + {\ + (x) = (y);\ + (a) = (b);\ + } + +constant int2 dia_offs[4] = +{ + {0, -1}, {-1, 0}, {1, 0}, {0, 1}, +}; + +inline pixel x264_clip_pixel( int x ) +{ + return (pixel) clamp( x, (int) 0, (int) PIXEL_MAX ); +} + +inline int2 x264_median_mv( short2 a, short2 b, short2 c ) +{ + short2 t1 = min(a, b); + short2 t2 = min(max(a, b), c); + return convert_int2(max(t1, t2)); +} + +inline sum2_t abs2( sum2_t a ) +{ + sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1); + return (a + s) ^ s; +} + +#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\ + sum2_t t0 = s0 + s1;\ + sum2_t t1 = s0 - s1;\ + sum2_t t2 = s2 + s3;\ + sum2_t t3 = s2 - s3;\ + d0 = t0 + t2;\ + d2 = t0 - t2;\ + d1 = t1 + t3;\ + d3 = t1 - t3;\ +} + +#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\ + int2 t0 = s0 + s1;\ + int2 t1 = s0 - s1;\ + int2 t2 = s2 + s3;\ + int2 t3 = s2 - s3;\ + d0 = t0 + t2;\ + d2 = t0 - t2;\ + d1 = t1 + t3;\ + d3 = t1 - t3;\ +} + +#define SATD_C_8x4_Q( name, q1, q2 )\ + int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\ + {\ + sum2_t tmp[4][4];\ + sum2_t a0, a1, a2, a3;\ + sum2_t sum = 0;\ + for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\ + {\ + a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\ + a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\ + a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\ + a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\ + HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\ + }\ + for( int i = 0; i < 4; i++ )\ + {\ + HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\ + sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\ + }\ + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\ + } + +/* + * Utility function to perform a parallel sum reduction of an array of integers + */ +int parallel_sum( int value, int x, volatile local int *array ) +{ + array[x] = value; + barrier( CLK_LOCAL_MEM_FENCE ); + + int dim = get_local_size( 0 ); + + while( dim > 1 ) + { + dim >>= 1; + + if( x < dim ) + array[x] += array[x + dim]; + + if( dim > 32 ) + barrier( CLK_LOCAL_MEM_FENCE ); + } + + return array[0]; +} + +int mv_cost( uint2 mvd ) +{ + float2 mvdf = (float2)(mvd.x, mvd.y) + 1.0f; + float2 cost = round( log2(mvdf) * 2.0f + 0.718f + (float2)(!!mvd.x, !!mvd.y) ); + return (int) (cost.x + cost.y); +}
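The mv_cost() helper at the end approximates the Exp-Golomb bit cost of a motion-vector difference as round(2*log2(|mvd|+1) + 0.718), plus one bit for the sign of each nonzero component. A scalar C equivalent of the same approximation (a sketch for reference, not the encoder's exact bit count):

#include <math.h>
#include <stdio.h>

static int mv_cost_scalar( unsigned mvdx, unsigned mvdy )
{
    float cx = roundf( log2f( (float)mvdx + 1.0f ) * 2.0f + 0.718f + (mvdx ? 1.0f : 0.0f) );
    float cy = roundf( log2f( (float)mvdy + 1.0f ) * 2.0f + 0.718f + (mvdy ? 1.0f : 0.0f) );
    return (int)( cx + cy );
}

int main( void )
{
    printf( "approx cost of mvd (3,0): %d bits\n", mv_cost_scalar( 3, 0 ) );
    return 0;
}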
View file
x264-snapshot-20130224-2245.tar.bz2/common/osdep.h -> x264-snapshot-20130723-2245.tar.bz2/common/osdep.h
Changed
@@ -79,6 +79,7 @@ #else #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n))) #endif +#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 ) #define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 ) #define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 ) #define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 ) @@ -110,9 +111,26 @@ #define EXPAND(x) x +#if HAVE_32B_STACK_ALIGNMENT +#define ALIGNED_ARRAY_32( type, name, sub1, ... )\ + ALIGNED_32( type name sub1 __VA_ARGS__ ) +#else #define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) ) +#endif + #define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) ) +/* For AVX2 */ +#if ARCH_X86 || ARCH_X86_64 +#define NATIVE_ALIGN 32 +#define ALIGNED_N ALIGNED_32 +#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32 +#else +#define NATIVE_ALIGN 16 +#define ALIGNED_N ALIGNED_16 +#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16 +#endif + #define UNINIT(x) x=x #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0) @@ -204,6 +222,25 @@ #define x264_threading_init() 0 #endif +static ALWAYS_INLINE int x264_pthread_fetch_and_add( int *val, int add, x264_pthread_mutex_t *mutex ) +{ +#if HAVE_THREAD +#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ > 0) && ARCH_X86 + return __sync_fetch_and_add( val, add ); +#else + x264_pthread_mutex_lock( mutex ); + int res = *val; + *val += add; + x264_pthread_mutex_unlock( mutex ); + return res; +#endif +#else + int res = *val; + *val += add; + return res; +#endif +} + #define WORD_SIZE sizeof(void*) #define asm __asm__ @@ -254,6 +291,13 @@ } #endif +/* For values with 4 bits or less. */ +static int ALWAYS_INLINE x264_ctz_4bit( uint32_t x ) +{ + static uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0}; + return lut[x]; +} + #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3) #define x264_clz(x) __builtin_clz(x) #define x264_ctz(x) __builtin_ctz(x)
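The new x264_ctz_4bit() counts trailing zeros of values that fit in 4 bits via a 16-entry table, with 0 mapping to 4. A quick standalone check of that table against a naive loop:

#include <stdio.h>
#include <stdint.h>

static const uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};

static int naive_ctz4( uint32_t x )
{
    if( !x )
        return 4;
    int n = 0;
    while( !(x & 1) )
    {
        x >>= 1;
        n++;
    }
    return n;
}

int main( void )
{
    for( uint32_t x = 0; x < 16; x++ )
        if( lut[x] != naive_ctz4( x ) )
            printf( "mismatch at %u\n", x );
    return 0;
}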
View file
x264-snapshot-20130224-2245.tar.bz2/common/pixel.c -> x264-snapshot-20130723-2245.tar.bz2/common/pixel.c
Changed
@@ -370,7 +370,6 @@ return (sum+2)>>2; } - static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride ) { sum2_t tmp[32]; @@ -501,6 +500,7 @@ #if !HIGH_BIT_DEPTH SATD_X_DECL6( _sse2 ) SATD_X_DECL7( _ssse3 ) +SATD_X_DECL6( _ssse3_atom ) SATD_X_DECL7( _sse4 ) SATD_X_DECL7( _avx ) SATD_X_DECL7( _xop ) @@ -528,6 +528,7 @@ INTRA_MBCMP_8x8( sad,, _c ) INTRA_MBCMP_8x8(sa8d,, _c ) #if HIGH_BIT_DEPTH && HAVE_MMX +#define x264_predict_8x8_v_sse2 x264_predict_8x8_v_sse INTRA_MBCMP_8x8( sad, _mmx2, _c ) INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 ) #endif @@ -554,6 +555,9 @@ #if HAVE_MMX #if HIGH_BIT_DEPTH +#define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse +#define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse +#define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 ) @@ -841,6 +845,7 @@ if( cpu&X264_CPU_MMX2 ) { INIT7( sad, _mmx2 ); + INIT7_NAME( sad_aligned, sad, _mmx2 ); INIT7( sad_x3, _mmx2 ); INIT7( sad_x4, _mmx2 ); INIT8( satd, _mmx2 ); @@ -870,11 +875,14 @@ { INIT4_NAME( sad_aligned, sad, _sse2_aligned ); INIT5( ssd, _sse2 ); + INIT6( satd, _sse2 ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2; #endif pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; @@ -916,10 +924,14 @@ if( cpu&X264_CPU_SSSE3 ) { INIT4_NAME( sad_aligned, sad, _ssse3_aligned ); + pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_ssse3; + pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_ssse3; INIT7( sad, _ssse3 ); INIT7( sad_x3, _ssse3 ); INIT7( sad_x4, _ssse3 ); INIT_ADS( _ssse3 ); + INIT6( satd, _ssse3 ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3; if( !(cpu&X264_CPU_STACK_MOD4) ) { @@ -930,6 +942,9 @@ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3; +#endif pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3; @@ -937,16 +952,24 @@ } if( cpu&X264_CPU_SSE4 ) { + INIT6( satd, _sse4 ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse4; if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _sse4 ); } pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4; +#endif } if( cpu&X264_CPU_AVX ) { + INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs doesn't benefit from an aligned version */ INIT_ADS( _avx ); + INIT6( satd, _avx ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _avx ); @@ -959,12 +982,26 @@ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; pixf->ssim_end4 = x264_pixel_ssim_end4_avx; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx; +#endif } if( cpu&X264_CPU_XOP ) { pixf->vsad = x264_pixel_vsad_xop; pixf->asd8 = x264_pixel_asd8_xop; } + if( cpu&X264_CPU_AVX2 ) 
+ { + INIT2( ssd, _avx2 ); + INIT2( sad, _avx2 ); + INIT2_NAME( sad_aligned, sad, _avx2 ); + INIT2( sad_x3, _avx2 ); + INIT2( sad_x4, _avx2 ); + pixf->vsad = x264_pixel_vsad_avx2; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX @@ -1003,14 +1040,14 @@ INIT4( sad_x3, _cache32_mmx2 ); INIT4( sad_x4, _cache32_mmx2 ); } - else if( cpu&X264_CPU_CACHELINE_64 ) + else if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) ) { INIT5( sad, _cache64_mmx2 ); INIT4( sad_x3, _cache64_mmx2 ); INIT4( sad_x4, _cache64_mmx2 ); } #else - if( cpu&X264_CPU_CACHELINE_64 ) + if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) ) { pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2; pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmx2; @@ -1044,6 +1081,7 @@ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2; #endif pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2; @@ -1060,10 +1098,7 @@ pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; INIT6( satd_x3, _sse2 ); INIT6( satd_x4, _sse2 ); - if( !(cpu&X264_CPU_STACK_MOD4) ) - { - INIT4( hadamard_ac, _sse2 ); - } + INIT4( hadamard_ac, _sse2 ); INIT_ADS( _sse2 ); pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2; @@ -1113,9 +1148,9 @@ if( cpu&X264_CPU_SSSE3 ) { + INIT4( hadamard_ac, _ssse3 ); if( !(cpu&X264_CPU_STACK_MOD4) ) { - INIT4( hadamard_ac, _ssse3 ); pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_ssse3; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3; pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_ssse3; @@ -1124,7 +1159,20 @@ #endif } INIT_ADS( _ssse3 ); - if( !(cpu&X264_CPU_SLOW_ATOM) ) + if( cpu&X264_CPU_SLOW_ATOM ) + { + pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3_atom; + INIT6( satd, _ssse3_atom ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3_atom; + INIT6( satd_x3, _ssse3_atom ); + INIT6( satd_x4, _ssse3_atom ); + INIT4( hadamard_ac, _ssse3_atom ); +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom; +#endif + } + else { INIT8( ssd, _ssse3 ); pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; @@ -1132,9 +1180,13 @@ INIT8( satd, _ssse3 ); INIT7( satd_x3, _ssse3 ); INIT7( satd_x4, _ssse3 ); +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3; +#endif } pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3; - pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3; + if( !(cpu&X264_CPU_SLOW_PSHUFB) ) + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3; pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3; @@ -1147,7 +1199,13 @@ INIT2( sad_x3, _cache64_ssse3 ); INIT2( sad_x4, _cache64_ssse3 ); } - if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) ) + else + { + pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_ssse3; + pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_ssse3; + pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_ssse3; + } + if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) ) { INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */ } @@ -1158,9 +1216,9 @@ INIT8( 
satd, _sse4 ); INIT7( satd_x3, _sse4 ); INIT7( satd_x4, _sse4 ); + INIT4( hadamard_ac, _sse4 ); if( !(cpu&X264_CPU_STACK_MOD4) ) { - INIT4( hadamard_ac, _sse4 ); pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_sse4; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_sse4; pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_sse4; @@ -1171,17 +1229,21 @@ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4; +#endif } if( cpu&X264_CPU_AVX ) { + INIT2_NAME( sad_aligned, sad, _sse2 ); /* AVX-capable CPUs doesn't benefit from an aligned version */ INIT8( satd, _avx ); INIT7( satd_x3, _avx ); INIT7( satd_x4, _avx ); INIT_ADS( _avx ); + INIT4( hadamard_ac, _avx ); if( !(cpu&X264_CPU_STACK_MOD4) ) { - INIT4( hadamard_ac, _avx ); pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_avx; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx; pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx; @@ -1199,6 +1261,9 @@ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; pixf->ssim_end4 = x264_pixel_ssim_end4_avx; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx; +#endif } if( cpu&X264_CPU_XOP ) @@ -1206,9 +1271,9 @@ INIT7( satd, _xop ); INIT7( satd_x3, _xop ); INIT7( satd_x4, _xop ); + INIT4( hadamard_ac, _xop ); if( !(cpu&X264_CPU_STACK_MOD4) ) { - INIT4( hadamard_ac, _xop ); pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop; } INIT5( ssd, _xop ); @@ -1220,6 +1285,30 @@ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop; +#endif + } + + if( cpu&X264_CPU_AVX2 ) + { + INIT2( ssd, _avx2 ); + INIT2( sad_x3, _avx2 ); + INIT2( sad_x4, _avx2 ); + INIT4( satd, _avx2 ); + INIT2( hadamard_ac, _avx2 ); + INIT_ADS( _avx2 ); + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_avx2; + pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx2; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_avx2; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2; +#endif } #endif //HAVE_MMX
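All of the hunks above follow one pattern: a table of function pointers is first filled with portable C fallbacks, then overwritten with progressively better implementations as stronger CPU flags (SSE2, SSSE3, SSE4, AVX, AVX2, ...) are detected, so the last applicable assignment wins. A minimal sketch of that dispatch idea with placeholder names (none of these symbols are the real x264 ones):

#include <stdint.h>

typedef int (*satd_fn)( const uint8_t *a, intptr_t sa, const uint8_t *b, intptr_t sb );

enum { CPU_SSE2 = 1<<0, CPU_AVX2 = 1<<1 };

/* baseline: a trivial 8x8 SAD stands in for the real metric */
static int satd_c( const uint8_t *a, intptr_t sa, const uint8_t *b, intptr_t sb )
{
    int sum = 0;
    for( int y = 0; y < 8; y++, a += sa, b += sb )
        for( int x = 0; x < 8; x++ )
            sum += a[x] > b[x] ? a[x] - b[x] : b[x] - a[x];
    return sum;
}

/* stand-ins for the asm versions; in x264 these are separate optimized symbols */
static int satd_sse2( const uint8_t *a, intptr_t sa, const uint8_t *b, intptr_t sb ) { return satd_c( a, sa, b, sb ); }
static int satd_avx2( const uint8_t *a, intptr_t sa, const uint8_t *b, intptr_t sb ) { return satd_c( a, sa, b, sb ); }

static void pixel_init_sketch( uint32_t cpu, satd_fn *tab )
{
    *tab = satd_c;                          /* always-correct fallback */
    if( cpu & CPU_SSE2 ) *tab = satd_sse2;
    if( cpu & CPU_AVX2 ) *tab = satd_avx2;  /* later, stronger flags override */
}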
View file
x264-snapshot-20130224-2245.tar.bz2/common/pixel.h -> x264-snapshot-20130723-2245.tar.bz2/common/pixel.h
Changed
@@ -90,6 +90,7 @@ x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */ int (*vsad)( pixel *, intptr_t, int ); int (*asd8)( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); + uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); uint64_t (*var[4])( pixel *pix, intptr_t stride ); int (*var2[4])( pixel *pix1, intptr_t stride1,
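The new sa8d_satd entry returns two 32-bit scores packed into a single uint64_t so one asm pass can produce both metrics. A sketch of packing and unpacking such a pair (which half holds which score is an assumption here, not stated in the header):

#include <stdint.h>

static inline uint64_t pack_scores( uint32_t sa8d, uint32_t satd )
{
    return ((uint64_t)satd << 32) | sa8d;
}

static inline void unpack_scores( uint64_t packed, uint32_t *sa8d, uint32_t *satd )
{
    *sa8d = (uint32_t)packed;
    *satd = (uint32_t)(packed >> 32);
}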
View file
x264-snapshot-20130224-2245.tar.bz2/common/quant.c -> x264-snapshot-20130723-2245.tar.bz2/common/quant.c
Changed
@@ -63,6 +63,19 @@ return !!nz; } +static int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ) +{ + int nza = 0; + for( int j = 0; j < 4; j++ ) + { + int nz = 0; + for( int i = 0; i < 16; i++ ) + QUANT_ONE( dct[j][i], mf[i], bias[i] ); + nza |= (!!nz)<<j; + } + return nza; +} + static int quant_4x4_dc( dctcoef dct[16], int mf, int bias ) { int nz = 0; @@ -405,6 +418,7 @@ { pf->quant_8x8 = quant_8x8; pf->quant_4x4 = quant_4x4; + pf->quant_4x4x4 = quant_4x4x4; pf->quant_4x4_dc = quant_4x4_dc; pf->quant_2x2_dc = quant_2x2_dc; @@ -442,11 +456,6 @@ pf->denoise_dct = x264_denoise_dct_mmx; pf->decimate_score15 = x264_decimate_score15_mmx2; pf->decimate_score16 = x264_decimate_score16_mmx2; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz; - pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz; - } pf->decimate_score64 = x264_decimate_score64_mmx2; pf->coeff_last8 = x264_coeff_last8_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; @@ -464,6 +473,7 @@ if( cpu&X264_CPU_SSE2 ) { pf->quant_4x4 = x264_quant_4x4_sse2; + pf->quant_4x4x4 = x264_quant_4x4x4_sse2; pf->quant_8x8 = x264_quant_8x8_sse2; pf->quant_2x2_dc = x264_quant_2x2_dc_sse2; pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; @@ -474,11 +484,6 @@ pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; pf->decimate_score64 = x264_decimate_score64_sse2; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_sse2_slowctz; - pf->decimate_score16 = x264_decimate_score16_sse2_slowctz; - } pf->coeff_last8 = x264_coeff_last8_sse2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2; @@ -501,17 +506,13 @@ if( cpu&X264_CPU_SSSE3 ) { pf->quant_4x4 = x264_quant_4x4_ssse3; + pf->quant_4x4x4 = x264_quant_4x4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3; pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; pf->decimate_score15 = x264_decimate_score15_ssse3; pf->decimate_score16 = x264_decimate_score16_ssse3; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz; - pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz; - } pf->decimate_score64 = x264_decimate_score64_ssse3; INIT_TRELLIS( ssse3 ); } @@ -520,6 +521,7 @@ pf->quant_2x2_dc = x264_quant_2x2_dc_sse4; pf->quant_4x4_dc = x264_quant_4x4_dc_sse4; pf->quant_4x4 = x264_quant_4x4_sse4; + pf->quant_4x4x4 = x264_quant_4x4x4_sse4; pf->quant_8x8 = x264_quant_8x8_sse4; } if( cpu&X264_CPU_AVX ) @@ -535,6 +537,17 @@ pf->dequant_8x8 = x264_dequant_8x8_xop; } } + if( cpu&X264_CPU_AVX2 ) + { + pf->quant_4x4 = x264_quant_4x4_avx2; + pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; + pf->quant_8x8 = x264_quant_8x8_avx2; + pf->quant_4x4x4 = x264_quant_4x4x4_avx2; + pf->dequant_4x4 = x264_dequant_4x4_avx2; + pf->dequant_8x8 = x264_dequant_8x8_avx2; + pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; + pf->denoise_dct = x264_denoise_dct_avx2; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX @@ -543,6 +556,7 @@ { #if ARCH_X86 pf->quant_4x4 = x264_quant_4x4_mmx; + pf->quant_4x4x4 = x264_quant_4x4x4_mmx; pf->quant_8x8 = x264_quant_8x8_mmx; pf->dequant_4x4 = x264_dequant_4x4_mmx; pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2; @@ -563,11 +577,6 @@ pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2; pf->decimate_score15 = x264_decimate_score15_mmx2; pf->decimate_score16 = x264_decimate_score16_mmx2; - if( 
cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz; - pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz; - } pf->decimate_score64 = x264_decimate_score64_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2; @@ -592,6 +601,7 @@ { pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; pf->quant_4x4 = x264_quant_4x4_sse2; + pf->quant_4x4x4 = x264_quant_4x4x4_sse2; pf->quant_8x8 = x264_quant_8x8_sse2; pf->dequant_4x4 = x264_dequant_4x4_sse2; pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2; @@ -606,11 +616,6 @@ pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; pf->decimate_score64 = x264_decimate_score64_sse2; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_sse2_slowctz; - pf->decimate_score16 = x264_decimate_score16_sse2_slowctz; - } pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2; @@ -631,18 +636,25 @@ pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3; pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->quant_4x4 = x264_quant_4x4_ssse3; + pf->quant_4x4x4 = x264_quant_4x4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; pf->decimate_score15 = x264_decimate_score15_ssse3; pf->decimate_score16 = x264_decimate_score16_ssse3; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz; - pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz; - } pf->decimate_score64 = x264_decimate_score64_ssse3; INIT_TRELLIS( ssse3 ); + pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; + pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3; + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; + pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt; + } } if( cpu&X264_CPU_SSE4 ) @@ -673,6 +685,30 @@ pf->dequant_8x8 = x264_dequant_8x8_xop; } } + + if( cpu&X264_CPU_AVX2 ) + { + pf->quant_4x4 = x264_quant_4x4_avx2; + pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; + pf->quant_8x8 = x264_quant_8x8_avx2; + pf->quant_4x4x4 = x264_quant_4x4x4_avx2; + pf->dequant_4x4 = x264_dequant_4x4_avx2; + pf->dequant_8x8 = x264_dequant_8x8_avx2; + pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; + if( h->param.i_cqm_preset == X264_CQM_FLAT ) + { + pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2; + pf->dequant_8x8 = x264_dequant_8x8_flat16_avx2; + } + pf->decimate_score64 = x264_decimate_score64_avx2; + pf->denoise_dct = x264_denoise_dct_avx2; + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt; + } + } #endif // HAVE_MMX #if HAVE_ALTIVEC @@ -696,6 +732,7 @@ pf->quant_2x2_dc = x264_quant_2x2_dc_neon; pf->quant_4x4 = x264_quant_4x4_neon; pf->quant_4x4_dc = x264_quant_4x4_dc_neon; + pf->quant_4x4x4 = x264_quant_4x4x4_neon; pf->quant_8x8 = x264_quant_8x8_neon; pf->dequant_4x4 = x264_dequant_4x4_neon; pf->dequant_4x4_dc = 
x264_dequant_4x4_dc_neon;
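The new quant_4x4x4 hook quantizes four 4x4 blocks in one call and returns a 4-bit mask, with bit j set when block j keeps at least one nonzero coefficient. A toy standalone illustration of that return convention (the divide stands in for the real multiply-and-shift quantizer):

#include <stdio.h>

static int toy_quant_4x4x4( int dct[4][16], int qstep )
{
    int nza = 0;
    for( int j = 0; j < 4; j++ )
    {
        int nz = 0;
        for( int i = 0; i < 16; i++ )
        {
            dct[j][i] /= qstep;
            nz |= dct[j][i];
        }
        nza |= (!!nz) << j;
    }
    return nza;
}

int main( void )
{
    int dct[4][16] = { { 9 }, { 0 }, { 25 }, { 3 } };         /* remaining entries are zero */
    printf( "nz mask = 0x%x\n", toy_quant_4x4x4( dct, 8 ) );  /* blocks 0 and 2 survive -> 0x5 */
    return 0;
}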
View file
x264-snapshot-20130224-2245.tar.bz2/common/quant.h -> x264-snapshot-20130723-2245.tar.bz2/common/quant.h
Changed
@@ -29,8 +29,9 @@ typedef struct { - int (*quant_8x8)( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); - int (*quant_4x4)( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); + int (*quant_8x8) ( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); + int (*quant_4x4) ( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); + int (*quant_4x4x4)( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int (*quant_4x4_dc)( dctcoef dct[16], int mf, int bias ); int (*quant_2x2_dc)( dctcoef dct[4], int mf, int bias );
View file
x264-snapshot-20130224-2245.tar.bz2/common/set.c -> x264-snapshot-20130723-2245.tar.bz2/common/set.c
Changed
@@ -85,44 +85,49 @@ int max_qp_err = -1; int max_chroma_qp_err = -1; int min_qp_err = QP_MAX+1; - int num_8x8_lists = h->sps->i_chroma_format_idc == CHROMA_444 ? 4 : 2; /* Checkasm may segfault if optimized out by --chroma-format */ + int num_8x8_lists = h->sps->i_chroma_format_idc == CHROMA_444 ? 4 + : h->param.analyse.b_transform_8x8 ? 2 : 0; /* Checkasm may segfault if optimized out by --chroma-format */ - for( int i = 0; i < 4 + num_8x8_lists; i++ ) - { - int size = i<4 ? 16 : 64; - int j; - for( j = (i<4 ? 0 : 4); j < i; j++ ) - if( !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) ) - break; - if( j < i ) - { - h-> quant4_mf[i] = h-> quant4_mf[j]; - h->dequant4_mf[i] = h->dequant4_mf[j]; - h->unquant4_mf[i] = h->unquant4_mf[j]; - } - else - { - CHECKED_MALLOC( h-> quant4_mf[i], (QP_MAX+1)*size*sizeof(udctcoef) ); - CHECKED_MALLOC( h->dequant4_mf[i], 6*size*sizeof(int) ); - CHECKED_MALLOC( h->unquant4_mf[i], (QP_MAX+1)*size*sizeof(int) ); - } - - for( j = (i<4 ? 0 : 4); j < i; j++ ) - if( deadzone[j&3] == deadzone[i&3] && - !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) ) - break; - if( j < i ) - { - h->quant4_bias[i] = h->quant4_bias[j]; - h->quant4_bias0[i] = h->quant4_bias0[j]; - } - else - { - CHECKED_MALLOC( h->quant4_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) ); - CHECKED_MALLOC( h->quant4_bias0[i], (QP_MAX+1)*size*sizeof(udctcoef) ); - } +#define CQM_ALLOC( w, count )\ + for( int i = 0; i < count; i++ )\ + {\ + int size = w*w;\ + int start = w == 8 ? 4 : 0;\ + int j;\ + for( j = 0; j < i; j++ )\ + if( !memcmp( h->pps->scaling_list[i+start], h->pps->scaling_list[j+start], size*sizeof(uint8_t) ) )\ + break;\ + if( j < i )\ + {\ + h-> quant##w##_mf[i] = h-> quant##w##_mf[j];\ + h->dequant##w##_mf[i] = h->dequant##w##_mf[j];\ + h->unquant##w##_mf[i] = h->unquant##w##_mf[j];\ + }\ + else\ + {\ + CHECKED_MALLOC( h-> quant##w##_mf[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + CHECKED_MALLOC( h->dequant##w##_mf[i], 6*size*sizeof(int) );\ + CHECKED_MALLOC( h->unquant##w##_mf[i], (QP_MAX+1)*size*sizeof(int) );\ + }\ + for( j = 0; j < i; j++ )\ + if( deadzone[j] == deadzone[i] &&\ + !memcmp( h->pps->scaling_list[i+start], h->pps->scaling_list[j+start], size*sizeof(uint8_t) ) )\ + break;\ + if( j < i )\ + {\ + h->quant##w##_bias[i] = h->quant##w##_bias[j];\ + h->quant##w##_bias0[i] = h->quant##w##_bias0[j];\ + }\ + else\ + {\ + CHECKED_MALLOC( h->quant##w##_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + CHECKED_MALLOC( h->quant##w##_bias0[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + }\ } + CQM_ALLOC( 4, 4 ) + CQM_ALLOC( 8, num_8x8_lists ) + for( int q = 0; q < 6; q++ ) { for( int i = 0; i < 16; i++ ) @@ -204,6 +209,9 @@ for( int cat = 0; cat < 3 + CHROMA444; cat++ ) { int dct8x8 = cat&1; + if( !h->param.analyse.b_transform_8x8 && dct8x8 ) + continue; + int size = dct8x8 ? 64 : 16; udctcoef *nr_offset = h->nr_offset_emergency[q][cat]; /* Denoise chroma first (due to h264's chroma QP offset), then luma, then DC. */
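The CQM_ALLOC macro above shares tables between scaling lists: before allocating buffers for list i it memcmp()s the list against every earlier one and, on a match, reuses the existing pointers instead of allocating duplicates. A compact C sketch of that share-or-allocate step (names and the fixed 4x4 list size are illustrative):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static int *alloc_or_share( uint8_t lists[][16], int *tables[], int i, int table_size )
{
    for( int j = 0; j < i; j++ )
        if( !memcmp( lists[i], lists[j], 16 ) )
            return tables[j];                   /* identical scaling list: share */
    return calloc( table_size, sizeof(int) );   /* otherwise allocate a new table */
}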
View file
x264-snapshot-20130224-2245.tar.bz2/common/win32thread.c -> x264-snapshot-20130723-2245.tar.bz2/common/win32thread.c
Changed
@@ -279,7 +279,7 @@ memset( &thread_control, 0, sizeof(x264_win32thread_control_t) ); } -int x264_pthread_num_processors_np() +int x264_pthread_num_processors_np( void ) { DWORD_PTR system_cpus, process_cpus = 0; int cpus = 0;
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/bitstream-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/bitstream-a.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2010-2013 x264 project ;* ;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -32,100 +32,105 @@ ;----------------------------------------------------------------------------- ; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end ) ;----------------------------------------------------------------------------- - %macro NAL_LOOP 2 -%1_escape: +%%escape: ; Detect false positive to avoid unneccessary escape loop xor r3d, r3d cmp byte [r0+r1-1], 0 setnz r3b - xor r3d, r4d + xor k3, k4 jnz .escape - jmp %1_continue + jmp %%continue ALIGN 16 %1: - pcmpeqb m3, m1, m4 - pcmpeqb m2, m0, m4 - pmovmskb r3d, m3 - %2 [r0+r1], m0 + mova [r0+r1+mmsize], m1 + pcmpeqb m1, m0 + mova [r0+r1], m2 + pcmpeqb m2, m0 + pmovmskb r3d, m1 + %2 m1, [r1+r2+3*mmsize] pmovmskb r4d, m2 - shl r3d, mmsize - mova m0, [r1+r2+2*mmsize] - or r4d, r3d - %2 [r0+r1+mmsize], m1 - lea r3d, [r4+r4+1] - mova m1, [r1+r2+3*mmsize] - and r4d, r3d - jnz %1_escape -%1_continue: + %2 m2, [r1+r2+2*mmsize] + shl k3, mmsize + or k3, k4 + lea k4, [2*r3+1] + and k4, k3 + jnz %%escape +%%continue: add r1, 2*mmsize jl %1 %endmacro %macro NAL_ESCAPE 0 +%if mmsize == 32 + %xdefine k3 r3 + %xdefine k4 r4 +%else + %xdefine k3 r3d + %xdefine k4 r4d +%endif cglobal nal_escape, 3,5 - mov r3w, [r1] + movzx r3d, byte [r1] sub r1, r2 ; r1 = offset of current src pointer from end of src - pxor m4, m4 + pxor m0, m0 + mov [r0], r3b sub r0, r1 ; r0 = projected end of dst, assuming no more escapes - mov [r0+r1], r3w - add r1, 2 - jge .ret + or r3d, 0xffffff00 ; ignore data before src - ; Start off by jumping into the escape loop in - ; case there's an escape at the start. - ; And do a few more in scalar until src is aligned again. - jmp .first_escape + ; Start off by jumping into the escape loop in case there's an escape at the start. + ; And do a few more in scalar until dst is aligned. + jmp .escape_loop +%if mmsize == 16 NAL_LOOP .loop_aligned, mova -%if mmsize==16 jmp .ret - NAL_LOOP .loop_unaligned, movu %endif + NAL_LOOP .loop_unaligned, movu .ret: movifnidn rax, r0 RET -ALIGN 16 .escape: ; Skip bytes that are known to be valid - and r4d, r3d - tzcnt r3d, r4d - add r1, r3 + and k4, k3 + tzcnt k4, k4 + xor r3d, r3d ; the last two bytes are known to be zero + add r1, r4 .escape_loop: inc r1 jge .ret -.first_escape: - movzx r3d, byte [r1+r2] - lea r4, [r1+r2] - cmp r3d, 3 - jna .escape_check -.no_escape: + movzx r4d, byte [r1+r2] + shl r3d, 8 + or r3d, r4d + test r3d, 0xfffffc ; if the last two bytes are 0 and the current byte is <=3 + jz .add_escape_byte +.escaped: + lea r4d, [r0+r1] mov [r0+r1], r3b - test r4d, mmsize-1 ; Do SIMD when src is aligned + test r4d, mmsize-1 ; Do SIMD when dst is aligned jnz .escape_loop - mova m0, [r4] - mova m1, [r4+mmsize] -%if mmsize==16 - lea r4d, [r0+r1] + movu m1, [r1+r2+mmsize] + movu m2, [r1+r2] +%if mmsize == 16 + lea r4d, [r1+r2] test r4d, mmsize-1 - jnz .loop_unaligned + jz .loop_aligned %endif - jmp .loop_aligned + jmp .loop_unaligned -ALIGN 16 -.escape_check: - cmp word [r0+r1-2], 0 - jnz .no_escape +.add_escape_byte: mov byte [r0+r1], 3 - inc r0 - jmp .no_escape + inc r0 + or r3d, 0x0300 + jmp .escaped %endmacro INIT_MMX mmx2 NAL_ESCAPE INIT_XMM sse2 NAL_ESCAPE -INIT_XMM avx +%if ARCH_X86_64 +INIT_YMM avx2 NAL_ESCAPE +%endif
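The rewritten nal_escape above still implements standard H.264 emulation prevention: whenever two consecutive zero bytes are followed by a byte in 0x00..0x03, an extra 0x03 byte is inserted so the payload can never mimic a start code. A plain scalar reference of that rule (a sketch, not the function the asm is derived from):

#include <stdint.h>

static uint8_t *nal_escape_ref( uint8_t *dst, const uint8_t *src, const uint8_t *end )
{
    while( src < end )
    {
        if( src + 2 < end && !src[0] && !src[1] && src[2] <= 3 )
        {
            *dst++ = 0;
            *dst++ = 0;
            *dst++ = 3;   /* emulation prevention byte */
            src += 2;     /* the 0x00..0x03 byte is copied on the next pass */
        }
        else
            *dst++ = *src++;
    }
    return dst;
}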
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/cabac-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/cabac-a.asm
Changed
@@ -26,22 +26,69 @@ ;***************************************************************************** %include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +coeff_abs_level1_ctx: db 1, 2, 3, 4, 0, 0, 0, 0 +coeff_abs_levelgt1_ctx: db 5, 5, 5, 5, 6, 7, 8, 9 +coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7 + db 4, 4, 4, 4, 5, 6, 7, 7 + +%if ARCH_X86_64 +%macro COEFF_LAST_TABLE 17 + %define funccpu1 %1 + %define funccpu2 %2 + %define funccpu3 %3 + %rep 14 + %ifidn %4, 4 + dq mangle(x264_coeff_last%4_ %+ funccpu1) + %elifidn %4, 64 + dq mangle(x264_coeff_last%4_ %+ funccpu2) + %else + dq mangle(x264_coeff_last%4_ %+ funccpu3) + %endif + %rotate 1 + %endrep +%endmacro + +cextern coeff_last4_mmx2 +cextern coeff_last4_mmx2_lzcnt +cextern coeff_last15_sse2 +cextern coeff_last15_sse2_lzcnt +cextern coeff_last16_sse2 +cextern coeff_last16_sse2_lzcnt +cextern coeff_last64_sse2 +cextern coeff_last64_sse2_lzcnt +cextern coeff_last64_avx2_lzcnt + +%ifdef PIC +SECTION .data +%endif +coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +%endif SECTION .text cextern cabac_range_lps cextern cabac_transition cextern cabac_renorm_shift +cextern cabac_entropy +cextern cabac_size_unary +cextern cabac_transition_unary +cextern significant_coeff_flag_offset +cextern significant_coeff_flag_offset_8x8 +cextern last_coeff_flag_offset +cextern last_coeff_flag_offset_8x8 +cextern coeff_abs_level_m1_offset +cextern count_cat_m1 +cextern cabac_encode_ue_bypass -; t3 must be ecx, since it's used for shift. -%if WIN64 - DECLARE_REG_TMP 3,1,2,0,6,5,4,2 - %define pointer resq -%elif ARCH_X86_64 - DECLARE_REG_TMP 0,1,2,3,4,5,6,6 +%if ARCH_X86_64 %define pointer resq %else - DECLARE_REG_TMP 0,4,2,1,3,5,6,2 %define pointer resd %endif @@ -58,24 +105,34 @@ .state: resb 1024 endstruc -%macro LOAD_GLOBAL 4 +%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp %ifdef PIC - ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea - lea r7, [%2] - %ifnidn %3, 0 - add r7, %3 + %ifidn %4, 0 + movzx %1, byte [%2+%3+r7-$$] + %else + lea %5, [r7+%4] + movzx %1, byte [%2+%3+%5-$$] %endif - movzx %1, byte [r7+%4] %else movzx %1, byte [%2+%3+%4] %endif %endmacro -cglobal cabac_encode_decision_asm, 0,7 - movifnidn t0, r0mp +%macro CABAC 1 +; t3 must be ecx, since it's used for shift. 
+%if WIN64 + DECLARE_REG_TMP 3,1,2,0,5,6,4,4 +%elif ARCH_X86_64 + DECLARE_REG_TMP 0,1,2,3,4,5,6,6 +%else + DECLARE_REG_TMP 0,4,2,1,3,5,6,2 +%endif + +cglobal cabac_encode_decision_%1, 1,7 movifnidn t1d, r1m - mov t5d, [t0+cb.range] - movzx t6d, byte [t0+cb.state+t1] + mov t5d, [r0+cb.range] + movzx t6d, byte [r0+cb.state+t1] + movifnidn t0, r0 ; WIN64 mov t4d, ~1 mov t3d, t5d and t4d, t6d @@ -84,8 +141,11 @@ %if WIN64 PUSH r7 %endif - LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2 - LOAD_GLOBAL t4d, cabac_transition, t2, t6*2 +%ifdef PIC + lea r7, [$$] +%endif + LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4 + LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4 and t6d, 1 sub t3d, t5d cmp t6d, t2d @@ -96,66 +156,82 @@ mov [t0+cb.state+t1], t4b ;cabac_encode_renorm mov t4d, t3d +%ifidn %1, bmi2 + lzcnt t3d, t3d + sub t3d, 23 + shlx t4d, t4d, t3d + shlx t6d, t6d, t3d +%else shr t3d, 3 - LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3 + LOAD_GLOBAL t3d, cabac_renorm_shift, t3 + shl t4d, t3b + shl t6d, t3b +%endif %if WIN64 POP r7 %endif - shl t4d, t3b - shl t6d, t3b mov [t0+cb.range], t4d add t3d, [t0+cb.queue] - jge cabac_putbyte + jge cabac_putbyte_%1 .update_queue_low: mov [t0+cb.low], t6d mov [t0+cb.queue], t3d RET -cglobal cabac_encode_bypass_asm, 0,3 - movifnidn t0, r0mp - movifnidn t3d, r1m - mov t7d, [t0+cb.low] - and t3d, [t0+cb.range] - lea t7d, [t7*2+t3] - mov t3d, [t0+cb.queue] +cglobal cabac_encode_bypass_%1, 2,3 + mov t7d, [r0+cb.low] + and r1d, [r0+cb.range] + lea t7d, [t7*2+r1] + movifnidn t0, r0 ; WIN64 + mov t3d, [r0+cb.queue] inc t3d -%if UNIX64 ; .putbyte compiles to nothing but a jmp - jge cabac_putbyte +%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp + jge cabac_putbyte_%1 %else jge .putbyte %endif mov [t0+cb.low], t7d mov [t0+cb.queue], t3d RET +%if ARCH_X86_64 == 0 .putbyte: PROLOGUE 0,7 movifnidn t6d, t7d - jmp cabac_putbyte + jmp cabac_putbyte_%1 +%endif -cglobal cabac_encode_terminal_asm, 0,3 - movifnidn t0, r0mp - sub dword [t0+cb.range], 2 +%ifnidn %1,bmi2 +cglobal cabac_encode_terminal_%1, 1,3 + sub dword [r0+cb.range], 2 ; shortcut: the renormalization shift in terminal ; can only be 0 or 1 and is zero over 99% of the time. - test dword [t0+cb.range], 0x100 + test dword [r0+cb.range], 0x100 je .renorm RET .renorm: - shl dword [t0+cb.low], 1 - shl dword [t0+cb.range], 1 - inc dword [t0+cb.queue] + shl dword [r0+cb.low], 1 + shl dword [r0+cb.range], 1 + inc dword [r0+cb.queue] jge .putbyte RET .putbyte: PROLOGUE 0,7 - mov t3d, [t0+cb.queue] + movifnidn t0, r0 ; WIN64 + mov t3d, [r0+cb.queue] mov t6d, [t0+cb.low] +%endif -cabac_putbyte: +cabac_putbyte_%1: ; alive: t0=cb t3=queue t6=low %if WIN64 DECLARE_REG_TMP 3,6,1,0,2,5,4 %endif +%ifidn %1, bmi2 + add t3d, 10 + shrx t2d, t6d, t3d + bzhi t6d, t6d, t3d + sub t3d, 18 +%else mov t1d, -1 add t3d, 10 mov t2d, t6d @@ -164,6 +240,7 @@ not t1d sub t3d, 18 and t6d, t1d +%endif mov t5d, [t0+cb.bytes_outstanding] cmp t2b, 0xff ; FIXME is a 32bit op faster? jz .postpone @@ -180,4 +257,500 @@ .postpone: inc t5d mov [t0+cb.bytes_outstanding], t5d - jmp mangle(x264_cabac_encode_decision_asm.update_queue_low) + jmp mangle(x264_cabac_encode_decision_%1.update_queue_low) +%endmacro + +CABAC asm +CABAC bmi2 + +; %1 = label name +; %2 = node_ctx init? 
+%macro COEFF_ABS_LEVEL_GT1 2 +%if %2 + %define ctx 1 +%else + movzx r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL] + %define ctx r11 +%endif + movzx r9d, byte [r8+ctx] +; if( coeff_abs > 1 ) + cmp r1d, 1 + jg .%1_gt1 +; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 ) + movzx r10d, byte [cabac_transition+r9*2 GLOBAL] + movzx r9d, word [cabac_entropy+r9*2 GLOBAL] + lea r0d, [r0+r9+256] + mov [r8+ctx], r10b +%if %2 + mov r2d, 1 +%else + movzx r2d, byte [coeff_abs_level_transition+r2 GLOBAL] +%endif + jmp .%1_end + +.%1_gt1: +; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 ) + movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL] + xor r9d, 1 + movzx r9d, word [cabac_entropy+r9*2 GLOBAL] + mov [r8+ctx], r10b + add r0d, r9d +%if %2 + %define ctx 5 +%else + movzx r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL] + %define ctx r11 +%endif +; if( coeff_abs < 15 ) + cmp r1d, 15 + jge .%1_escape + shl r1d, 7 +; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]] + movzx r9d, byte [r8+ctx] + add r9d, r1d + movzx r10d, byte [cabac_transition_unary-128+r9 GLOBAL] +; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]] + movzx r9d, word [cabac_size_unary-256+r9*2 GLOBAL] + mov [r8+ctx], r10b + add r0d, r9d + jmp .%1_gt1_end + +.%1_escape: +; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]] + movzx r9d, byte [r8+ctx] + movzx r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL] +; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]] + movzx r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL] + add r0d, r9d + mov [r8+ctx], r10b + sub r1d, 14 +%if cpuflag(lzcnt) + lzcnt r9d, r1d + xor r9d, 0x1f +%else + bsr r9d, r1d +%endif +; bs_size_ue_big(coeff_abs-15)<<8 + shl r9d, 9 +; (ilog2(coeff_abs-14)+1) << 8 + lea r0d, [r0+r9+256] +.%1_gt1_end: +%if %2 + mov r2d, 4 +%else + movzx r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL] +%endif +.%1_end: +%endmacro + +%macro LOAD_DCTCOEF 1 +%if HIGH_BIT_DEPTH + mov %1, [dct+r6*4] +%else + movzx %1, word [dct+r6*2] +%endif +%endmacro + +%macro ABS_DCTCOEFS 2 +%assign i 0 +%rep %2/16 +%if HIGH_BIT_DEPTH + ABSD m0, [%1+ 0+i*64], m4 + ABSD m1, [%1+16+i*64], m5 + ABSD m2, [%1+32+i*64], m4 + ABSD m3, [%1+48+i*64], m5 + mova [rsp+ 0+i*64], m0 + mova [rsp+16+i*64], m1 + mova [rsp+32+i*64], m2 + mova [rsp+48+i*64], m3 +%else + ABSW m0, [%1+ 0+i*32], m2 + ABSW m1, [%1+16+i*32], m3 + mova [rsp+ 0+i*32], m0 + mova [rsp+16+i*32], m1 +%endif +%assign i i+1 +%endrep +%endmacro + +%macro SIG_OFFSET 1 +%if %1 + movzx r11d, byte [r4+r6] +%endif +%endmacro + +%macro LAST_OFFSET 1 +%if %1 + movzx r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL] +%endif +%endmacro + +;----------------------------------------------------------------------------- +; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, +; int ctx_block_cat, x264_cabac_t *cb ); +;----------------------------------------------------------------------------- + +;%1 = 8x8 mode +%macro CABAC_RESIDUAL_RD 2 +%if %1 + %define func cabac_block_residual_8x8_rd_internal + %define maxcoeffs 64 + %define dct rsp +%else + %define func cabac_block_residual_rd_internal + %define maxcoeffs 16 + %define dct r4 +%endif + +%ifdef PIC + cglobal func, 4,13 + lea r12, [$$] + %define GLOBAL +r12-$$ +%else + cglobal func, 4,12 + %define GLOBAL +%endif + +%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15) + SUB rsp, pad + shl r1d, 4 ; MB_INTERLACED*16 +%if %1 + lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8 +%endif + add r1d, r2d + movzx r5d, word 
[significant_coeff_flag_offset+r1*2 GLOBAL] ; r5 = ctx_sig + movzx r7d, word [last_coeff_flag_offset+r1*2 GLOBAL] ; r7 = ctx_last + movzx r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL] ; r8 = ctx_level + +; abs() all the coefficients; copy them to the stack to avoid +; changing the originals. +; overreading is okay; it's all valid aligned data anyways. +%if %1 + ABS_DCTCOEFS r0, 64 +%else + mov r4, r0 ; r4 = dct + mov r6, ~SIZEOF_DCTCOEF + and r6, r4 ; handle AC coefficient case + ABS_DCTCOEFS r6, 16 + sub r4, r6 ; calculate our new dct pointer + add r4, rsp ; restore AC coefficient offset +%endif + mov r1, [%2+gprsize*r2 GLOBAL] +; for improved OOE performance, run coeff_last on the original coefficients. + call r1 ; coeff_last[ctx_block_cat]( dct ) +; we know on 64-bit that the SSE2 versions of this function only +; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we +; don't need r2 in 8x8 mode. + mov r0d, [r3+cb.bits_encoded] ; r0 = cabac.f8_bits_encoded +; pre-add some values to simplify addressing + add r3, cb.state + add r5, r3 + add r7, r3 + add r8, r3 ; precalculate cabac state pointers + +; if( last != count_cat_m1[ctx_block_cat] ) +%if %1 + cmp r6b, 63 +%else + cmp r6b, [count_cat_m1+r2 GLOBAL] +%endif + je .skip_last_sigmap + +; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last, +; so we'll use r11 for this. +%if %1 + %define siglast_ctx r11 +%else + %define siglast_ctx r6 +%endif + +; x264_cabac_encode_decision( cb, ctx_sig + last, 1 ) +; x264_cabac_encode_decision( cb, ctx_last + last, 1 ) + SIG_OFFSET %1 + movzx r1d, byte [r5+siglast_ctx] + movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL] + xor r1d, 1 + movzx r1d, word [cabac_entropy+r1*2 GLOBAL] + mov [r5+siglast_ctx], r9b + add r0d, r1d + + LAST_OFFSET %1 + movzx r1d, byte [r7+siglast_ctx] + movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL] + xor r1d, 1 + movzx r1d, word [cabac_entropy+r1*2 GLOBAL] + mov [r7+siglast_ctx], r9b + add r0d, r1d +.skip_last_sigmap: + LOAD_DCTCOEF r1d + COEFF_ABS_LEVEL_GT1 last, 1 +; for( int i = last-1 ; i >= 0; i-- ) + dec r6d + jl .end +.coeff_loop: + LOAD_DCTCOEF r1d +; if( l[i] ) + SIG_OFFSET %1 + movzx r9d, byte [r5+siglast_ctx] + test r1d, r1d + jnz .coeff_nonzero +; x264_cabac_encode_decision( cb, ctx_sig + i, 0 ) + movzx r10d, byte [cabac_transition+r9*2 GLOBAL] + movzx r9d, word [cabac_entropy+r9*2 GLOBAL] + mov [r5+siglast_ctx], r10b + add r0d, r9d + dec r6d + jge .coeff_loop + jmp .end +.coeff_nonzero: +; x264_cabac_encode_decision( cb, ctx_sig + i, 1 ) + movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL] + xor r9d, 1 + movzx r9d, word [cabac_entropy+r9*2 GLOBAL] + mov [r5+siglast_ctx], r10b + add r0d, r9d +; x264_cabac_encode_decision( cb, ctx_last + i, 0 ); + LAST_OFFSET %1 + movzx r9d, byte [r7+siglast_ctx] + movzx r10d, byte [cabac_transition+r9*2 GLOBAL] + movzx r9d, word [cabac_entropy+r9*2 GLOBAL] + mov [r7+siglast_ctx], r10b + add r0d, r9d + COEFF_ABS_LEVEL_GT1 coeff, 0 + dec r6d + jge .coeff_loop +.end: + mov [r3+cb.bits_encoded-cb.state], r0d + ADD rsp, pad + RET +%endmacro + +%if ARCH_X86_64 +INIT_XMM sse2 +CABAC_RESIDUAL_RD 0, coeff_last_sse2 +CABAC_RESIDUAL_RD 1, coeff_last_sse2 +INIT_XMM sse2,lzcnt +CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt +CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt +INIT_XMM ssse3 +CABAC_RESIDUAL_RD 0, coeff_last_sse2 +CABAC_RESIDUAL_RD 1, coeff_last_sse2 +INIT_XMM ssse3,lzcnt +CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt +CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt +%endif + 
+;----------------------------------------------------------------------------- +; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, +; int ctx_block_cat, x264_cabac_t *cb ); +;----------------------------------------------------------------------------- + +%macro CALL_CABAC 0 +%if cpuflag(bmi2) + call cabac_encode_decision_bmi2 +%else + call cabac_encode_decision_asm +%endif +%if WIN64 ; move cabac back + mov r0, r3 +%endif +%endmacro + +; %1 = 8x8 mode +; %2 = dct register +; %3 = countcat +; %4 = name +%macro SIGMAP_LOOP 3-4 +.sigmap_%4loop: +%if HIGH_BIT_DEPTH + mov %2, [dct+r10*4] +%else + movsx %2, word [dct+r10*2] +%endif +%if %1 + movzx r1d, byte [sigoff_8x8 + r10] + add r1d, sigoffd +%else + lea r1d, [sigoffd + r10d] +%endif + test %2, %2 + jz .sigmap_%4zero ; if( l[i] ) + inc coeffidxd + mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i]; + mov r2d, 1 + CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 ); +%if %1 + movzx r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL] + add r1d, lastoffd +%else + lea r1d, [lastoffd + r10d] +%endif + cmp r10d, lastm ; if( i == last ) + je .sigmap_%4last + xor r2d, r2d + CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 ); + jmp .sigmap_%4loop_endcheck +.sigmap_%4zero: + xor r2d, r2d + CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 ); +.sigmap_%4loop_endcheck: + inc r10d + cmp r10d, %3 + jne .sigmap_%4loop ; if( ++i == count_m1 ) +%if HIGH_BIT_DEPTH + mov %2, [dct+r10*4] +%else + movsx %2, word [dct+r10*2] +%endif + inc coeffidxd + mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i] + jmp .sigmap_%4end +.sigmap_%4last: ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 ); + mov r2d, 1 + CALL_CABAC +.sigmap_%4end: +%if %1==0 + jmp .level_loop_start +%endif +%endmacro + +%macro CABAC_RESIDUAL 1 +cglobal cabac_block_residual_internal, 4,15 +%ifdef PIC +; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register. + lea r7, [$$] + %define lastm [rsp+4*1] + %define GLOBAL +r7-$$ +%else + %define lastm r7d + %define GLOBAL +%endif +%assign pad gprsize+4*2+4*64-(stack_offset&15) + SUB rsp, pad + shl r1d, 4 + + %define sigoffq r8 + %define sigoffd r8d + %define lastoffq r9 + %define lastoffd r9d + %define leveloffq r10 + %define leveloffd r10d + %define leveloffm [rsp+4*0] + %define countcatd r11d + %define sigoff_8x8 r12 + %define coeffidxq r13 + %define coeffidxd r13d + %define dct r14 + %define coeffs rsp+4*2 + + lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] + add r1d, r2d + movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL] + movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL] + movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL] + movzx countcatd, byte [count_cat_m1+r2 GLOBAL] + mov coeffidxd, -1 + mov dct, r0 + mov leveloffm, leveloffd + + mov r1, [%1+gprsize*r2 GLOBAL] + call r1 + mov lastm, eax +; put cabac in r0; needed for cabac_encode_decision + mov r0, r3 + + xor r10d, r10d + cmp countcatd, 63 + je .sigmap_8x8 + SIGMAP_LOOP 0, r12d, countcatd, +.sigmap_8x8: + SIGMAP_LOOP 1, r11d, 63, _8x8 +.level_loop_start: +; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop. 
+ %define nodectxq r8 + %define nodectxd r8d + mov leveloffd, leveloffm + xor nodectxd, nodectxd +.level_loop: + mov r9d, [coeffs+coeffidxq*4] + mov r11d, r9d + sar r11d, 31 + add r9d, r11d + movzx r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL] + xor r9d, r11d + add r1d, leveloffd + cmp r9d, 1 + jg .level_gt1 + xor r2d, r2d + CALL_CABAC + movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL] + jmp .level_sign +.level_gt1: + mov r2d, 1 + CALL_CABAC + movzx r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL] + add r14d, leveloffd + cmp r9d, 15 + mov r12d, 15 + cmovl r12d, r9d + sub r12d, 2 + jz .level_eq2 +.level_gt1_loop: + mov r1d, r14d + mov r2d, 1 + CALL_CABAC + dec r12d + jg .level_gt1_loop + cmp r9d, 15 + jge .level_bypass +.level_eq2: + mov r1d, r14d + xor r2d, r2d + CALL_CABAC + jmp .level_gt1_end +.level_bypass: + lea r2d, [r9d-15] + xor r1d, r1d + push r0 +; we could avoid this if we implemented it in asm, but I don't feel like that +; right now. +%if UNIX64 + push r7 + push r8 +%else + sub rsp, 32 ; shadow space +%endif + call cabac_encode_ue_bypass +%if UNIX64 + pop r8 + pop r7 +%else + add rsp, 32 +%endif + pop r0 +.level_gt1_end: + movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL] +.level_sign: + mov r1d, r11d +%if cpuflag(bmi2) + call cabac_encode_bypass_bmi2 +%else + call cabac_encode_bypass_asm +%endif +%if WIN64 + mov r0, r3 +%endif + dec coeffidxd + jge .level_loop + ADD rsp, pad + RET +%endmacro + +%if ARCH_X86_64 +INIT_XMM sse2 +CABAC_RESIDUAL coeff_last_sse2 +INIT_XMM sse2,lzcnt +CABAC_RESIDUAL coeff_last_sse2_lzcnt +INIT_XMM avx2,bmi2 +CABAC_RESIDUAL coeff_last_avx2_lzcnt +%endif
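One detail worth calling out in the bmi2 path above: the renormalization shift is no longer a table lookup but lzcnt(range) - 23, which works because the CABAC range only needs to be shifted back up into [0x100, 0x200). A standalone check of that identity over all 9-bit range values (the loop-based lzcnt is just a portable stand-in for the instruction):

#include <stdio.h>

static int naive_shift( unsigned r )   /* shifts needed to reach >= 0x100 */
{
    int s = 0;
    while( r < 0x100 )
    {
        r <<= 1;
        s++;
    }
    return s;
}

static int lzcnt32( unsigned r )
{
    int n = 0;
    for( unsigned mask = 1u << 31; !(r & mask); mask >>= 1 )
        n++;
    return n;
}

int main( void )
{
    for( unsigned r = 1; r < 512; r++ )
        if( naive_shift( r ) != lzcnt32( r ) - 23 )
            printf( "mismatch at %u\n", r );
    return 0;
}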
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/const-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/const-a.asm
Changed
@@ -26,39 +26,53 @@ %include "x86inc.asm" -SECTION_RODATA +SECTION_RODATA 32 + +const pb_1, times 32 db 1 +const hsub_mul, times 16 db 1, -1 +const pw_1, times 16 dw 1 +const pw_16, times 16 dw 16 +const pw_32, times 16 dw 32 +const pw_512, times 16 dw 512 +const pw_00ff, times 16 dw 0x00ff +const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1) +const pd_1, times 8 dd 1 +const deinterleave_shufd, dd 0,4,1,5,2,6,3,7 +const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 +const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 const pb_01, times 8 db 0,1 const pb_0, times 16 db 0 const pb_a1, times 16 db 0xa1 -const pb_1, times 16 db 1 const pb_3, times 16 db 3 -const hsub_mul, times 8 db 1, -1 const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 -const pw_1, times 8 dw 1 const pw_2, times 8 dw 2 const pw_m2, times 8 dw -2 const pw_4, times 8 dw 4 const pw_8, times 8 dw 8 -const pw_16, times 8 dw 16 -const pw_32, times 8 dw 32 const pw_64, times 8 dw 64 +const pw_256, times 8 dw 256 const pw_32_0, times 4 dw 32, times 4 dw 0 const pw_8000, times 8 dw 0x8000 const pw_3fff, times 8 dw 0x3fff -const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1) const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1 const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1 const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0 -const pd_1, times 4 dd 1 const pd_32, times 4 dd 32 const pd_1024, times 4 dd 1024 const pd_ffff, times 4 dd 0xffff -const pw_00ff, times 8 dw 0x00ff const pw_ff00, times 8 dw 0xff00 +const popcnt_table +%assign x 0 +%rep 256 +; population count +db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1) +%assign x x+1 +%endrep + const sw_64, dd 64
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/cpu-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/cpu-a.asm
Changed
@@ -66,7 +66,27 @@ mov [r4], edx RET -%if ARCH_X86_64 == 0 +%if ARCH_X86_64 + +;----------------------------------------------------------------------------- +; void stack_align( void (*func)(void*), void *arg ); +;----------------------------------------------------------------------------- +cglobal stack_align + push rbp + mov rbp, rsp +%if WIN64 + sub rsp, 32 ; shadow space +%endif + and rsp, ~31 + mov rax, r0 + mov r0, r1 + mov r1, r2 + mov r2, r3 + call rax + leave + ret + +%else ;----------------------------------------------------------------------------- ; int cpu_cpuid_test( void ) @@ -94,14 +114,11 @@ popfd ret -;----------------------------------------------------------------------------- -; void stack_align( void (*func)(void*), void *arg ); -;----------------------------------------------------------------------------- cglobal stack_align push ebp mov ebp, esp sub esp, 12 - and esp, ~15 + and esp, ~31 mov ecx, [ebp+8] mov edx, [ebp+12] mov [esp], edx @@ -165,7 +182,10 @@ %endif push rbp mov rbp, rsp - and rsp, ~15 +%if WIN64 + sub rsp, 32 ; shadow space +%endif + and rsp, ~31 call intel_cpu_indicator_init leave %if ARCH_X86_64
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/dct-64.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/dct-64.asm
Changed
@@ -311,6 +311,42 @@ INIT_XMM xop DCT_SUB8 +INIT_YMM avx2 +cglobal sub16x16_dct8, 3,3,10 + add r0, 128 + add r2, 4*FDEC_STRIDE + call .sub16x8_dct8 + add r0, 256 + add r1, FENC_STRIDE*8 + add r2, FDEC_STRIDE*8 + call .sub16x8_dct8 + RET +.sub16x8_dct8: + LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1 + LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3 + LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5 + LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7 + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 + TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 + mova [r0-0x80+0x00], xm0 + vextracti128 [r0+0x00], m0, 1 + mova [r0-0x80+0x10], xm1 + vextracti128 [r0+0x10], m1, 1 + mova [r0-0x80+0x20], xm2 + vextracti128 [r0+0x20], m2, 1 + mova [r0-0x80+0x30], xm3 + vextracti128 [r0+0x30], m3, 1 + mova [r0-0x80+0x40], xm4 + vextracti128 [r0+0x40], m4, 1 + mova [r0-0x80+0x50], xm5 + vextracti128 [r0+0x50], m5, 1 + mova [r0-0x80+0x60], xm6 + vextracti128 [r0+0x60], m6, 1 + mova [r0-0x80+0x70], xm7 + vextracti128 [r0+0x70], m7, 1 + ret + ;----------------------------------------------------------------------------- ; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- @@ -390,4 +426,5 @@ ADD8x8 INIT_XMM avx ADD8x8 + %endif ; !HIGH_BIT_DEPTH
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/dct-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/dct-a.asm
Changed
@@ -30,7 +30,7 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15 @@ -39,8 +39,6 @@ pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7 pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9 pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15 -pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 -pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 pb_scan8framet1: SHUFFLE_MASK_W 0, 1, 6, 7, 8, 9, 13, 14 pb_scan8framet2: SHUFFLE_MASK_W 2 , 3, 4, 7, 9, 15, 10, 14 @@ -74,6 +72,7 @@ cextern pw_32_0 cextern pw_32 +cextern pw_512 cextern pw_8000 cextern pw_pixel_max cextern hsub_mul @@ -83,6 +82,9 @@ cextern pd_32 cextern pw_ppppmmmm cextern pw_pmpmpmpm +cextern deinterleave_shufd +cextern pb_unpackbd1 +cextern pb_unpackbd2 %macro WALSH4_1D 6 SUMSUB_BADC %1, %5, %4, %3, %2, %6 @@ -377,6 +379,135 @@ ADD4x4 INIT_XMM avx ADD4x4 + +%macro STOREx2_AVX2 9 + movq xm%3, [r0+%5*FDEC_STRIDE] + vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1 + movq xm%4, [r0+%7*FDEC_STRIDE] + vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1 + punpcklbw m%3, m%9 + punpcklbw m%4, m%9 + psraw m%1, 6 + psraw m%2, 6 + paddsw m%1, m%3 + paddsw m%2, m%4 + packuswb m%1, m%2 + vextracti128 xm%2, m%1, 1 + movq [r0+%5*FDEC_STRIDE], xm%1 + movq [r0+%6*FDEC_STRIDE], xm%2 + movhps [r0+%7*FDEC_STRIDE], xm%1 + movhps [r0+%8*FDEC_STRIDE], xm%2 +%endmacro + +INIT_YMM avx2 +cglobal add8x8_idct, 2,3,8 + add r0, 4*FDEC_STRIDE + pxor m7, m7 + TAIL_CALL .skip_prologue, 0 +global current_function %+ .skip_prologue +.skip_prologue: + ; TRANSPOSE4x4Q + mova xm0, [r1+ 0] + mova xm1, [r1+32] + mova xm2, [r1+16] + mova xm3, [r1+48] + vinserti128 m0, m0, [r1+ 64], 1 + vinserti128 m1, m1, [r1+ 96], 1 + vinserti128 m2, m2, [r1+ 80], 1 + vinserti128 m3, m3, [r1+112], 1 + SBUTTERFLY qdq, 0, 1, 4 + SBUTTERFLY qdq, 2, 3, 4 + IDCT4_1D w,0,1,2,3,4,5 + TRANSPOSE2x4x4W 0,1,2,3,4 + paddw m0, [pw_32] + IDCT4_1D w,0,1,2,3,4,5 + STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7 + STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7 + ret + +; 2xdst, 2xtmp, 4xsrcrow, 1xzero +%macro LOAD_DIFF8x2_AVX2 9 + movq xm%1, [r1+%5*FENC_STRIDE] + movq xm%2, [r1+%6*FENC_STRIDE] + vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1 + vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1 + punpcklbw m%1, m%9 + punpcklbw m%2, m%9 + movq xm%3, [r2+(%5-4)*FDEC_STRIDE] + movq xm%4, [r2+(%6-4)*FDEC_STRIDE] + vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1 + vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1 + punpcklbw m%3, m%9 + punpcklbw m%4, m%9 + psubw m%1, m%3 + psubw m%2, m%4 +%endmacro + +; 4x src, 1x tmp +%macro STORE8_DCT_AVX2 5 + SBUTTERFLY qdq, %1, %2, %5 + SBUTTERFLY qdq, %3, %4, %5 + mova [r0+ 0], xm%1 + mova [r0+ 16], xm%3 + mova [r0+ 32], xm%2 + mova [r0+ 48], xm%4 + vextracti128 [r0+ 64], m%1, 1 + vextracti128 [r0+ 80], m%3, 1 + vextracti128 [r0+ 96], m%2, 1 + vextracti128 [r0+112], m%4, 1 +%endmacro + +%macro STORE16_DCT_AVX2 5 + SBUTTERFLY qdq, %1, %2, %5 + SBUTTERFLY qdq, %3, %4, %5 + mova [r0+ 0-128], xm%1 + mova [r0+16-128], xm%3 + mova [r0+32-128], xm%2 + mova [r0+48-128], xm%4 + vextracti128 [r0+ 0], m%1, 1 + vextracti128 [r0+16], m%3, 1 + vextracti128 [r0+32], m%2, 1 + vextracti128 [r0+48], m%4, 1 +%endmacro + +INIT_YMM avx2 +cglobal sub8x8_dct, 3,3,7 + pxor m6, m6 + add r2, 4*FDEC_STRIDE + LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6 + LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6 + DCT4_1D 0, 1, 2, 3, 4 + TRANSPOSE2x4x4W 0, 1, 2, 3, 4 + 
DCT4_1D 0, 1, 2, 3, 4 + STORE8_DCT_AVX2 0, 1, 2, 3, 4 + RET + +INIT_YMM avx2 +cglobal sub16x16_dct, 3,3,6 + add r0, 128 + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + add r0, 64 + add r1, 4*FENC_STRIDE + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + add r0, 256-64 + add r1, 4*FENC_STRIDE + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + add r0, 64 + add r1, 4*FENC_STRIDE + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + RET +.sub16x4_dct: + LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1 + LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3 + DCT4_1D 0, 1, 2, 3, 4 + TRANSPOSE2x4x4W 0, 1, 2, 3, 4 + DCT4_1D 0, 1, 2, 3, 4 + STORE16_DCT_AVX2 0, 1, 2, 3, 4 + ret %endif ; HIGH_BIT_DEPTH INIT_MMX @@ -422,7 +553,7 @@ cglobal %1, 2,2,11 pxor m7, m7 %endif -%if mmsize==16 && %3!=256 +%if mmsize>=16 && %3!=256 add r0, 4*FDEC_STRIDE %endif .skip_prologue: @@ -497,6 +628,9 @@ SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11 SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11 SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11 + +INIT_YMM +ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0 %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH @@ -607,10 +741,9 @@ movh m0, [r1] pxor m1, m1 add r0, FDEC_STRIDE*4 - paddw m0, [pw_32] - psraw m0, 6 + pmulhrsw m0, [pw_512] psubw m1, m0 - mova m5, [pb_idctdc_unpack] + mova m5, [pb_unpackbd1] packuswb m0, m0 packuswb m1, m1 pshufb m0, m5 @@ -705,11 +838,10 @@ mova m0, [r1] add r1, 16 pxor m1, m1 - paddw m0, [pw_32] - psraw m0, 6 + pmulhrsw m0, [pw_512] psubw m1, m0 - mova m5, [ pb_idctdc_unpack] - mova m6, [pb_idctdc_unpack2] + mova m5, [pb_unpackbd1] + mova m6, [pb_unpackbd2] packuswb m0, m0 packuswb m1, m1 pshufb m2, m0, m6 @@ -726,6 +858,43 @@ INIT_XMM avx ADD16x16 +%macro ADD_DC_AVX2 3 + mova xm4, [r0+FDEC_STRIDE*0+%3] + mova xm5, [r0+FDEC_STRIDE*1+%3] + vinserti128 m4, m4, [r2+FDEC_STRIDE*0+%3], 1 + vinserti128 m5, m5, [r2+FDEC_STRIDE*1+%3], 1 + paddusb m4, %1 + paddusb m5, %1 + psubusb m4, %2 + psubusb m5, %2 + mova [r0+FDEC_STRIDE*0+%3], xm4 + mova [r0+FDEC_STRIDE*1+%3], xm5 + vextracti128 [r2+FDEC_STRIDE*0+%3], m4, 1 + vextracti128 [r2+FDEC_STRIDE*1+%3], m5, 1 +%endmacro + +INIT_YMM avx2 +cglobal add16x16_idct_dc, 2,3,6 + add r0, FDEC_STRIDE*4 + mova m0, [r1] + pxor m1, m1 + pmulhrsw m0, [pw_512] + psubw m1, m0 + mova m4, [pb_unpackbd1] + mova m5, [pb_unpackbd2] + packuswb m0, m0 + packuswb m1, m1 + pshufb m2, m0, m4 ; row0, row2 + pshufb m3, m1, m4 ; row0, row2 + pshufb m0, m5 ; row1, row3 + pshufb m1, m5 ; row1, row3 + lea r2, [r0+FDEC_STRIDE*8] + ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-4 + ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-2 + ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 0 + ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 2 + RET + %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- @@ -1608,4 +1777,42 @@ ZIGZAG_8x8_CAVLC INIT_XMM avx ZIGZAG_8x8_CAVLC + +INIT_YMM avx2 +cglobal zigzag_interleave_8x8_cavlc, 3,3,6 + mova m0, [r1+ 0] + mova m1, [r1+32] + mova m2, [r1+64] + mova m3, [r1+96] + mova m5, [deinterleave_shufd] + SBUTTERFLY wd, 0, 1, 4 + SBUTTERFLY wd, 2, 3, 4 + SBUTTERFLY wd, 0, 1, 4 + SBUTTERFLY wd, 2, 3, 4 + vpermd m0, m5, m0 + vpermd m1, m5, m1 + vpermd m2, m5, m2 + vpermd m3, m5, m3 + mova [r0+ 0], xm0 + mova [r0+ 16], xm2 + vextracti128 [r0+ 32], m0, 1 + vextracti128 [r0+ 48], m2, 1 + mova [r0+ 64], xm1 + mova [r0+ 80], xm3 + vextracti128 [r0+ 96], m1, 1 + vextracti128 [r0+112], m3, 1 + + packsswb m0, m2 ; nnz0, nnz1 + packsswb m1, m3 ; nnz2, nnz3 + packsswb m0, m1 ; {nnz0,nnz2}, {nnz1,nnz3} + vpermq m0, m0, 
q3120 ; {nnz0,nnz1}, {nnz2,nnz3} + pxor m5, m5 + pcmpeqq m0, m5 + pmovmskb r0d, m0 + not r0d + and r0d, 0x01010101 + mov [r2+0], r0w + shr r0d, 16 + mov [r2+8], r0w + RET %endif ; !HIGH_BIT_DEPTH
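The add*_idct_dc and ADD_DC kernels above replace the explicit round-and-shift (paddw with pw_32, then psraw by 6) with a single pmulhrsw against pw_512. The two forms are bit-identical for 16-bit DC values, since pmulhrsw computes (a*b + 0x4000) >> 15 and with b = 512 that collapses to (a + 32) >> 6. A minimal scalar sketch of the equivalence (plain C, helper names are mine and not from the x264 tree; an arithmetic right shift of negatives is assumed, as the asm relies on):

#include <assert.h>
#include <stdint.h>

/* Scalar model of PMULHRSW: (a*b + 0x4000) >> 15 on signed 16-bit inputs. */
static int16_t pmulhrsw( int16_t a, int16_t b )
{
    return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
}

int main(void)
{
    /* With b = 512 the rounded high multiply equals the old (dc + 32) >> 6
     * for every 16-bit dc value. */
    for( int32_t dc = -32768; dc <= 32767; dc++ )
        assert( pmulhrsw( (int16_t)dc, 512 ) == ((dc + 32) >> 6) );
    return 0;
}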
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/dct.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/dct.h
Changed
@@ -40,6 +40,8 @@ void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); @@ -56,14 +58,17 @@ void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] ); void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] ); void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] ); +void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] ); void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] ); void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] ); +void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] ); void x264_add8x8_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [ 4] ); void x264_add16x16_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [16] ); void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] ); void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] ); void x264_add8x8_idct_dc_avx ( pixel *p_dst, dctcoef dct [ 4] ); void x264_add16x16_idct_dc_avx ( pixel *p_dst, dctcoef dct [16] ); +void x264_add16x16_idct_dc_avx2 ( uint8_t *p_dst, int16_t dct [16] ); void x264_dct4x4dc_mmx ( int16_t d[16] ); void x264_dct4x4dc_sse2 ( int32_t d[16] ); @@ -82,6 +87,7 @@ void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 ); void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 ); void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); +void x264_sub16x16_dct8_avx2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] ); @@ -118,5 +124,6 @@ void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz ); void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz ); void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz ); #endif
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/deblock-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/deblock-a.asm
Changed
@@ -28,8 +28,10 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 +load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15 +insert_top_shuf: dd 0,1,4,5,7,2,3,6 transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15 SECTION .text @@ -42,6 +44,7 @@ cextern pw_4 cextern pw_00ff cextern pw_pixel_max +cextern pb_unpackbd1 %if HIGH_BIT_DEPTH ; out: %4 = |%1-%2|-%3 @@ -162,14 +165,12 @@ ;----------------------------------------------------------------------------- ; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_luma, 5,5,8 - %assign pad 5*mmsize+12-(stack_offset&15) +cglobal deblock_v_luma, 5,5,8,0-5*mmsize %define tcm [rsp] %define ms1 [rsp+mmsize] %define ms2 [rsp+mmsize*2] %define am [rsp+mmsize*3] %define bm [rsp+mmsize*4] - SUB rsp, pad add r1, r1 LOAD_AB m4, m5, r2d, r3d mov r3, 32/mmsize @@ -213,11 +214,9 @@ add r4, mmsize/8 dec r3 jg .loop - ADD rsp, pad RET -cglobal deblock_h_luma, 5,6,8 - %assign pad 7*mmsize+12-(stack_offset&15) +cglobal deblock_h_luma, 5,6,8,0-7*mmsize %define tcm [rsp] %define ms1 [rsp+mmsize] %define ms2 [rsp+mmsize*2] @@ -225,7 +224,6 @@ %define p2m [rsp+mmsize*4] %define am [rsp+mmsize*5] %define bm [rsp+mmsize*6] - SUB rsp, pad add r1, r1 LOAD_AB m4, m5, r2d, r3d mov r3, r1 @@ -302,7 +300,6 @@ lea r2, [r2+r1*(mmsize/2)] dec r5 jg .loop - ADD rsp, pad RET %endmacro @@ -485,7 +482,6 @@ %endmacro %macro LUMA_INTRA_INIT 1 - %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) %define t0 m4 %define t1 m5 %define t2 m6 @@ -495,7 +491,6 @@ CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] %assign i i+1 %endrep - SUB rsp, pad add r1, r1 %endmacro @@ -724,7 +719,7 @@ ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_v_luma_intra, 4,7,8 +cglobal deblock_v_luma_intra, 4,7,8,0-3*mmsize LUMA_INTRA_INIT 3 lea r4, [r1*4] lea r5, [r1*3] @@ -744,13 +739,12 @@ add r4, mmsize dec r6 jg .loop - ADD rsp, pad RET ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra, 4,7,8 +cglobal deblock_h_luma_intra, 4,7,8,0-8*mmsize LUMA_INTRA_INIT 8 %if mmsize == 8 lea r4, [r1*3] @@ -785,7 +779,6 @@ dec r6 %endif jg .loop - ADD rsp, pad RET %endmacro @@ -871,6 +864,19 @@ movh %8, m4 %endmacro +; in: 8 rows of 4 bytes in %9..%10 +; out: 8 rows of 4 bytes in %1..%8 +%macro STORE_8x4B 10 + movd %1, %9 + pextrd %2, %9, 1 + pextrd %3, %9, 2 + pextrd %4, %9, 3 + movd %5, %10 + pextrd %6, %10, 1 + pextrd %7, %10, 2 + pextrd %8, %10, 3 +%endmacro + %macro TRANSPOSE4x8B_LOAD 8 TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 %endmacro @@ -925,6 +931,45 @@ ; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] %macro TRANSPOSE6x8_MEM 9 RESET_MM_PERMUTATION +%if cpuflag(avx) + ; input: + ; _ABCDEF_ + ; _GHIJKL_ + ; _MNOPQR_ + ; _STUVWX_ + ; _YZabcd_ + ; _efghij_ + ; _klmnop_ + ; _qrstuv_ + + movh m0, %1 + movh m2, %2 + movh m1, %3 + movh m3, %4 + punpcklbw m0, m2 ; __ AG BH CI DJ EK FL __ + punpcklbw m1, m3 ; __ MS NT OU PV QW RX __ + movh m2, %5 + movh m3, %6 + punpcklbw m2, m3 ; __ Ye Zf ag bh ci dj __ + movh m3, %7 + movh m4, %8 + punpcklbw m3, m4 ; __ kq lr ms nt ou pv __ + + SBUTTERFLY wd, 0, 1, 4 ; __ __ AG MS BH NT CI OU + ; DJ PV EK QW FL RX __ __ + SBUTTERFLY wd, 2, 3, 4 ; __ __ Ye kq Zf lr ag ms + ; bh nt ci ou dj pv __ __ + SBUTTERFLY dq, 0, 2, 4 ; __ __ __ __ AG MS Ye kq + ; BH NT Zf lr CI FL OU RX + SBUTTERFLY dq, 1, 3, 4 ; DJ PV bh nt EK QW Zf lr + ; FL RX dj pv __ __ __ __ + movhps [%9+0x00], m0 + movh [%9+0x10], m2 + movhps [%9+0x20], m2 + movh [%9+0x30], m1 + movhps [%9+0x40], m1 + movh [%9+0x50], m3 +%else movq m0, %1 movq m1, %2 movq m2, %3 @@ -951,13 +996,41 @@ movq [%9+0x30], m1 movq [%9+0x40], m5 movq [%9+0x50], m3 +%endif RESET_MM_PERMUTATION %endmacro + ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 RESET_MM_PERMUTATION +%if cpuflag(avx) + movh m0, %1 + movh m4, %2 + movh m1, %3 + movh m5, %4 + movh m2, %5 + movh m3, %7 + punpcklbw m0, m4 + punpcklbw m1, m5 + movh m4, %6 + movh m5, %8 + punpcklbw m2, m4 + punpcklbw m3, m5 + SBUTTERFLY wd, 0, 1, 4 + SBUTTERFLY wd, 2, 3, 4 + SBUTTERFLY dq, 0, 2, 4 + SBUTTERFLY dq, 1, 3, 4 + movh %9, m0 + movhps %10, m0 + movh %11, m2 + movhps %12, m2 + movh %13, m1 + movhps %14, m1 + movh %15, m3 + movhps %16, m3 +%else movq m0, %1 movq m1, %2 movq m2, %3 @@ -988,6 +1061,7 @@ movq %12, m0 movq %15, m3 movq %16, m7 +%endif RESET_MM_PERMUTATION %endmacro @@ -1009,31 +1083,42 @@ ; out: %4 = |%1-%2|>%3 ; clobbers: %5 -%macro DIFF_GT2 5 -%if ARCH_X86_64 - psubusb %5, %2, %1 +%macro DIFF_GT2 5-6 +%if %0<6 psubusb %4, %1, %2 + psubusb %5, %2, %1 %else - mova %5, %2 mova %4, %1 - psubusb %5, %1 + mova %5, %2 psubusb %4, %2 + psubusb %5, %1 %endif psubusb %5, %3 psubusb %4, %3 pcmpeqb %4, %5 %endmacro -; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 +; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha %2=beta ; out: m5=beta-1, m7=mask, %3=alpha-1 ; clobbers: m4,m6 %macro LOAD_MASK 2-3 +%if cpuflag(ssse3) movd m4, %1 movd m5, %2 + pxor m6, m6 + pshufb m4, m6 + pshufb m5, m6 +%else + movd m4, %1 + movd m5, %2 + punpcklbw m4, m4 + punpcklbw m5, m5 SPLATW m4, m4 SPLATW m5, m5 - packuswb m4, m4 ; 16x alpha-1 - packuswb m5, m5 ; 16x beta-1 +%endif + mova m6, [pb_1] + psubusb m4, m6 ; alpha - 1 + psubusb m5, m6 ; alpha - 2 %if %0>2 mova %3, m4 %endif @@ -1096,9 +1181,7 @@ cglobal deblock_v_luma, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] - dec r2d ; alpha-1 neg r4 - dec r3d ; beta-1 add r4, r0 ; pix-3*stride mova m0, [r4+r1] ; p1 @@ -1107,21 +1190,26 @@ mova m3, [r0+r1] ; q1 LOAD_MASK r2d, r3d +%if cpuflag(avx) + pshufb m8, [pb_unpackbd1] + pblendvb m9, m7, m6, m8 +%else punpcklbw m8, m8 punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] pcmpeqb m9, m9 pcmpeqb m9, m8 pandn m9, m7 +%endif pand m8, m9 - movdqa m3, [r4] ; p2 + mova m3, [r4] ; p2 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m9 - psubb m7, m8, m6 + psubb m7, m8, m6 ; tc++ pand m6, m8 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 - movdqa m4, [r0+2*r1] ; q2 + mova m4, [r0+2*r1] ; q2 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 pand m6, m9 pand m8, m6 @@ -1137,16 +1225,19 @@ 
;----------------------------------------------------------------------------- ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- + +%if cpuflag(avx) +INIT_XMM cpuname +%else INIT_MMX cpuname -cglobal deblock_h_luma, 5,9 +%endif +cglobal deblock_h_luma, 5,9,0,0x60+16*WIN64 lea r8, [r1*3] lea r6, [r0-4] lea r5, [r0-4+r8] %if WIN64 - sub rsp, 0x98 - %define pix_tmp rsp+0x30 + %define pix_tmp rsp+0x30 ; shadow space + r4 %else - sub rsp, 0x68 %define pix_tmp rsp %endif @@ -1170,6 +1261,22 @@ ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) add r6, 2 add r5, 2 +%if cpuflag(sse4) + mova m0, [pix_tmp+0x10] + mova m1, [pix_tmp+0x20] + mova m2, [pix_tmp+0x30] + mova m3, [pix_tmp+0x40] + SBUTTERFLY bw, 0, 1, 4 + SBUTTERFLY bw, 2, 3, 4 + SBUTTERFLY wd, 0, 2, 4 + SBUTTERFLY wd, 1, 3, 4 + STORE_8x4B PASS8ROWS(r6, r5, r7, r8), m1, m3 + shl r7, 3 + sub r6, r7 + sub r5, r7 + shr r7, 3 + STORE_8x4B PASS8ROWS(r6, r5, r7, r8), m0, m2 +%else movq m0, [pix_tmp+0x18] movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] @@ -1185,12 +1292,8 @@ movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) - -%if WIN64 - add rsp, 0x98 -%else - add rsp, 0x68 %endif + RET %endmacro @@ -1207,9 +1310,7 @@ ;----------------------------------------------------------------------------- cglobal deblock_%1_luma, 5,5,8,2*%2 lea r4, [r1*3] - dec r2 ; alpha-1 neg r4 - dec r3 ; beta-1 add r4, r0 ; pix-3*stride mova m0, [r4+r1] ; p1 @@ -1220,12 +1321,18 @@ mov r3, r4mp movd m4, [r3] ; tc0 +%if cpuflag(avx) + pshufb m4, [pb_unpackbd1] + mova [esp+%2], m4 ; tc + pblendvb m4, m7, m6, m4 +%else punpcklbw m4, m4 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] mova [esp+%2], m4 ; tc pcmpeqb m3, m3 pcmpgtb m4, m3 pand m4, m7 +%endif mova [esp], m4 ; mask mova m3, [r4] ; p2 @@ -1254,7 +1361,12 @@ ;----------------------------------------------------------------------------- ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- + +%if cpuflag(avx) +INIT_XMM cpuname +%else INIT_MMX cpuname +%endif cglobal deblock_h_luma, 0,5,8,0x60+HAVE_ALIGNED_STACK*12 mov r0, r0mp mov r3, r1m @@ -1289,6 +1401,20 @@ sub r0, 2 lea r1, [r0+r4] +%if cpuflag(avx) + mova m0, [pix_tmp+0x10] + mova m1, [pix_tmp+0x20] + mova m2, [pix_tmp+0x30] + mova m3, [pix_tmp+0x40] + SBUTTERFLY bw, 0, 1, 4 + SBUTTERFLY bw, 2, 3, 4 + SBUTTERFLY wd, 0, 2, 4 + SBUTTERFLY wd, 1, 3, 4 + STORE_8x4B PASS8ROWS(r0, r1, r3, r4), m0, m2 + lea r0, [r0+r3*8] + lea r1, [r1+r3*8] + STORE_8x4B PASS8ROWS(r0, r1, r3, r4), m1, m3 +%else movq m0, [pix_tmp+0x10] movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] @@ -1302,6 +1428,7 @@ movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) +%endif RET %endmacro ; DEBLOCK_LUMA @@ -1429,7 +1556,11 @@ %define t5 m11 %define mask0 m12 %define mask1p m13 +%if WIN64 + %define mask1q [rsp] +%else %define mask1q [rsp-24] +%endif %define mpb_0 m14 %define mpb_1 m15 %else @@ -1448,14 +1579,10 @@ ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_%1_luma_intra, 4,6,16,ARCH_X86_64*0x50-0x50 +cglobal 
deblock_%1_luma_intra, 4,6,16,0-(1-ARCH_X86_64)*0x50-WIN64*0x10 lea r4, [r1*4] lea r5, [r1*3] ; 3*stride - dec r2d ; alpha-1 - jl .end neg r4 - dec r3d ; beta-1 - jl .end add r4, r0 ; pix-4*stride mova p1, [r4+2*r1] mova p0, [r4+r5] @@ -1470,9 +1597,9 @@ pavgb t5, mpb_1 ; alpha/4+1 movdqa p2, [r4+r1] movdqa q2, [r0+2*r1] - DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 - DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 - DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 + DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 + DIFF_GT2 p0, p2, m5, t2, t5, 1 ; mask1 = |p2-p0| > beta-1 + DIFF_GT2 q0, q2, m5, t4, t5, 1 ; t4 = |q2-q0| > beta-1 pand t0, mask0 pand t4, t0 pand t2, t0 @@ -1484,12 +1611,12 @@ mova mask0, m7 pavgb m4, [pb_0] pavgb m4, [pb_1] ; alpha/4+1 - DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 + DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 pand m6, mask0 - DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 + DIFF_GT2 p0, p2, m5, m4, m7, 1 ; m4 = |p2-p0| > beta-1 pand m4, m6 mova mask1p, m4 - DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 + DIFF_GT2 q0, q2, m5, m4, m7, 1 ; m4 = |q2-q0| > beta-1 pand m4, m6 mova mask1q, m4 %endif @@ -1499,17 +1626,24 @@ .end: REP_RET +%if cpuflag(avx) +INIT_XMM cpuname +%else INIT_MMX cpuname +%endif %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra, 4,9 +cglobal deblock_h_luma_intra, 4,9,0,0x80 lea r8, [r1*3] lea r6, [r0-4] lea r5, [r0-4+r8] - sub rsp, 0x88 +%if WIN64 + %define pix_tmp rsp+0x20 ; shadow space +%else %define pix_tmp rsp +%endif ; transpose 8x16 -> tmp space TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) @@ -1530,7 +1664,6 @@ sub r5, r7 shr r7, 3 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) - add rsp, 0x88 RET %else cglobal deblock_h_luma_intra, 2,4,8,0x80 @@ -1867,8 +2000,6 @@ %if HIGH_BIT_DEPTH == 0 %macro CHROMA_V_START 0 - dec r2d ; alpha-1 - dec r3d ; beta-1 mov t5, r0 sub t5, r1 sub t5, r1 @@ -1879,8 +2010,6 @@ %endmacro %macro CHROMA_H_START 0 - dec r2d - dec r3d sub r0, 4 lea t6, [r1*3] mov t5, r0 @@ -1969,8 +2098,6 @@ ;----------------------------------------------------------------------------- %macro DEBLOCK_H_CHROMA_420_MBAFF 0 cglobal deblock_h_chroma_mbaff, 5,7,8 - dec r2d - dec r3d sub r0, 4 lea t6, [r1*3] mov t5, r0 @@ -2368,3 +2495,70 @@ DEBLOCK_STRENGTH_XMM INIT_XMM avx DEBLOCK_STRENGTH_XMM + +%macro LOAD_BYTES_YMM 1 + movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX + pshufb m0, [load_bytes_shuf] ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX + mova m2, [insert_top_shuf] + vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2 + vpermd m0, m2, m0 ; EFGH JKLM OPQR TUVW ____ FGHI KLMN PQRS + vpbroadcastd m2, [%1-8] ; ABCD .... 
+ vpblendd m0, m0, m2, 00010000b ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS +%endmacro + +INIT_YMM avx2 +cglobal deblock_strength, 6,6,7 + ; Prepare mv comparison register + shl r4d, 8 + add r4d, 3 - (1<<8) + movd xm6, r4d + vpbroadcastw m6, xm6 + pxor m5, m5 ; bs0,bs1 + +.lists: + ; Check refs + LOAD_BYTES_YMM ref + pxor m0, m1 + por m5, m0 + + ; Check mvs + movu xm0, [mv-4+4*8*0] + vinserti128 m0, m0, [mv+4*8*-1], 1 + vbroadcasti128 m2, [mv+4*8* 0] + vinserti128 m1, m2, [mv-4+4*8*1], 0 + vbroadcasti128 m3, [mv+4*8* 1] + psubw m0, m2 + psubw m1, m3 + + vinserti128 m2, m3, [mv-4+4*8*2], 0 + vbroadcasti128 m4, [mv+4*8* 2] + vinserti128 m3, m4, [mv-4+4*8*3], 0 + psubw m2, m4 + vbroadcasti128 m4, [mv+4*8* 3] + psubw m3, m4 + packsswb m0, m1 + packsswb m2, m3 + pabsb m0, m0 + pabsb m2, m2 + psubusb m0, m6 + psubusb m2, m6 + packsswb m0, m2 + por m5, m0 + + add r1, 40 + add r2, 4*8*5 + dec r5d + jge .lists + + ; Check nnz + LOAD_BYTES_YMM nnz + por m0, m1 + mova m6, [pb_1] + pminub m0, m6 + pminub m5, m6 ; mv ? 1 : 0 + paddb m0, m0 ; nnz ? 2 : 0 + pmaxub m5, m0 + vextracti128 [bs1], m5, 1 + pshufb xm5, [transpose_shuf] + mova [bs0], xm5 + RET
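The new AVX2 deblock_strength packs its motion-vector thresholds as bytes, ((mvy_limit - 1) << 8) | 3 broadcast across the register, so one saturating psubusb leaves a non-zero byte exactly when |mvd| reaches 4 quarter-pels horizontally or mvy_limit vertically; non-zero coefficient flags then promote the strength to 2 via paddb/pmaxub. A simplified per-edge model of that decision (my own C sketch with hypothetical names, not the reference function in x264; the per-list loop and MBAFF handling are left out):

#include <stdint.h>
#include <stdlib.h>

/* bs = 2 if either block has coded coefficients, else 1 on a reference or
 * motion-vector mismatch, else 0. */
static int edge_strength( int nnz_a, int nnz_b, int ref_a, int ref_b,
                          const int16_t mv_a[2], const int16_t mv_b[2], int mvy_limit )
{
    if( nnz_a | nnz_b )
        return 2;                               /* "nnz ? 2 : 0" via paddb m0, m0 */
    if( ref_a != ref_b ||
        abs( mv_a[0] - mv_b[0] ) >= 4 ||        /* low threshold byte is 3 */
        abs( mv_a[1] - mv_b[1] ) >= mvy_limit ) /* high threshold byte is mvy_limit-1 */
        return 1;                               /* "mv ? 1 : 0", merged with pmaxub */
    return 0;
}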
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/mc-a.asm
Changed
@@ -34,7 +34,7 @@ SECTION_RODATA 32 -ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 ch_shuf_adj: times 8 db 0 times 8 db 2 times 8 db 4 @@ -49,10 +49,12 @@ cextern pw_8 cextern pw_32 cextern pw_64 +cextern pw_512 cextern pw_00ff cextern pw_pixel_max cextern sw_64 cextern pd_32 +cextern deinterleave_shufd ;============================================================================= ; implicit weighted biprediction @@ -141,8 +143,7 @@ movh m1, %2 punpcklbw m0, m1 pmaddubsw m0, m3 - paddw m0, m4 - psraw m0, 6 + pmulhrsw m0, m4 %endmacro %macro BIWEIGHT_START_SSSE3 0 @@ -151,9 +152,13 @@ sub t7d, t6d shl t7d, 8 add t6d, t7d - movd m3, t6d - mova m4, [pw_32] + mova m4, [pw_512] + movd xm3, t6d +%if cpuflag(avx2) + vpbroadcastw m3, xm3 +%else SPLATW m3, m3 ; weight_dst,src +%endif %endmacro %if HIGH_BIT_DEPTH @@ -244,6 +249,25 @@ INIT_XMM ssse3 AVG_WEIGHT 8, 7 AVG_WEIGHT 16, 7 + +INIT_YMM avx2 +cglobal pixel_avg_weight_w16 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu xm0, [t2] + movu xm1, [t4] + vinserti128 m0, m0, [t2+t3], 1 + vinserti128 m1, m1, [t4+t5], 1 + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], xm0 + vextracti128 [t0+t1], m0, 1 + AVG_END %endif ;HIGH_BIT_DEPTH ;============================================================================= @@ -274,7 +298,7 @@ %endmacro ; src, dst, width -%macro WEIGHT_TWO_ROW 3 +%macro WEIGHT_TWO_ROW 4 %assign x 0 %rep (%3+mmsize/2-1)/(mmsize/2) %if %3-x/2 <= 4 && mmsize == 16 @@ -298,16 +322,21 @@ %else ; !HIGH_BIT_DEPTH %macro WEIGHT_START 1 +%if cpuflag(avx2) + vbroadcasti128 m3, [r4] + vbroadcasti128 m4, [r4+16] +%else mova m3, [r4] mova m4, [r4+16] %if notcpuflag(ssse3) movd m5, [r4+32] %endif +%endif pxor m2, m2 %endmacro -; src1, src2, dst1, dst2 -%macro WEIGHT_ROWx2 4 +; src1, src2, dst1, dst2, fast +%macro WEIGHT_ROWx2 5 movh m0, [%1 ] movh m1, [%1+mmsize/2] movh m6, [%2 ] @@ -317,10 +346,12 @@ punpcklbw m6, m2 punpcklbw m7, m2 %if cpuflag(ssse3) +%if %5==0 psllw m0, 7 psllw m1, 7 psllw m6, 7 psllw m7, 7 +%endif pmulhrsw m0, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 @@ -349,15 +380,54 @@ mova [%4], m6 %endmacro -; src1, src2, dst1, dst2, width -%macro WEIGHT_COL 5 +; src1, src2, dst1, dst2, width, fast +%macro WEIGHT_COL 6 +%if cpuflag(avx2) +%if %5==16 + movu xm0, [%1] + vinserti128 m0, m0, [%2], 1 + punpckhbw m1, m0, m2 + punpcklbw m0, m0, m2 +%if %6==0 + psllw m0, 7 + psllw m1, 7 +%endif + pmulhrsw m0, m3 + pmulhrsw m1, m3 + paddw m0, m4 + paddw m1, m4 + packuswb m0, m1 + mova [%3], xm0 + vextracti128 [%4], m0, 1 +%else + movq xm0, [%1] + vinserti128 m0, m0, [%2], 1 + punpcklbw m0, m2 +%if %6==0 + psllw m0, 7 +%endif + pmulhrsw m0, m3 + paddw m0, m4 + packuswb m0, m0 + vextracti128 xm1, m0, 1 +%if %5 == 8 + movq [%3], xm0 + movq [%4], xm1 +%else + movd [%3], xm0 + movd [%4], xm1 +%endif +%endif +%else movh m0, [%1] movh m1, [%2] punpcklbw m0, m2 punpcklbw m1, m2 %if cpuflag(ssse3) +%if %6==0 psllw m0, 7 psllw m1, 7 +%endif pmulhrsw m0, m3 pmulhrsw m1, m3 paddw m0, m4 @@ -380,18 +450,22 @@ movd [%3], m0 ; width 2 can write garbage for the last 2 bytes movd [%4], m1 %endif +%endif %endmacro - ; src, dst, width -%macro WEIGHT_TWO_ROW 3 +%macro WEIGHT_TWO_ROW 4 %assign x 0 %rep %3 %if (%3-x) >= mmsize - WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x + WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4 %assign x (x+mmsize) %else - WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, %3-x - %exitrep + %assign w %3-x +%if w == 20 + 
%assign w 16 +%endif + WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4 + %assign x (x+w) %endif %if x >= %3 %exitrep @@ -409,13 +483,30 @@ cglobal mc_weight_w%1, 6,6,8 FIX_STRIDES r1, r3 WEIGHT_START %1 +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 + ; we can merge the shift step into the scale factor + ; if (m3<<7) doesn't overflow an int16_t + cmp byte [r4+1], 0 + jz .fast +%endif .loop: - WEIGHT_TWO_ROW r2, r0, %1 + WEIGHT_TWO_ROW r2, r0, %1, 0 lea r0, [r0+r1*2] lea r2, [r2+r3*2] sub r5d, 2 jg .loop RET +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 +.fast: + psllw m3, 7 +.fastloop: + WEIGHT_TWO_ROW r2, r0, %1, 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + sub r5d, 2 + jg .fastloop + RET +%endif %endmacro INIT_MMX mmx2 @@ -437,6 +528,10 @@ WEIGHTER 8 WEIGHTER 16 WEIGHTER 20 +INIT_YMM avx2 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 %endif %macro OFFSET_OP 7 @@ -531,11 +626,15 @@ mov eax, %2 cmp dword r6m, 32 jne pixel_avg_weight_w%1 %+ SUFFIX +%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads + jmp pixel_avg_w%1_avx2 +%else %if mmsize == 16 && %1 == 16 test dword r4m, 15 jz pixel_avg_w%1_sse2 %endif jmp pixel_avg_w%1_mmx2 +%endif %endmacro ;----------------------------------------------------------------------------- @@ -635,6 +734,10 @@ AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 +INIT_XMM avx2 +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 %endif ;HIGH_BIT_DEPTH @@ -657,7 +760,7 @@ .height_loop: movu m0, [r2] movu m1, [r2+r3*2] -%if mmsize == 8 +%if cpuflag(avx) || mmsize == 8 pavgw m0, [r2+r4] pavgw m1, [r2+r6] %else @@ -717,6 +820,8 @@ AVG2_W_ONE 8 AVG2_W_TWO 10, movd, movd AVG2_W_TWO 16, movu, mova +INIT_YMM avx2 +AVG2_W_ONE 16 INIT_MMX cglobal pixel_avg2_w10_mmx2, 6,7 @@ -805,27 +910,40 @@ jg .height_loop RET -INIT_XMM -cglobal pixel_avg2_w18_sse2, 6,7,6 +%macro PIXEL_AVG_W18 0 +cglobal pixel_avg2_w18, 6,7 sub r4, r2 .height_loop: movu m0, [r2+ 0] + movd xm2, [r2+32] +%if mmsize == 32 + pavgw m0, [r2+r4+ 0] + movd xm1, [r2+r4+32] + pavgw xm2, xm1 +%else movu m1, [r2+16] - movh m2, [r2+32] movu m3, [r2+r4+ 0] movu m4, [r2+r4+16] - movh m5, [r2+r4+32] + movd m5, [r2+r4+32] pavgw m0, m3 pavgw m1, m4 pavgw m2, m5 - mova [r0+ 0], m0 mova [r0+16], m1 - movh [r0+32], m2 +%endif + mova [r0+ 0], m0 + movd [r0+32], xm2 lea r2, [r2+r3*2] lea r0, [r0+r1*2] dec r5d jg .height_loop RET +%endmacro + +INIT_XMM sse2 +PIXEL_AVG_W18 +INIT_YMM avx2 +PIXEL_AVG_W18 + %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 @@ -965,6 +1083,23 @@ AVG2_W20 sse2 AVG2_W20 sse2_misalign +INIT_YMM avx2 +cglobal pixel_avg2_w20, 6,7 + sub r2, r4 + lea r6, [r2+r3] +.height_loop: + movu m0, [r4] + movu m1, [r4+r3] + pavgb m0, [r4+r2] + pavgb m1, [r4+r6] + lea r4, [r4+r3*2] + mova [r0], m0 + mova [r0+r1], m1 + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + RET + ; Cacheline split code for processors with high latencies for loads ; split over cache lines. See sad-a.asm for a more detailed explanation. 
; This particular instance is complicated by the fact that src1 and src2 @@ -1172,18 +1307,18 @@ movu m1, [r2+%4*mmsize] movu m2, [r2+r3+%3*mmsize] movu m3, [r2+r3+%4*mmsize] - movu m4, [r2+r3*2+%3*mmsize] - movu m5, [r2+r3*2+%4*mmsize] - movu m6, [r2+%2+%3*mmsize] - movu m7, [r2+%2+%4*mmsize] mova [r0+%3*mmsize], m0 mova [r0+%4*mmsize], m1 mova [r0+r1+%3*mmsize], m2 mova [r0+r1+%4*mmsize], m3 - mova [r0+r1*2+%3*mmsize], m4 - mova [r0+r1*2+%4*mmsize], m5 - mova [r0+%1+%3*mmsize], m6 - mova [r0+%1+%4*mmsize], m7 + movu m0, [r2+r3*2+%3*mmsize] + movu m1, [r2+r3*2+%4*mmsize] + movu m2, [r2+%2+%3*mmsize] + movu m3, [r2+%2+%4*mmsize] + mova [r0+r1*2+%3*mmsize], m0 + mova [r0+r1*2+%4*mmsize], m1 + mova [r0+%1+%3*mmsize], m2 + mova [r0+%1+%4*mmsize], m3 %endmacro %macro COPY4 2 @@ -1216,7 +1351,7 @@ %macro MC_COPY 1 %assign %%w %1*SIZEOF_PIXEL/mmsize %if %%w > 0 -cglobal mc_copy_w%1, 5,7,8*(%%w/2) +cglobal mc_copy_w%1, 5,7 FIX_STRIDES r1, r3 lea r6, [r3*3] lea r5, [r1*3] @@ -1233,13 +1368,17 @@ INIT_MMX mmx MC_COPY 8 MC_COPY 16 -INIT_XMM sse2 +INIT_XMM sse MC_COPY 8 MC_COPY 16 -INIT_XMM aligned, sse2 +INIT_XMM aligned, sse MC_COPY 16 - - +%if HIGH_BIT_DEPTH +INIT_YMM avx +MC_COPY 16 +INIT_YMM aligned, avx +MC_COPY 16 +%endif ;============================================================================= ; prefetch @@ -1514,7 +1653,11 @@ mov t0, r0 mov t1, r1 mov t2, r3 +%if WIN64 + %define multy0 r4m +%else %define multy0 [rsp-8] +%endif mova multy0, m5 %else mov r3m, r3 @@ -1651,10 +1794,9 @@ %if ARCH_X86_64 ; too many regs for x86_32 RESET_MM_PERMUTATION %if WIN64 -%if xmm_regs_used > 6 - %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16 - %assign xmm_regs_used 6 -%endif + %assign stack_offset stack_offset - stack_size_padded + %assign stack_size_padded 0 + %assign xmm_regs_used 0 %endif .mc1dy: and t2d, 7 @@ -1781,7 +1923,11 @@ %macro MC_CHROMA_SSSE3 0 cglobal mc_chroma +%if cpuflag(avx2) MC_CHROMA_START 9 +%else + MC_CHROMA_START 10 +%endif and r5d, 7 and t2d, 7 mov t0d, r5d @@ -1792,18 +1938,18 @@ sub r5d, t2d imul t2d, t0d ; (x*255+8)*y imul r5d, t0d ; (x*255+8)*(8-y) - movd m6, t2d - movd m7, r5d + movd xm6, t2d + movd xm7, r5d %if cpuflag(cache64) mov t0d, r3d and t0d, 7 %ifdef PIC lea t1, [ch_shuf_adj] - movddup m5, [t1 + t0*4] + movddup xm5, [t1 + t0*4] %else - movddup m5, [ch_shuf_adj + t0*4] + movddup xm5, [ch_shuf_adj + t0*4] %endif - paddb m5, [ch_shuf] + paddb xm5, [ch_shuf] and r3, ~7 %else mova m5, [ch_shuf] @@ -1812,12 +1958,80 @@ movifnidn r1, r1mp movifnidn r2d, r2m movifnidn r5d, r8m +%if cpuflag(avx2) + vpbroadcastw m6, xm6 + vpbroadcastw m7, xm7 +%else SPLATW m6, m6 SPLATW m7, m7 +%endif +%if ARCH_X86_64 + %define shiftround m8 + mova m8, [pw_512] +%else + %define shiftround [pw_512] +%endif cmp dword r7m, 4 jg .width8 - movu m0, [r3] + +%if cpuflag(avx2) +.loop4: + movu xm0, [r3] + movu xm1, [r3+r4] + vinserti128 m0, m0, [r3+r4], 1 + vinserti128 m1, m1, [r3+r4*2], 1 + pshufb m0, m5 + pshufb m1, m5 + pmaddubsw m0, m7 + pmaddubsw m1, m6 + paddw m0, m1 + pmulhrsw m0, shiftround + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [r0], xm0 + movd [r0+r2], xm1 + psrldq xm0, 4 + psrldq xm1, 4 + movd [r1], xm0 + movd [r1+r2], xm1 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 + jg .loop4 + RET +.width8: + movu xm0, [r3] + vinserti128 m0, m0, [r3+8], 1 pshufb m0, m5 +.loop8: + movu xm3, [r3+r4] + vinserti128 m3, m3, [r3+r4+8], 1 + pshufb m3, m5 + pmaddubsw m1, m0, m7 + pmaddubsw m2, m3, m6 + pmaddubsw m3, m3, m7 + + movu xm0, [r3+r4*2] + vinserti128 
m0, m0, [r3+r4*2+8], 1 + pshufb m0, m5 + pmaddubsw m4, m0, m6 + + paddw m1, m2 + paddw m3, m4 + pmulhrsw m1, shiftround + pmulhrsw m3, shiftround + packuswb m1, m3 + mova m2, [deinterleave_shufd] + vpermd m1, m2, m1 + vextracti128 xm2, m1, 1 + movq [r0], xm1 + movhps [r1], xm1 + movq [r0+r2], xm2 + movhps [r1+r2], xm2 +%else + movu m0, [r3] + pshufb m0, xm5 .loop4: movu m1, [r3+r4] pshufb m1, m5 @@ -1828,16 +2042,14 @@ pmaddubsw m2, m1, m7 pmaddubsw m1, m6 pmaddubsw m3, m6 - paddw m0, [pw_32] - paddw m2, [pw_32] paddw m1, m0 paddw m3, m2 + pmulhrsw m1, shiftround + pmulhrsw m3, shiftround mova m0, m4 - psrlw m1, 6 - psrlw m3, 6 packuswb m1, m3 movhlps m3, m1 - movd [r0], m1 + movd [r0], xm1 movd [r0+r2], m3 psrldq m1, 4 psrldq m3, 4 @@ -1849,15 +2061,14 @@ sub r5d, 2 jg .loop4 RET - .width8: movu m0, [r3] pshufb m0, m5 movu m1, [r3+8] pshufb m1, m5 %if ARCH_X86_64 - SWAP 8, 6 - %define mult1 m8 + SWAP 9, 6 + %define mult1 m9 %else mova r0m, m6 %define mult1 r0m @@ -1873,12 +2084,10 @@ pmaddubsw m1, m7 pmaddubsw m2, mult1 pmaddubsw m3, mult1 - paddw m0, [pw_32] - paddw m1, [pw_32] paddw m0, m2 paddw m1, m3 - psrlw m0, 6 - psrlw m1, 6 + pmulhrsw m0, shiftround ; x + 32 >> 6 + pmulhrsw m1, shiftround packuswb m0, m1 pshufd m0, m0, q3120 movq [r0], m0 @@ -1894,16 +2103,15 @@ pmaddubsw m6, m7 pmaddubsw m2, mult1 pmaddubsw m3, mult1 - paddw m4, [pw_32] - paddw m6, [pw_32] paddw m2, m4 paddw m3, m6 - psrlw m2, 6 - psrlw m3, 6 + pmulhrsw m2, shiftround + pmulhrsw m3, shiftround packuswb m2, m3 pshufd m2, m2, q3120 movq [r0+r2], m2 movhps [r1+r2], m2 +%endif lea r3, [r3+r4*2] lea r0, [r0+r2*2] lea r1, [r1+r2*2] @@ -1932,4 +2140,6 @@ MC_CHROMA_SSSE3 INIT_XMM avx MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64 +INIT_YMM avx2 +MC_CHROMA_SSSE3 %endif ; HIGH_BIT_DEPTH
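The comment in the new mc_weight fast path states the trick: the per-row psllw by 7 and the pmulhrsw against the weight scale commute, so when the 16-bit scale is below 256 (the cmp byte [r4+1], 0 test) the shift can be applied once to the scale (psllw m3, 7) instead of to every pixel row. A scalar check of that identity, reusing the same pmulhrsw model as in the sketch after the dct diff (hypothetical names, not x264 code):

#include <assert.h>
#include <stdint.h>

/* Scalar model of PMULHRSW, as above. */
static int16_t pmulhrsw( int16_t a, int16_t b )
{
    return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
}

int main(void)
{
    /* For 0 <= scale < 256, scale << 7 still fits in an int16_t, so shifting
     * the scale once gives the same product as shifting each pixel. */
    for( int scale = 0; scale < 256; scale++ )
        for( int pix = 0; pix < 256; pix++ )
            assert( pmulhrsw( pix << 7, scale ) == pmulhrsw( pix, scale << 7 ) );
    return 0;
}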
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-a2.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/mc-a2.asm
Changed
@@ -30,13 +30,14 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 + +filt_mul20: times 32 db 20 +filt_mul15: times 16 db 1, -5 +filt_mul51: times 16 db -5, 1 +hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 +deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 -filt_mul20: times 16 db 20 -filt_mul15: times 8 db 1, -5 -filt_mul51: times 8 db -5, 1 -hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 -deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 %if HIGH_BIT_DEPTH deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 @@ -44,6 +45,7 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 %endif +pw_1024: times 16 dw 1024 pd_16: times 4 dd 16 pd_0f: times 4 dd 0xffff @@ -64,6 +66,7 @@ cextern pw_1 cextern pw_16 cextern pw_32 +cextern pw_512 cextern pw_00ff cextern pw_3fff cextern pw_pixel_max @@ -127,19 +130,24 @@ paddw %4, %6 %endmacro -%macro FILT_PACK 4-6 b - paddw %1, %4 - paddw %2, %4 -%if %0 == 6 - psubusw %1, %6 - psubusw %2, %6 - psrlw %1, %3 - psrlw %2, %3 +%macro FILT_PACK 3-5 +%if cpuflag(ssse3) + pmulhrsw %1, %3 + pmulhrsw %2, %3 +%else + paddw %1, %3 + paddw %2, %3 +%if %0 == 5 + psubusw %1, %5 + psubusw %2, %5 + psrlw %1, %4 + psrlw %2, %4 %else - psraw %1, %3 - psraw %2, %3 + psraw %1, %4 + psraw %2, %4 %endif -%ifnidn w, %5 +%endif +%if HIGH_BIT_DEPTH == 0 packuswb %1, %2 %endif %endmacro @@ -203,7 +211,7 @@ mova [r2+r4+mmsize], m4 paddw m1, s30 paddw m4, s30 - FILT_PACK m1, m4, 5, m6, w, s10 + FILT_PACK m1, m4, m6, 5, s10 CLIPW m1, m0, m7 CLIPW m4, m0, m7 mova [r0+r4], m1 @@ -295,7 +303,7 @@ FILT_H2 m1, m2, m3, m4, m5, m6 mova m7, [pw_1] pxor m2, m2 - FILT_PACK m1, m4, 1, m7, w + FILT_PACK m1, m4, m7, 1 CLIPW m1, m2, m0 CLIPW m4, m2, m0 mova [r0+r2], m1 @@ -349,17 +357,25 @@ paddw m4, m5 paddw m1, m3 paddw m4, m6 + mova m7, [pw_1024] %else LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1 LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1 LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0 LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1 FILT_V2 m1, m2, m3, m4, m5, m6 + mova m7, [pw_16] %endif - mova m7, [pw_16] +%if mmsize==32 + mova [r2+r4*2], xm1 + mova [r2+r4*2+mmsize/2], xm4 + vextracti128 [r2+r4*2+mmsize], m1, 1 + vextracti128 [r2+r4*2+mmsize*3/2], m4, 1 +%else mova [r2+r4*2], m1 mova [r2+r4*2+mmsize], m4 - FILT_PACK m1, m4, 5, m7 +%endif + FILT_PACK m1, m4, m7, 5 movnta [r0+r4], m1 add r1, mmsize add r5, mmsize @@ -371,8 +387,8 @@ ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- -INIT_MMX -cglobal hpel_filter_c_mmx2, 3,3 +INIT_MMX mmx2 +cglobal hpel_filter_c, 3,3 add r0, r2 lea r1, [r1+r2*2] neg r2 @@ -392,7 +408,7 @@ paddw m5, [src+12] ; b1 paddw m6, [src+10] ; c1 FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 6, m7 + FILT_PACK m1, m4, m7, 6 movntq [r0+r2], m1 add r2, 8 jl .loop @@ -401,7 +417,8 @@ ;----------------------------------------------------------------------------- ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- -cglobal hpel_filter_h_mmx2, 3,3 +INIT_MMX mmx2 +cglobal hpel_filter_h, 3,3 add r0, r2 add r1, r2 neg r2 @@ -436,14 +453,12 @@ paddw m6, m7 ; a1 movq m7, 
[pw_1] FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 1, m7 + FILT_PACK m1, m4, m7, 1 movntq [r0+r2], m1 add r2, 8 jl .loop RET -INIT_XMM - %macro HPEL_C 0 ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); @@ -454,29 +469,33 @@ neg r2 %define src r1+r2*2 %ifnidn cpuname, sse2 +%if cpuflag(ssse3) + mova m7, [pw_512] +%else mova m7, [pw_32] - %define tpw_32 m7 +%endif + %define pw_rnd m7 %elif ARCH_X86_64 mova m8, [pw_32] - %define tpw_32 m8 + %define pw_rnd m8 %else - %define tpw_32 [pw_32] + %define pw_rnd [pw_32] %endif ; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer... -%if cpuflag(misalign) +%if cpuflag(misalign) || mmsize==32 .loop: movu m4, [src-4] movu m5, [src-2] - mova m6, [src] - movu m3, [src+12] - movu m2, [src+14] - mova m1, [src+16] + mova m6, [src+0] + movu m3, [src-4+mmsize] + movu m2, [src-2+mmsize] + mova m1, [src+0+mmsize] paddw m4, [src+6] paddw m5, [src+4] paddw m6, [src+2] - paddw m3, [src+22] - paddw m2, [src+20] - paddw m1, [src+18] + paddw m3, [src+6+mmsize] + paddw m2, [src+4+mmsize] + paddw m1, [src+2+mmsize] FILT_H2 m4, m5, m6, m3, m2, m1 %else mova m0, [src-16] @@ -506,9 +525,12 @@ paddw m6, m0 FILT_H m3, m5, m6 %endif - FILT_PACK m4, m3, 6, tpw_32 - movntps [r0+r2], m4 - add r2, 16 + FILT_PACK m4, m3, pw_rnd, 6 +%if mmsize==32 + vpermq m4, m4, q3120 +%endif + movnta [r0+r2], m4 + add r2, mmsize jl .loop RET %endmacro @@ -516,7 +538,8 @@ ;----------------------------------------------------------------------------- ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- -cglobal hpel_filter_h_sse2, 3,3,8 +INIT_XMM sse2 +cglobal hpel_filter_h, 3,3,8 add r0, r2 add r1, r2 neg r2 @@ -555,7 +578,7 @@ paddw m6, m7 ; c1 mova m7, [pw_1] ; FIXME xmm8 FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 1, m7 + FILT_PACK m1, m4, m7, 1 movntps [r0+r2], m1 add r2, 16 jl .loop @@ -572,7 +595,7 @@ %define src r1+r2 mova m0, [src-16] mova m1, [src] - mova m7, [pw_16] + mova m7, [pw_1024] .loop: mova m2, [src+16] ; Using unaligned loads instead of palignr is marginally slower on SB and significantly @@ -594,7 +617,7 @@ paddw m3, m1 paddw m4, m5 paddw m4, m6 - FILT_PACK m3, m4, 5, m7 + FILT_PACK m3, m4, m7, 5 pshufb m3, [hpel_shuf] mova m1, m2 movntps [r0+r2], m3 @@ -620,6 +643,45 @@ HPEL_C HPEL_V 0 HPEL_H +INIT_YMM avx2 +HPEL_V 8 +HPEL_C + +INIT_YMM avx2 +cglobal hpel_filter_h, 3,3,8 + add r0, r2 + add r1, r2 + neg r2 + %define src r1+r2 + mova m5, [filt_mul15] + mova m6, [filt_mul20] + mova m7, [filt_mul51] +.loop: + movu m0, [src-2] + movu m1, [src-1] + movu m2, [src+2] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m7 + paddw m0, m2 + + mova m2, [src+0] + movu m3, [src+1] + movu m4, [src+3] + pmaddubsw m2, m6 + pmaddubsw m3, m6 + pmaddubsw m4, m7 + paddw m0, m2 + paddw m1, m3 + paddw m1, m4 + + mova m2, [pw_1024] + FILT_PACK m0, m1, m2, 5 + pshufb m0, [hpel_shuf] + movnta [r0+r2], m0 + add r2, mmsize + jl .loop + RET %endif %if ARCH_X86_64 @@ -627,9 +689,9 @@ ;The optimum prefetch distance is difficult to determine in checkasm: ;any prefetch seems slower than not prefetching. ;In real use, the prefetch seems to be a slight win. - ;+16 is picked somewhat arbitrarily here based on the fact that even one + ;+mmsize is picked somewhat arbitrarily here based on the fact that even one ;loop iteration is going to take longer than the prefetch. 
- prefetcht0 [r1+r2*2+16] + prefetcht0 [r1+r2*2+mmsize] %if cpuflag(ssse3) mova m1, [r3] mova m2, [r3+r2] @@ -662,31 +724,48 @@ packuswb %3, %4 FILT_V2 m1, m2, m3, m4, m5, m6 %endif - add r3, 16 - add r1, 16 + add r3, mmsize + add r1, mmsize +%if mmsize==32 + vinserti128 %1, m1, xm4, 1 + vperm2i128 %2, m1, m4, q0301 +%else mova %1, m1 mova %2, m4 - FILT_PACK m1, m4, 5, m15 +%endif + FILT_PACK m1, m4, m15, 5 movntps [r8+r4+%5], m1 %endmacro -%macro FILT_C 4 - PALIGNR m1, %2, %1, 12, m2 - PALIGNR m2, %2, %1, 14, %1 +%macro FILT_C 3 +%if mmsize==32 + vperm2i128 m3, %2, %1, q0003 +%endif + PALIGNR m1, %2, %1, (mmsize-4), m3 + PALIGNR m2, %2, %1, (mmsize-2), m3 +%if mmsize==32 + vperm2i128 %1, %3, %2, q0003 +%endif PALIGNR m3, %3, %2, 4, %1 PALIGNR m4, %3, %2, 2, %1 paddw m3, m2 +%if mmsize==32 + mova m2, %1 +%endif mova %1, %3 - PALIGNR %3, %2, 6, m2 + PALIGNR %3, %3, %2, 6, m2 paddw m4, %2 paddw %3, m1 FILT_H %3, m3, m4 %endmacro %macro DO_FILT_C 4 - FILT_C %1, %2, %3, 6 - FILT_C %2, %1, %4, 6 - FILT_PACK %3, %4, 6, m15 + FILT_C %1, %2, %3 + FILT_C %2, %1, %4 + FILT_PACK %3, %4, m15, 6 +%if mmsize==32 + vpermq %3, %3, q3120 +%endif movntps [r5+r4], %3 %endmacro @@ -700,8 +779,14 @@ %endmacro %macro DO_FILT_H 3 - PALIGNR m1, %2, %1, 14, m3 - PALIGNR m2, %2, %1, 15, m3 +%if mmsize==32 + vperm2i128 m3, %2, %1, q0003 +%endif + PALIGNR m1, %2, %1, (mmsize-2), m3 + PALIGNR m2, %2, %1, (mmsize-1), m3 +%if mmsize==32 + vperm2i128 m3, %3, %2, q0003 +%endif PALIGNR m4, %3, %2, 1 , m3 PALIGNR m5, %3, %2, 2 , m3 PALIGNR m6, %3, %2, 3 , m3 @@ -717,14 +802,14 @@ paddw m2, m4 paddw m1, m5 paddw m2, m6 - FILT_PACK m1, m2, 5, m15 + FILT_PACK m1, m2, m15, 5 pshufb m1, [hpel_shuf] %else ; ssse3, avx ADD8TO16 m1, m6, m12, m3, m0 ; a ADD8TO16 m2, m5, m12, m3, m0 ; b ADD8TO16 %2, m4, m12, m3, m0 ; c FILT_V2 m1, m2, %2, m6, m5, m4 - FILT_PACK m1, m6, 5, m15 + FILT_PACK m1, m6, m15, 5 %endif movntps [r0+r4], m1 mova %2, %3 @@ -737,9 +822,9 @@ ;----------------------------------------------------------------------------- cglobal hpel_filter, 7,9,16 mov r7, r3 - sub r5d, 16 + sub r5d, mmsize mov r8, r1 - and r7, 15 + and r7, mmsize-1 sub r3, r7 add r0, r5 add r8, r5 @@ -751,13 +836,14 @@ sub r3, r2 sub r3, r2 mov r4, r7 - mova m15, [pw_16] %if cpuflag(ssse3) mova m0, [filt_mul51] mova m12, [filt_mul15] mova m14, [filt_mul20] + mova m15, [pw_1024] %else pxor m0, m0 + mova m15, [pw_16] %endif ;ALIGN 16 .loopy: @@ -765,16 +851,24 @@ DO_FILT_V m8, m7, m13, m12, 0 ;ALIGN 16 .loopx: - DO_FILT_V m6, m5, m11, m12, 16 + DO_FILT_V m6, m5, m11, m12, mmsize .lastx: +%if cpuflag(ssse3) + psrlw m15, 1 ; pw_512 +%else paddw m15, m15 ; pw_32 +%endif DO_FILT_C m9, m8, m7, m6 - psrlw m15, 1 ; pw_16 - movdqa m7, m5 +%if cpuflag(ssse3) + paddw m15, m15 ; pw_1024 +%else + psrlw m15, 1 ; pw_16 +%endif + mova m7, m5 DO_FILT_H m10, m13, m11 - add r4, 16 + add r4, mmsize jl .loopx - cmp r4, 16 + cmp r4, mmsize jl .lastx ; setup regs for next y sub r4, r7 @@ -797,6 +891,8 @@ HPEL INIT_XMM avx HPEL +INIT_YMM avx2 +HPEL %endif ; ARCH_X86_64 %undef movntq @@ -1131,115 +1227,109 @@ ;----------------------------------------------------------------------------- ; void *memcpy_aligned( void *dst, const void *src, size_t n ); ;----------------------------------------------------------------------------- -INIT_MMX -cglobal memcpy_aligned_mmx, 3,3 - test r2d, 16 - jz .copy32start - movq mm0, [r1 + r2 - 16] - movq mm1, [r1 + r2 - 8] - movq [r0 + r2 - 16], mm0 - movq [r0 + r2 - 8], mm1 - sub r2d, 16 -.copy32start - test r2d, r2d - jz .ret -.copy32: - movq 
mm0, [r1 + r2 - 32] - movq mm1, [r1 + r2 - 24] - movq mm2, [r1 + r2 - 16] - movq mm3, [r1 + r2 - 8] - movq [r0 + r2 - 32], mm0 - movq [r0 + r2 - 24], mm1 - movq [r0 + r2 - 16], mm2 - movq [r0 + r2 - 8], mm3 - sub r2d, 32 - jg .copy32 -.ret - RET - -;----------------------------------------------------------------------------- -; void *memcpy_aligned( void *dst, const void *src, size_t n ); -;----------------------------------------------------------------------------- -cglobal memcpy_aligned_sse2, 3,3 +%macro MEMCPY 0 +cglobal memcpy_aligned, 3,3 +%if mmsize == 16 test r2d, 16 - jz .copy32 - movdqa xmm0, [r1 + r2 - 16] - movdqa [r0 + r2 - 16], xmm0 + jz .copy2 + mova m0, [r1+r2-16] + mova [r0+r2-16], m0 sub r2d, 16 -.copy32: - test r2d, 32 - jz .copy64start - movdqa xmm0, [r1 + r2 - 32] - movdqa [r0 + r2 - 32], xmm0 - movdqa xmm1, [r1 + r2 - 16] - movdqa [r0 + r2 - 16], xmm1 - sub r2d, 32 -.copy64start +.copy2: +%endif + test r2d, 2*mmsize + jz .copy4start + mova m0, [r1+r2-1*mmsize] + mova m1, [r1+r2-2*mmsize] + mova [r0+r2-1*mmsize], m0 + mova [r0+r2-2*mmsize], m1 + sub r2d, 2*mmsize +.copy4start: test r2d, r2d jz .ret -.copy64: - movdqa xmm0, [r1 + r2 - 64] - movdqa [r0 + r2 - 64], xmm0 - movdqa xmm1, [r1 + r2 - 48] - movdqa [r0 + r2 - 48], xmm1 - movdqa xmm2, [r1 + r2 - 32] - movdqa [r0 + r2 - 32], xmm2 - movdqa xmm3, [r1 + r2 - 16] - movdqa [r0 + r2 - 16], xmm3 - sub r2d, 64 - jg .copy64 +.copy4: + mova m0, [r1+r2-1*mmsize] + mova m1, [r1+r2-2*mmsize] + mova m2, [r1+r2-3*mmsize] + mova m3, [r1+r2-4*mmsize] + mova [r0+r2-1*mmsize], m0 + mova [r0+r2-2*mmsize], m1 + mova [r0+r2-3*mmsize], m2 + mova [r0+r2-4*mmsize], m3 + sub r2d, 4*mmsize + jg .copy4 .ret: REP_RET +%endmacro + +INIT_MMX mmx +MEMCPY +INIT_XMM sse +MEMCPY ;----------------------------------------------------------------------------- ; void *memzero_aligned( void *dst, size_t n ); ;----------------------------------------------------------------------------- -%macro MEMZERO 0 +%macro MEMZERO 1 cglobal memzero_aligned, 2,2 add r0, r1 neg r1 +%if mmsize == 8 pxor m0, m0 +%else + xorps m0, m0 +%endif .loop: %assign i 0 -%rep 8 +%rep %1 mova [r0 + r1 + i], m0 %assign i i+mmsize %endrep - add r1, mmsize*8 + add r1, mmsize*%1 jl .loop RET %endmacro INIT_MMX mmx -MEMZERO -INIT_XMM sse2 -MEMZERO - - +MEMZERO 8 +INIT_XMM sse +MEMZERO 8 +INIT_YMM avx +MEMZERO 4 %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- ; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride ) ;----------------------------------------------------------------------------- -INIT_XMM -cglobal integral_init4h_sse4, 3,4 +%macro INTEGRAL_INIT4H 0 +cglobal integral_init4h, 3,4 lea r3, [r0+r2*2] add r1, r2 neg r2 pxor m4, m4 .loop: - movdqa m0, [r1+r2] - movdqa m1, [r1+r2+16] + mova m0, [r1+r2] +%if mmsize==32 + movu m1, [r1+r2+8] +%else + mova m1, [r1+r2+16] palignr m1, m0, 8 +%endif mpsadbw m0, m4, 0 mpsadbw m1, m4, 0 paddw m0, [r0+r2*2] - paddw m1, [r0+r2*2+16] - movdqa [r3+r2*2 ], m0 - movdqa [r3+r2*2+16], m1 - add r2, 16 + paddw m1, [r0+r2*2+mmsize] + mova [r3+r2*2 ], m0 + mova [r3+r2*2+mmsize], m1 + add r2, mmsize jl .loop RET +%endmacro + +INIT_XMM sse4 +INTEGRAL_INIT4H +INIT_YMM avx2 +INTEGRAL_INIT4H %macro INTEGRAL_INIT8H 0 cglobal integral_init8h, 3,4 @@ -1248,20 +1338,26 @@ neg r2 pxor m4, m4 .loop: - movdqa m0, [r1+r2] - movdqa m1, [r1+r2+16] + mova m0, [r1+r2] +%if mmsize==32 + movu m1, [r1+r2+8] + mpsadbw m2, m0, m4, 100100b + mpsadbw m3, m1, m4, 100100b +%else + mova m1, [r1+r2+16] 
palignr m1, m0, 8 - mpsadbw m2, m0, m4, 4 - mpsadbw m3, m1, m4, 4 + mpsadbw m2, m0, m4, 100b + mpsadbw m3, m1, m4, 100b +%endif mpsadbw m0, m4, 0 mpsadbw m1, m4, 0 paddw m0, [r0+r2*2] - paddw m1, [r0+r2*2+16] + paddw m1, [r0+r2*2+mmsize] paddw m0, m2 paddw m1, m3 - movdqa [r3+r2*2 ], m0 - movdqa [r3+r2*2+16], m1 - add r2, 16 + mova [r3+r2*2 ], m0 + mova [r3+r2*2+mmsize], m1 + add r2, mmsize jl .loop RET %endmacro @@ -1270,6 +1366,8 @@ INTEGRAL_INIT8H INIT_XMM avx INTEGRAL_INIT8H +INIT_YMM avx2 +INTEGRAL_INIT8H %endif ; !HIGH_BIT_DEPTH %macro INTEGRAL_INIT_8V 0 @@ -1277,7 +1375,7 @@ ; void integral_init8v( uint16_t *sum8, intptr_t stride ) ;----------------------------------------------------------------------------- cglobal integral_init8v, 3,3 - shl r1, 1 + add r1, r1 add r0, r1 lea r2, [r0+r1*8] neg r1 @@ -1297,12 +1395,14 @@ INTEGRAL_INIT_8V INIT_XMM sse2 INTEGRAL_INIT_8V +INIT_YMM avx2 +INTEGRAL_INIT_8V ;----------------------------------------------------------------------------- ; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride ) ;----------------------------------------------------------------------------- -INIT_MMX -cglobal integral_init4v_mmx, 3,5 +INIT_MMX mmx +cglobal integral_init4v, 3,5 shl r2, 1 lea r3, [r0+r2*4] lea r4, [r0+r2*8] @@ -1323,8 +1423,8 @@ jge .loop RET -INIT_XMM -cglobal integral_init4v_sse2, 3,5 +INIT_XMM sse2 +cglobal integral_init4v, 3,5 shl r2, 1 add r0, r2 add r1, r2 @@ -1349,7 +1449,8 @@ jl .loop RET -cglobal integral_init4v_ssse3, 3,5 +INIT_XMM ssse3 +cglobal integral_init4v, 3,5 shl r2, 1 add r0, r2 add r1, r2 @@ -1374,6 +1475,28 @@ jl .loop RET +INIT_YMM avx2 +cglobal integral_init4v, 3,5 + add r2, r2 + add r0, r2 + add r1, r2 + lea r3, [r0+r2*4] + lea r4, [r0+r2*8] + neg r2 +.loop: + mova m2, [r0+r2] + movu m1, [r4+r2+8] + paddw m0, m2, [r0+r2+8] + paddw m1, [r4+r2] + mova m3, [r3+r2] + psubw m1, m0 + psubw m3, m2 + mova [r0+r2], m1 + mova [r1+r2], m3 + add r2, 32 + jl .loop + RET + %macro FILT8x4 7 mova %3, [r0+%7] mova %4, [r0+r5+%7] @@ -1394,6 +1517,43 @@ %endif %endmacro +%macro FILT32x4U 4 + mova m1, [r0+r5] + pavgb m0, m1, [r0] + movu m3, [r0+r5+1] + pavgb m2, m3, [r0+1] + pavgb m1, [r0+r5*2] + pavgb m3, [r0+r5*2+1] + pavgb m0, m2 + pavgb m1, m3 + + mova m3, [r0+r5+mmsize] + pavgb m2, m3, [r0+mmsize] + movu m5, [r0+r5+1+mmsize] + pavgb m4, m5, [r0+1+mmsize] + pavgb m3, [r0+r5*2+mmsize] + pavgb m5, [r0+r5*2+1+mmsize] + pavgb m2, m4 + pavgb m3, m5 + + pshufb m0, m7 + pshufb m1, m7 + pshufb m2, m7 + pshufb m3, m7 + punpckhqdq m4, m0, m2 + punpcklqdq m0, m0, m2 + punpckhqdq m5, m1, m3 + punpcklqdq m2, m1, m3 + vpermq m0, m0, q3120 + vpermq m1, m4, q3120 + vpermq m2, m2, q3120 + vpermq m3, m5, q3120 + mova [%1], m0 + mova [%2], m1 + mova [%3], m2 + mova [%4], m3 +%endmacro + %macro FILT16x2 4 mova m3, [r0+%4+mmsize] mova m2, [r0+%4] @@ -1497,6 +1657,10 @@ FIX_STRIDES r5 shl dword r7m, 1 %endif +%if mmsize >= 16 + add dword r7m, mmsize-1 + and dword r7m, ~(mmsize-1) +%endif ; src += 2*(height-1)*stride + 2*width mov r6d, r8m dec r6d @@ -1554,17 +1718,9 @@ sub r6d, mmsize jg .hloop %else ; !HIGH_BIT_DEPTH -%if mmsize == 16 - ; adjust for the odd end case - mov r6d, r7m - and r6d, 8 - sub r1, r6 - sub r2, r6 - sub r3, r6 - sub r4, r6 - add dst_gap, r6d -%endif ; mmsize -%if cpuflag(xop) +%if cpuflag(avx2) + mova m7, [deinterleave_shuf] +%elif cpuflag(xop) mova m6, [deinterleave_shuf32a] mova m7, [deinterleave_shuf32b] %else @@ -1574,44 +1730,22 @@ .vloop: mov r6d, r7m %ifnidn cpuname, mmx2 +%if mmsize <= 16 mova m0, [r0] mova m1, 
[r0+r5] pavgb m0, m1 pavgb m1, [r0+r5*2] %endif -%if mmsize == 16 - test r6d, 8 - jz .hloop - sub r0, 16 - FILT8x4 m0, m1, m2, m3, m4, m5, 0 -%if cpuflag(xop) - mova m4, m0 - vpperm m0, m4, m1, m6 - vpperm m1, m4, m1, m7 - movq [r1], m0 - movq [r2], m1 - movhps [r3], m0 - movhps [r4], m1 -%else - packuswb m0, m4 - packuswb m1, m5 - movq [r1], m0 - movhps [r2], m0 - movq [r3], m1 - movhps [r4], m1 %endif - mova m0, m2 - mova m1, m3 - sub r6d, 8 - jz .skip -%endif ; mmsize .hloop: sub r0, mmsize*2 sub r1, mmsize sub r2, mmsize sub r3, mmsize sub r4, mmsize -%ifdef m8 +%if mmsize==32 + FILT32x4U r1, r2, r3, r4 +%elifdef m8 FILT8x4 m0, m1, m2, m3, m10, m11, mmsize mova m8, m0 mova m9, m1 @@ -1669,6 +1803,10 @@ FRAME_INIT_LOWRES INIT_XMM xop FRAME_INIT_LOWRES +%if HIGH_BIT_DEPTH==0 +INIT_YMM avx2 +FRAME_INIT_LOWRES +%endif ;----------------------------------------------------------------------------- ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, @@ -1741,68 +1879,76 @@ INIT_XMM fma4 MBTREE -%macro INT16_TO_FLOAT 1 -%if cpuflag(avx2) - vpmovzxwd ymm%1, xmm%1 -%else - vpunpckhwd xmm4, xmm%1, xmm7 - vpunpcklwd xmm%1, xmm7 - vinsertf128 ymm%1, ymm%1, xmm4, 1 -%endif - vcvtdq2ps ymm%1, ymm%1 +%macro INT16_UNPACK 1 + vpunpckhwd xm4, xm%1, xm7 + vpunpcklwd xm%1, xm7 + vinsertf128 m%1, m%1, xm4, 1 %endmacro ; FIXME: align loads/stores to 16 bytes %macro MBTREE_AVX 0 cglobal mbtree_propagate_cost, 7,7,8 - add r6d, r6d - lea r0, [r0+r6*2] - add r1, r6 - add r2, r6 - add r3, r6 - add r4, r6 - neg r6 - vmovdqa xmm5, [pw_3fff] - vbroadcastss ymm6, [r5] - vmulps ymm6, ymm6, [pf_inv256] + add r6d, r6d + lea r0, [r0+r6*2] + add r1, r6 + add r2, r6 + add r3, r6 + add r4, r6 + neg r6 + mova xm5, [pw_3fff] + vbroadcastss m6, [r5] + mulps m6, [pf_inv256] %if notcpuflag(avx2) - vpxor xmm7, xmm7 + pxor xm7, xm7 %endif .loop: - vmovdqu xmm0, [r2+r6] ; intra - vmovdqu xmm1, [r4+r6] ; invq - vmovdqu xmm2, [r1+r6] ; prop - vpand xmm3, xmm5, [r3+r6] ; inter - INT16_TO_FLOAT 0 - INT16_TO_FLOAT 1 - INT16_TO_FLOAT 2 - INT16_TO_FLOAT 3 -%if cpuflag(fma3) - vmulps ymm1, ymm1, ymm0 - vsubps ymm4, ymm0, ymm3 - fmaddps ymm1, ymm1, ymm6, ymm2 - vrcpps ymm3, ymm0 - vmulps ymm2, ymm0, ymm3 - vmulps ymm1, ymm1, ymm4 - vaddps ymm4, ymm3, ymm3 - fnmaddps ymm4, ymm2, ymm3, ymm4 - vmulps ymm1, ymm1, ymm4 +%if cpuflag(avx2) + pmovzxwd m0, [r2+r6] ; intra + pmovzxwd m1, [r4+r6] ; invq + pmovzxwd m2, [r1+r6] ; prop + pand xm3, xm5, [r3+r6] ; inter + pmovzxwd m3, xm3 + pmaddwd m1, m0 + psubd m4, m0, m3 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m4, m4 + fmaddps m1, m1, m6, m2 + rcpps m3, m0 + mulps m2, m0, m3 + mulps m1, m4 + addps m4, m3, m3 + fnmaddps m4, m2, m3, m4 + mulps m1, m4 %else - vmulps ymm1, ymm1, ymm0 - vsubps ymm4, ymm0, ymm3 - vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8 - vaddps ymm1, ymm1, ymm2 ; prop + (intra*invq*fps_factor>>8) - vrcpps ymm3, ymm0 ; 1 / intra 1st approximation - vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx) - vmulps ymm2, ymm2, ymm3 ; intra * (1/intra 1st approx)^2 - vmulps ymm1, ymm1, ymm4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) - vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx) - vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra - vmulps ymm1, ymm1, ymm3 ; / intra + movu xm0, [r2+r6] + movu xm1, [r4+r6] + movu xm2, [r1+r6] + pand xm3, xm5, [r3+r6] + INT16_UNPACK 0 + INT16_UNPACK 1 + INT16_UNPACK 2 + INT16_UNPACK 3 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + mulps m1, m0 + 
subps m4, m0, m3 + mulps m1, m6 ; intra*invq*fps_factor>>8 + addps m1, m2 ; prop + (intra*invq*fps_factor>>8) + rcpps m3, m0 ; 1 / intra 1st approximation + mulps m2, m0, m3 ; intra * (1/intra 1st approx) + mulps m2, m3 ; intra * (1/intra 1st approx)^2 + mulps m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) + addps m3, m3 ; 2 * (1/intra 1st approx) + subps m3, m2 ; 2nd approximation for 1/intra + mulps m1, m3 ; / intra %endif - vcvtps2dq ymm1, ymm1 - vmovdqu [r0+r6*2], ymm1 - add r6, 16 + vcvtps2dq m1, m1 + movu [r0+r6*2], m1 + add r6, 16 jl .loop RET %endmacro
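The mbtree_propagate_cost kernels compute (prop + intra*invq*fps_factor/256) * (intra - inter) / intra per block, with inter masked to 14 bits (pw_3fff), and avoid a real division by refining rcpps' roughly 12-bit reciprocal estimate with one Newton-Raphson step, x1 = x0*(2 - intra*x0), which is what the fnmaddps sequence implements. A scalar model of the loop (my restatement for illustration, not the C reference in x264; cvtps2dq rounds to nearest, approximated here with +0.5f):

#include <stdint.h>

/* Illustrative scalar version of the propagate-cost loop above. */
static void propagate_cost_model( int *dst, const uint16_t *prop, const uint16_t *intra,
                                  const uint16_t *inter, const uint16_t *invq,
                                  float fps_factor, int len )
{
    float fps = fps_factor * (1.0f / 256.0f);      /* the pf_inv256 multiply */
    for( int i = 0; i < len; i++ )
    {
        float ic  = intra[i];
        float amt = prop[i] + ic * invq[i] * fps;  /* prop + intra*invq*fps_factor>>8 */
        float x0  = 1.0f / ic;                     /* stands in for rcpps' estimate */
        float x1  = x0 * (2.0f - ic * x0);         /* one Newton-Raphson refinement */
        dst[i] = (int)( amt * (ic - (inter[i] & 0x3fff)) * x1 + 0.5f );
    }
}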
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-c.c -> x264-snapshot-20130723-2245.tar.bz2/common/x86/mc-c.c
Changed
@@ -35,7 +35,8 @@ #define DECL_SUF( func, args )\ void func##_mmx2 args;\ void func##_sse2 args;\ - void func##_ssse3 args; + void func##_ssse3 args;\ + void func##_avx2 args; DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) @@ -72,15 +73,20 @@ MC_WEIGHT( 12, ssse3 ) MC_WEIGHT( 16, ssse3 ) MC_WEIGHT( 20, ssse3 ) +MC_WEIGHT( 8, avx2 ) +MC_WEIGHT( 16, avx2 ) +MC_WEIGHT( 20, avx2 ) #undef MC_OFFSET #undef MC_WEIGHT -void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w16_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w16_sse2( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w16_aligned_sse2( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w8_sse ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_mmx( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_sse( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_aligned_sse( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int ); +void x264_mc_copy_w16_aligned_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int ); void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); @@ -121,18 +127,23 @@ void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); -void *x264_memcpy_aligned_mmx ( void *dst, const void *src, size_t n ); -void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n ); -void x264_memzero_aligned_mmx ( void *dst, size_t n ); -void x264_memzero_aligned_sse2( void *dst, size_t n ); +void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n ); +void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n ); +void x264_memzero_aligned_mmx( void *dst, size_t n ); +void x264_memzero_aligned_sse( void *dst, size_t n ); +void x264_memzero_aligned_avx( void *dst, size_t n ); void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init8h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init4v_mmx ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); +void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride ); void 
x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride ); +void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride ); void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, @@ -151,7 +162,7 @@ MC_CHROMA(ssse3) MC_CHROMA(ssse3_cache64) MC_CHROMA(avx) -MC_CHROMA(avx_cache64) +MC_CHROMA(avx2) #define LOWRES(cpu)\ void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\ @@ -162,6 +173,7 @@ LOWRES(ssse3) LOWRES(avx) LOWRES(xop) +LOWRES(avx2) #define PIXEL_AVG_W(width,cpu)\ void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ); @@ -176,6 +188,7 @@ PIXEL_AVG_WALL(sse2) PIXEL_AVG_WALL(sse2_misalign) PIXEL_AVG_WALL(cache64_ssse3) +PIXEL_AVG_WALL(avx2) #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\ static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\ @@ -194,6 +207,8 @@ #define x264_pixel_avg2_w20_mmx2 x264_pixel_avg2_w18_mmx2 #define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w10_sse2 #define x264_pixel_avg2_w20_sse2 x264_pixel_avg2_w18_sse2 +#define x264_pixel_avg2_w12_avx2 x264_pixel_avg2_w16_avx2 +#define x264_pixel_avg2_w20_avx2 x264_pixel_avg2_w18_avx2 #else /* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */ #define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3 @@ -205,6 +220,7 @@ PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2) #if HIGH_BIT_DEPTH PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2) +PIXEL_AVG_WTAB(avx2, mmx2, sse2, avx2, avx2, avx2) #else // !HIGH_BIT_DEPTH #if ARCH_X86 PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2) @@ -214,6 +230,8 @@ PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign) PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2) PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2) +PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2) +PIXEL_AVG_WTAB(avx2, mmx2, mmx2, sse2, sse2, avx2) #endif // HIGH_BIT_DEPTH #define MC_COPY_WTAB(instr, name1, name2, name3)\ @@ -228,9 +246,10 @@ MC_COPY_WTAB(mmx,mmx,mmx,mmx) #if HIGH_BIT_DEPTH -MC_COPY_WTAB(sse2,mmx,sse2,sse2) +MC_COPY_WTAB(sse,mmx,sse,sse) +MC_COPY_WTAB(avx,mmx,sse,avx) #else -MC_COPY_WTAB(sse2,mmx,mmx,sse2) +MC_COPY_WTAB(sse,mmx,mmx,sse) #endif #define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\ @@ -282,6 +301,7 @@ MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16) MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16) MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16) +MC_WEIGHT_WTAB(weight,avx2,ssse3,avx2,16) static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w ) { @@ -357,14 +377,17 @@ } MC_LUMA(mmx2,mmx2,mmx) -MC_LUMA(sse2,sse2,sse2) -#if !HIGH_BIT_DEPTH +MC_LUMA(sse2,sse2,sse) +#if HIGH_BIT_DEPTH +MC_LUMA(avx2,avx2,avx) +#else #if ARCH_X86 MC_LUMA(cache32_mmx2,cache32_mmx2,mmx) MC_LUMA(cache64_mmx2,cache64_mmx2,mmx) #endif -MC_LUMA(cache64_sse2,cache64_sse2,sse2) -MC_LUMA(cache64_ssse3,cache64_ssse3,sse2) +MC_LUMA(cache64_sse2,cache64_sse2,sse) +MC_LUMA(cache64_ssse3,cache64_ssse3,sse) +MC_LUMA(cache64_ssse3_atom,cache64_ssse3_atom,sse) #endif // !HIGH_BIT_DEPTH #define GET_REF(name)\ @@ -400,6 +423,7 @@ 
GET_REF(mmx2) GET_REF(sse2) +GET_REF(avx2) #if !HIGH_BIT_DEPTH #if ARCH_X86 GET_REF(cache32_mmx2) @@ -408,6 +432,7 @@ GET_REF(sse2_misalign) GET_REF(cache64_sse2) GET_REF(cache64_ssse3) +GET_REF(cache64_ssse3_atom) #endif // !HIGH_BIT_DEPTH #define HPEL(align, cpu, cpuv, cpuc, cpuh)\ @@ -425,8 +450,8 @@ width += realign;\ while( height-- )\ {\ - x264_hpel_filter_v_##cpuv( dstv, src, buf+8, stride, width );\ - x264_hpel_filter_c_##cpuc( dstc, buf+8, width );\ + x264_hpel_filter_v_##cpuv( dstv, src, buf+16, stride, width );\ + x264_hpel_filter_c_##cpuc( dstc, buf+16, width );\ x264_hpel_filter_h_##cpuh( dsth, src, width );\ dsth += stride;\ dstv += stride;\ @@ -445,10 +470,12 @@ void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); void x264_hpel_filter_avx ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); +void x264_hpel_filter_avx2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); #else HPEL(16, sse2, sse2, sse2, sse2) HPEL(16, ssse3, ssse3, ssse3, ssse3) HPEL(16, avx, avx, avx, avx) +HPEL(32, avx2, avx2, avx2, avx2) #endif HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2) #endif // HIGH_BIT_DEPTH @@ -545,6 +572,12 @@ pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmx2; + if( cpu&X264_CPU_SSE ) + { + pf->memcpy_aligned = x264_memcpy_aligned_sse; + pf->memzero_aligned = x264_memzero_aligned_sse; + } + #if HIGH_BIT_DEPTH #if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) ) @@ -569,8 +602,6 @@ pf->hpel_filter = x264_hpel_filter_sse2; } - pf->memcpy_aligned = x264_memcpy_aligned_sse2; - pf->memzero_aligned = x264_memzero_aligned_sse2; pf->integral_init4v = x264_integral_init4v_sse2; pf->integral_init8v = x264_integral_init8v_sse2; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2; @@ -591,7 +622,7 @@ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_sse2; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_sse2; - pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2; + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse; pf->weight = x264_mc_weight_wtab_sse2; if( !(cpu&X264_CPU_STACK_MOD4) ) @@ -602,7 +633,7 @@ pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; - if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) ) + if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) ) pf->integral_init4v = x264_integral_init4v_ssse3; if( !(cpu&X264_CPU_AVX) ) @@ -614,12 +645,16 @@ pf->plane_copy_interleave = x264_plane_copy_interleave_avx; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx; pf->store_interleave_chroma = x264_store_interleave_chroma_avx; + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_avx; if( cpu&X264_CPU_XOP ) pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop; + + if( cpu&X264_CPU_AVX2 ) + pf->mc_luma = mc_luma_avx2; #else // !HIGH_BIT_DEPTH #if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead @@ -640,55 +675,53 @@ if( !(cpu&X264_CPU_SSE2) ) return; - pf->memcpy_aligned = x264_memcpy_aligned_sse2; - pf->memzero_aligned = x264_memzero_aligned_sse2; pf->integral_init4v = 
x264_integral_init4v_sse2; pf->integral_init8v = x264_integral_init8v_sse2; pf->hpel_filter = x264_hpel_filter_sse2_amd; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2; - if( cpu&X264_CPU_SSE2_IS_SLOW ) - return; - - pf->weight = x264_mc_weight_wtab_sse2; - if( !(cpu&X264_CPU_SLOW_ATOM) ) + if( !(cpu&X264_CPU_SSE2_IS_SLOW) ) { - pf->offsetadd = x264_mc_offsetadd_wtab_sse2; - pf->offsetsub = x264_mc_offsetsub_wtab_sse2; - } - - pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2; - pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2; - pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2; - pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2; - pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2; - pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2; - pf->hpel_filter = x264_hpel_filter_sse2; - if( cpu&X264_CPU_SSE_MISALIGN ) - pf->hpel_filter = x264_hpel_filter_sse2_misalign; - pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; - if( !(cpu&X264_CPU_STACK_MOD4) ) - pf->mc_chroma = x264_mc_chroma_sse2; - - if( cpu&X264_CPU_SSE2_IS_FAST ) - { - pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium? - pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2; - pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2; - pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; - pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; - pf->mc_luma = mc_luma_sse2; - pf->get_ref = get_ref_sse2; - if( cpu&X264_CPU_CACHELINE_64 ) + pf->weight = x264_mc_weight_wtab_sse2; + if( !(cpu&X264_CPU_SLOW_ATOM) ) { - pf->mc_luma = mc_luma_cache64_sse2; - pf->get_ref = get_ref_cache64_sse2; + pf->offsetadd = x264_mc_offsetadd_wtab_sse2; + pf->offsetsub = x264_mc_offsetsub_wtab_sse2; } + + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse; + pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2; + pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2; + pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2; + pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2; + pf->hpel_filter = x264_hpel_filter_sse2; if( cpu&X264_CPU_SSE_MISALIGN ) + pf->hpel_filter = x264_hpel_filter_sse2_misalign; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_sse2; + + if( cpu&X264_CPU_SSE2_IS_FAST ) { - pf->get_ref = get_ref_sse2_misalign; - if( !(cpu&X264_CPU_STACK_MOD4) ) - pf->mc_chroma = x264_mc_chroma_sse2_misalign; + pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium? 
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2; + pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; + pf->mc_luma = mc_luma_sse2; + pf->get_ref = get_ref_sse2; + if( cpu&X264_CPU_CACHELINE_64 ) + { + pf->mc_luma = mc_luma_cache64_sse2; + pf->get_ref = get_ref_cache64_sse2; + } + if( cpu&X264_CPU_SSE_MISALIGN ) + { + pf->get_ref = get_ref_sse2_misalign; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_sse2_misalign; + } } } @@ -705,12 +738,21 @@ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; - pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3; - pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3; - pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3; + if( !(cpu&X264_CPU_SLOW_PSHUFB) ) + { + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3; + } - pf->hpel_filter = x264_hpel_filter_ssse3; - pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; + if( !(cpu&X264_CPU_SLOW_PALIGNR) ) + { +#if ARCH_X86_64 + if( !(cpu&X264_CPU_SLOW_ATOM) ) /* The 64-bit version is slower, but the 32-bit version is faster? */ +#endif + pf->hpel_filter = x264_hpel_filter_ssse3; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; + } if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_ssse3; @@ -720,13 +762,17 @@ pf->mc_chroma = x264_mc_chroma_ssse3_cache64; pf->mc_luma = mc_luma_cache64_ssse3; pf->get_ref = get_ref_cache64_ssse3; - - /* ssse3 weight is slower on Nehalem, so only assign here. */ - pf->weight_cache = x264_weight_cache_ssse3; - pf->weight = x264_mc_weight_wtab_ssse3; + if( cpu&X264_CPU_SLOW_ATOM ) + { + pf->mc_luma = mc_luma_cache64_ssse3_atom; + pf->get_ref = get_ref_cache64_ssse3_atom; + } } - if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) ) + pf->weight_cache = x264_weight_cache_ssse3; + pf->weight = x264_mc_weight_wtab_ssse3; + + if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) ) pf->integral_init4v = x264_integral_init4v_ssse3; if( !(cpu&X264_CPU_SSE4) ) @@ -742,18 +788,30 @@ pf->integral_init8h = x264_integral_init8h_avx; pf->hpel_filter = x264_hpel_filter_avx; - /* ssse3 weight seems to be faster again on Sandy Bridge and Bulldozer. 
*/ - pf->weight_cache = x264_weight_cache_ssse3; - pf->weight = x264_mc_weight_wtab_ssse3; if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_avx; if( cpu&X264_CPU_XOP ) pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop; + + if( cpu&X264_CPU_AVX2 ) + { + pf->hpel_filter = x264_hpel_filter_avx2; + pf->mc_chroma = x264_mc_chroma_avx2; + pf->weight = x264_mc_weight_wtab_avx2; + pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx2; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx2; + pf->integral_init8v = x264_integral_init8v_avx2; + pf->integral_init4v = x264_integral_init4v_avx2; + pf->integral_init8h = x264_integral_init8h_avx2; + pf->integral_init4h = x264_integral_init4h_avx2; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2; + } #endif // HIGH_BIT_DEPTH if( !(cpu&X264_CPU_AVX) ) return; + pf->memzero_aligned = x264_memzero_aligned_avx; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx; if( cpu&X264_CPU_FMA4 ) @@ -761,6 +819,7 @@ if( !(cpu&X264_CPU_AVX2) ) return; + pf->get_ref = get_ref_avx2; if( cpu&X264_CPU_FMA3 ) pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3;
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/pixel-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/pixel-a.asm
Changed
@@ -32,8 +32,17 @@ %include "x86util.asm" SECTION_RODATA 32 +hmul_16p: times 16 db 1 + times 8 db 1, -1 +hmul_8p: times 8 db 1 + times 4 db 1, -1 + times 8 db 1 + times 4 db 1, -1 mask_ff: times 16 db 0xff times 16 db 0 +mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1 +mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1 +mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1 %if BIT_DEPTH == 10 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63 @@ -46,12 +55,7 @@ ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 %endif -mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1 -mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1 -mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1 hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 -hmul_8p: times 8 db 1 - times 4 db 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 pb_pppm: times 4 db 1,1,1,-1 @@ -85,6 +89,7 @@ intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1 intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0 +ALIGN 32 intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5 intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4 intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1 @@ -120,9 +125,29 @@ transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 sw_f0: dq 0xfff0, 0 -sq_0f: dq 0xffffffff, 0 pd_f0: times 4 dd 0xffff0000 +pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7 + +ads_mvs_shuffle: +%macro ADS_MVS_SHUFFLE 8 + %assign y x + %rep 8 + %rep 7 + %rotate (~y)&1 + %assign y y>>((~y)&1) + %endrep + db %1*2, %1*2+1 + %rotate 1 + %assign y y>>1 + %endrep +%endmacro +%assign x 0 +%rep 256 + ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7 +%assign x x+1 +%endrep + SECTION .text cextern pb_0 @@ -136,7 +161,9 @@ cextern pw_ppmmppmm cextern pw_pmpmpmpm cextern pw_pmmpzzzz +cextern pd_1 cextern hsub_mul +cextern popcnt_table ;============================================================================= ; SSD @@ -144,69 +171,67 @@ %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; int pixel_ssd_MxN( uint16_t *, intptr_t, uint16_t *, intptr_t ) +; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SSD_ONE 2 -cglobal pixel_ssd_%1x%2, 4,5,6 - mov r4, %1*%2/mmsize +cglobal pixel_ssd_%1x%2, 4,7,6 + FIX_STRIDES r1, r3 +%if mmsize == %1*2 + %define offset0_1 r1 + %define offset0_2 r1*2 + %define offset0_3 r5 + %define offset1_1 r3 + %define offset1_2 r3*2 + %define offset1_3 r6 + lea r5, [3*r1] + lea r6, [3*r3] +%elif mmsize == %1 + %define offset0_1 mmsize + %define offset0_2 r1 + %define offset0_3 r1+mmsize + %define offset1_1 mmsize + %define offset1_2 r3 + %define offset1_3 r3+mmsize +%elif mmsize == %1/2 + %define offset0_1 mmsize + %define offset0_2 mmsize*2 + %define offset0_3 mmsize*3 + %define offset1_1 mmsize + %define offset1_2 mmsize*2 + %define offset1_3 mmsize*3 +%endif + %assign %%n %2/(2*mmsize/%1) +%if %%n > 1 + mov r4d, %%n +%endif pxor m0, m0 .loop mova m1, [r0] -%if %1 <= mmsize/2 - mova m3, [r0+r1*2] - %define offset r3*2 - %define num_rows 2 -%else - mova m3, [r0+mmsize] - %define offset mmsize - %define num_rows 1 -%endif - lea r0, [r0+r1*2*num_rows] + mova m2, [r0+offset0_1] + mova m3, [r0+offset0_2] + mova m4, [r0+offset0_3] psubw m1, [r2] - psubw m3, [r2+offset] - lea r2, [r2+r3*2*num_rows] + psubw m2, [r2+offset1_1] + psubw m3, [r2+offset1_2] + psubw m4, 
[r2+offset1_3] +%if %%n > 1 + lea r0, [r0+r1*(%2/%%n)] + lea r2, [r2+r3*(%2/%%n)] +%endif pmaddwd m1, m1 + pmaddwd m2, m2 pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 paddd m0, m1 paddd m0, m3 - dec r4 +%if %%n > 1 + dec r4d jg .loop +%endif HADDD m0, m5 - movd eax, m0 - RET -%endmacro - -%macro SSD_16_MMX 2 -cglobal pixel_ssd_%1x%2, 4,5 - mov r4, %1*%2/mmsize/2 - pxor m0, m0 -.loop - mova m1, [r0] - mova m2, [r2] - mova m3, [r0+mmsize] - mova m4, [r2+mmsize] - mova m5, [r0+mmsize*2] - mova m6, [r2+mmsize*2] - mova m7, [r0+mmsize*3] - psubw m1, m2 - psubw m3, m4 - mova m2, [r2+mmsize*3] - psubw m5, m6 - pmaddwd m1, m1 - psubw m7, m2 - pmaddwd m3, m3 - pmaddwd m5, m5 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - pmaddwd m7, m7 - paddd m1, m3 - paddd m5, m7 - paddd m0, m1 - paddd m0, m5 - dec r4 - jg .loop - HADDD m0, m7 - movd eax, m0 + movd eax, xm0 RET %endmacro @@ -217,14 +242,17 @@ SSD_ONE 8, 4 SSD_ONE 8, 8 SSD_ONE 8, 16 -SSD_16_MMX 16, 8 -SSD_16_MMX 16, 16 +SSD_ONE 16, 8 +SSD_ONE 16, 16 INIT_XMM sse2 SSD_ONE 8, 4 SSD_ONE 8, 8 SSD_ONE 8, 16 SSD_ONE 16, 8 SSD_ONE 16, 16 +INIT_YMM avx2 +SSD_ONE 16, 8 +SSD_ONE 16, 16 %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 @@ -287,6 +315,23 @@ punpcklbw m%2, m%4 %endmacro +%macro LOAD_AVX2 5 + mova xm%1, %3 + vinserti128 m%1, m%1, %4, 1 +%if %5 + lea t0, [t0+2*t1] +%endif +%endmacro + +%macro JOIN_AVX2 7 + mova xm%2, %5 + vinserti128 m%2, m%2, %6, 1 +%if %7 + lea t2, [t2+2*t3] +%endif + SBUTTERFLY bw, %1, %2, %3 +%endmacro + %macro SSD_LOAD_HALF 5 LOAD 1, 2, [t0+%1], [t0+%3], 1 JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1 @@ -409,8 +454,15 @@ %endif dec al jg .loop +%if mmsize==32 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + HADDD xm0, xm1 + movd eax, xm0 +%else HADDD m0, m1 movd eax, m0 +%endif RET %endif %endmacro @@ -462,6 +514,11 @@ SSD 16, 8 SSD 8, 16 SSD 8, 4 +%define LOAD LOAD_AVX2 +%define JOIN JOIN_AVX2 +INIT_YMM avx2 +SSD 16, 16 +SSD 16, 8 %assign function_align 16 %endif ; !HIGH_BIT_DEPTH @@ -500,7 +557,7 @@ psubw m1, [r2+r6+mmsize] PSHUFLW m0, m0, q3120 PSHUFLW m1, m1, q3120 -%if mmsize==16 +%if mmsize >= 16 pshufhw m0, m0, q3120 pshufhw m1, m1, q3120 %endif @@ -510,8 +567,13 @@ paddd m3, m1 add r6, 2*mmsize jl .loopx -%if mmsize==16 ; using HADDD would remove the mmsize/32 part from the - ; equation above, putting the width limit at 8208 +%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled + jz .no_overread + psubd m3, m1 +.no_overread: +%endif +%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the + ; equation above, putting the width limit at 8208 punpckhdq m0, m2, m6 punpckhdq m1, m3, m6 punpckldq m2, m6 @@ -539,9 +601,13 @@ jg .loopy mov r3, r6m mov r4, r7m -%if mmsize==16 - movq [r3], m4 - movhps [r4], m4 +%if mmsize == 32 + vextracti128 xm0, m4, 1 + paddq xm4, xm0 +%endif +%if mmsize >= 16 + movq [r3], xm4 + movhps [r4], xm4 %else ; fixup for mmx2 SBUTTERFLY dq, 4, 5, 0 mova m0, m4 @@ -569,7 +635,7 @@ ;----------------------------------------------------------------------------- %macro SSD_NV12 0 cglobal pixel_ssd_nv12_core, 6,7 - shl r4d, 1 + add r4d, r4d add r0, r4 add r2, r4 pxor m3, m3 @@ -579,10 +645,15 @@ mov r6, r4 neg r6 .loopx: - mova m0, [r0+r6] +%if mmsize == 32 ; only 16-byte alignment is guaranteed + movu m2, [r0+r6] + movu m1, [r2+r6] +%else + mova m2, [r0+r6] mova m1, [r2+r6] - psubusb m0, m1 - psubusb m1, [r0+r6] +%endif + psubusb m0, m2, m1 + psubusb m1, m2 por m0, m1 psrlw m2, m0, 8 pand m0, m5 @@ -592,19 +663,28 @@ paddd m4, m2 add r6, mmsize jl .loopx +%if mmsize == 
32 ; avx2 may overread by 16 bytes, that has to be handled + jz .no_overread + pcmpeqb xm1, xm1 + pandn m0, m1, m0 ; zero the lower half + pandn m2, m1, m2 + psubd m3, m0 + psubd m4, m2 +.no_overread: +%endif add r0, r1 add r2, r3 dec r5d jg .loopy mov r3, r6m mov r4, r7m - mova m5, [sq_0f] HADDD m3, m0 HADDD m4, m0 - pand m3, m5 - pand m4, m5 - movq [r3], m3 - movq [r4], m4 + pxor xm0, xm0 + punpckldq xm3, xm0 + punpckldq xm4, xm0 + movq [r3], xm3 + movq [r4], xm4 RET %endmacro ; SSD_NV12 %endif ; !HIGH_BIT_DEPTH @@ -615,6 +695,8 @@ SSD_NV12 INIT_XMM avx SSD_NV12 +INIT_YMM avx2 +SSD_NV12 ;============================================================================= ; variance @@ -626,7 +708,7 @@ %if HIGH_BIT_DEPTH == 0 %if %1 mova m7, [pw_00ff] -%else +%elif mmsize < 32 pxor m7, m7 ; zero %endif %endif ; !HIGH_BIT_DEPTH @@ -642,12 +724,13 @@ %else ; !HIGH_BIT_DEPTH HADDW m5, m2 %endif ; HIGH_BIT_DEPTH - movd eax, m5 HADDD m6, m1 - movd edx, m6 %if ARCH_X86_64 - shl rdx, 32 - add rax, rdx + punpckldq m5, m6 + movq rax, m5 +%else + movd eax, m5 + movd edx, m6 %endif RET %endmacro @@ -805,16 +888,45 @@ VAR INIT_XMM xop VAR + +INIT_YMM avx2 +cglobal pixel_var_16x16, 2,4,7 + VAR_START 0 + mov r2d, 4 + lea r3, [r1*3] +.loop: + pmovzxbw m0, [r0] + pmovzxbw m3, [r0+r1] + pmovzxbw m1, [r0+r1*2] + pmovzxbw m4, [r0+r3] + lea r0, [r0+r1*4] + VAR_CORE + dec r2d + jg .loop + vextracti128 xm0, m5, 1 + vextracti128 xm1, m6, 1 + paddw xm5, xm0 + paddd xm6, xm1 + HADDW xm5, xm2 + HADDD xm6, xm1 +%if ARCH_X86_64 + punpckldq xm5, xm6 + movq rax, xm5 +%else + movd eax, xm5 + movd edx, xm6 +%endif + RET %endif ; !HIGH_BIT_DEPTH -%macro VAR2_END 1 - HADDW m5, m7 - movd r1d, m5 +%macro VAR2_END 3 + HADDW %2, xm1 + movd r1d, %2 imul r1d, r1d - HADDD m6, m1 + HADDD %3, xm1 shr r1d, %1 - movd eax, m6 - mov [r4], eax + movd eax, %3 + movd [r4], %3 sub eax, r1d ; sqr - (sum * sum >> shift) RET %endmacro @@ -855,7 +967,7 @@ add r2, r3 dec r5d jg .loop - VAR2_END %2 + VAR2_END %2, m5, m6 %endmacro %if ARCH_X86_64 == 0 @@ -893,7 +1005,7 @@ lea r2, [r2+r3*2*SIZEOF_PIXEL] dec r5d jg .loop - VAR2_END %2 + VAR2_END %2, m5, m6 %endmacro INIT_XMM sse2 @@ -942,7 +1054,7 @@ lea r2, [r2+r3*2] dec r5d jg .loop - VAR2_END %2 + VAR2_END %2, m5, m6 %endmacro INIT_XMM ssse3 @@ -952,6 +1064,48 @@ VAR2_8x8_SSSE3 8, 6 VAR2_8x8_SSSE3 16, 7 +%macro VAR2_8x8_AVX2 2 +cglobal pixel_var2_8x%1, 5,6,6 + pxor m3, m3 ; sum + pxor m4, m4 ; sum squared + mova m5, [hsub_mul] + mov r5d, %1/4 +.loop: + movq xm0, [r0] + movq xm1, [r2] + vinserti128 m0, m0, [r0+r1], 1 + vinserti128 m1, m1, [r2+r3], 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + punpcklbw m0, m1 + movq xm1, [r0] + movq xm2, [r2] + vinserti128 m1, m1, [r0+r1], 1 + vinserti128 m2, m2, [r2+r3], 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + punpcklbw m1, m2 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + paddw m3, m0 + paddw m3, m1 + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m4, m0 + paddd m4, m1 + dec r5d + jg .loop + vextracti128 xm0, m3, 1 + vextracti128 xm1, m4, 1 + paddw xm3, xm0 + paddd xm4, xm1 + VAR2_END %2, xm3, xm4 +%endmacro + +INIT_YMM avx2 +VAR2_8x8_AVX2 8, 6 +VAR2_8x8_AVX2 16, 7 + %endif ; !HIGH_BIT_DEPTH ;============================================================================= @@ -962,7 +1116,7 @@ %if cpuflag(sse4) ; just use shufps on anything post conroe shufps %1, %2, 0 -%elif cpuflag(ssse3) +%elif cpuflag(ssse3) && notcpuflag(atom) ; join 2x 32 bit and duplicate them ; emulating shufps is faster on conroe punpcklqdq %1, %2 @@ -1023,7 +1177,7 @@ DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 
%endmacro -%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0 +%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] @@ -1059,6 +1213,52 @@ LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5 %endmacro +%macro LOAD_SUMSUB_16x2P_AVX2 9 +; 2*dst, 2*tmp, mul, 4*ptr + vbroadcasti128 m%1, [%6] + vbroadcasti128 m%3, [%7] + vbroadcasti128 m%2, [%8] + vbroadcasti128 m%4, [%9] + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] + LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3 + LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5 +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer + mova xm%3, %6 + mova xm%4, %8 + mova xm%1, %5 + mova xm%2, %7 + vpermq m%3, m%3, q0011 + vpermq m%4, m%4, q0011 + vpermq m%1, m%1, q0011 + vpermq m%2, m%2, q0011 +%endmacro + +%macro LOAD_SUMSUB8_16x2P_AVX2 9 +; 2*dst, 2*tmp, mul, 4*ptr + LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] + LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] + LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + ; in: r4=3*stride1, r5=3*stride2 ; in: %2 = horizontal offset ; in: %3 = whether we need to increment pix1 and pix2 @@ -1080,8 +1280,9 @@ SWAP %%n, 4 %endmacro +; in: %1 = horizontal if 0, vertical if 1 %macro SATD_8x4_SSE 8-9 -%ifidn %1, sse2 +%if %1 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax %else HADAMARD4_V %2, %3, %4, %5, %6 @@ -1095,7 +1296,7 @@ %else SWAP %8, %2 %endif -%ifidn %1, sse2 +%if %1 paddw m%8, m%4 %else HADAMARD 1, max, %3, %5, %6, %7 @@ -1250,21 +1451,43 @@ SATD_4x4_MMX m0, 0, 0 SATD_END_MMX -%macro SATD_START_SSE2 2 -%if cpuflag(ssse3) +%macro SATD_START_SSE2 2-3 0 + FIX_STRIDES r1, r3 +%if HIGH_BIT_DEPTH && %3 + pxor %2, %2 +%elif cpuflag(ssse3) && notcpuflag(atom) +%if mmsize==32 + mova %2, [hmul_16p] +%else mova %2, [hmul_8p] %endif +%endif lea r4, [3*r1] lea r5, [3*r3] pxor %1, %1 %endmacro -%macro SATD_END_SSE2 1 - HADDW %1, m7 +%macro SATD_END_SSE2 1-2 +%if HIGH_BIT_DEPTH + HADDUW %1, xm0 +%if %0 == 2 + paddd %1, %2 +%endif +%else + HADDW %1, xm7 +%endif movd eax, %1 RET %endmacro +%macro SATD_ACCUM 3 +%if HIGH_BIT_DEPTH + HADDUW %1, %2 + paddd %3, %1 + pxor %1, %1 +%endif +%endmacro + %macro BACKUP_POINTERS 0 %if ARCH_X86_64 %if WIN64 @@ -1277,20 +1500,44 @@ %macro RESTORE_AND_INC_POINTERS 0 %if ARCH_X86_64 - lea r0, [r6+8] - lea r2, [r7+8] + lea r0, [r6+8*SIZEOF_PIXEL] + lea r2, [r7+8*SIZEOF_PIXEL] %if WIN64 POP r7 %endif %else mov r0, r0mp mov r2, r2mp - add r0, 8 - add r2, 8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL %endif %endmacro -%macro SATD_4x8_SSE 2 +%macro SATD_4x8_SSE 3 +%if HIGH_BIT_DEPTH + movh m0, [r0+0*r1] + movh m4, [r2+0*r3] + movh m1, [r0+1*r1] + movh m5, [r2+1*r3] + movhps m0, [r0+4*r1] + movhps m4, [r2+4*r3] + movh m2, [r0+2*r1] + movh m6, [r2+2*r3] + psubw m0, m4 + movh m3, [r0+r4] + movh m4, [r2+r5] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + movhps m1, [r0+1*r1] + movhps m5, [r2+1*r3] + movhps m2, [r0+2*r1] + movhps m6, 
[r2+2*r3] + psubw m1, m5 + movhps m3, [r0+r4] + movhps m4, [r2+r5] + psubw m2, m6 + psubw m3, m4 +%else ; !HIGH_BIT_DEPTH movd m4, [r2] movd m5, [r2+r3] movd m6, [r2+2*r3] @@ -1307,7 +1554,7 @@ JDUP m5, m3 movd m3, [r0+2*r1] JDUP m1, m3 -%if cpuflag(ssse3) && %1==1 +%if %1==0 && %2==1 mova m3, [hmul_4p] DIFFOP 0, 4, 1, 5, 3 %else @@ -1325,20 +1572,23 @@ JDUP m5, m4 movd m4, [r0+r1] JDUP m3, m4 -%if cpuflag(ssse3) && %1==1 +%if %1==0 && %2==1 mova m4, [hmul_4p] DIFFOP 2, 6, 3, 5, 4 %else DIFFOP 2, 6, 3, 5, 7 %endif - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 7, %2 +%endif ; HIGH_BIT_DEPTH + SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3 %endmacro ;----------------------------------------------------------------------------- ; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SATDS_SSE2 0 -%if cpuflag(ssse3) +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) + +%if vertical==0 || HIGH_BIT_DEPTH cglobal pixel_satd_4x4, 4, 6, 6 SATD_START_MMX mova m4, [hmul_4p] @@ -1357,55 +1607,57 @@ cglobal pixel_satd_4x8, 4, 6, 8 SATD_START_MMX -%if cpuflag(ssse3) +%if vertical==0 mova m7, [hmul_4p] %endif - SATD_4x8_SSE 0, swap + SATD_4x8_SSE vertical, 0, swap HADDW m7, m1 movd eax, m7 RET cglobal pixel_satd_4x16, 4, 6, 8 SATD_START_MMX -%if cpuflag(ssse3) +%if vertical==0 mova m7, [hmul_4p] %endif - SATD_4x8_SSE 0, swap - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - SATD_4x8_SSE 1, add + SATD_4x8_SSE vertical, 0, swap + lea r0, [r0+r1*2*SIZEOF_PIXEL] + lea r2, [r2+r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add HADDW m7, m1 movd eax, m7 RET cglobal pixel_satd_8x8_internal - LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 %%pixel_satd_8x4_internal: - LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 ret -%if UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same +; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers) +; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge) +%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx) cglobal pixel_satd_16x4_internal LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11 lea r2, [r2+4*r3] lea r0, [r0+4*r1] - ; FIXME: this doesn't really mean ssse3, but rather selects between two different behaviors implemented with sse2? 
- SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10 - SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10 + ; always use horizontal mode here + SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10 + SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10 ret cglobal pixel_satd_16x8, 4,6,12 SATD_START_SSE2 m10, m7 -%if notcpuflag(ssse3) +%if vertical mova m7, [pw_00ff] %endif jmp %%pixel_satd_16x8_internal cglobal pixel_satd_16x16, 4,6,12 SATD_START_SSE2 m10, m7 -%if notcpuflag(ssse3) +%if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal @@ -1424,14 +1676,15 @@ SATD_END_SSE2 m6 cglobal pixel_satd_16x16, 4,6,8 - SATD_START_SSE2 m6, m7 + SATD_START_SSE2 m6, m7, 1 BACKUP_POINTERS call pixel_satd_8x8_internal call pixel_satd_8x8_internal + SATD_ACCUM m6, m0, m7 RESTORE_AND_INC_POINTERS call pixel_satd_8x8_internal call pixel_satd_8x8_internal - SATD_END_SSE2 m6 + SATD_END_SSE2 m6, m7 %endif cglobal pixel_satd_8x16, 4,6,8 @@ -1468,11 +1721,8 @@ %endmacro %macro SA8D 0 -%if HIGH_BIT_DEPTH - %define vertical 1 -%else ; sse2 doesn't seem to like the horizontal way of doing things - %define vertical (cpuflags == cpuflags_sse2) -%endif +; sse2 doesn't seem to like the horizontal way of doing things +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) %if ARCH_X86_64 ;----------------------------------------------------------------------------- @@ -1679,6 +1929,170 @@ %endmacro ; SA8D ;============================================================================= +; SA8D_SATD +;============================================================================= + +; %1: vertical/horizontal mode +; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9) +; m10: satd result +; m6, m11-15: tmp regs +%macro SA8D_SATD_8x4 5 +%if %1 + LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 + HADAMARD 0, sumsub, %2, %3, 6 + HADAMARD 0, sumsub, %4, %5, 6 + SBUTTERFLY wd, %2, %3, 6 + SBUTTERFLY wd, %4, %5, 6 + HADAMARD2_2D %2, %4, %3, %5, 6, dq + + mova m12, m%2 + mova m13, m%3 + mova m14, m%4 + mova m15, m%5 + HADAMARD 0, sumsub, %2, %3, 6 + HADAMARD 0, sumsub, %4, %5, 6 + SBUTTERFLY qdq, 12, 13, 6 + HADAMARD 0, amax, 12, 13, 6 + SBUTTERFLY qdq, 14, 15, 6 + paddw m10, m12 + HADAMARD 0, amax, 14, 15, 6 + paddw m10, m14 +%else + LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 + HADAMARD4_V %2, %3, %4, %5, 6 + + pabsw m12, m%2 ; doing the abs first is a slight advantage + pabsw m14, m%4 + pabsw m13, m%3 + pabsw m15, m%5 + HADAMARD 1, max, 12, 14, 6, 11 + paddw m10, m12 + HADAMARD 1, max, 13, 15, 6, 11 + paddw m10, m13 +%endif +%endmacro ; SA8D_SATD_8x4 + +; %1: add spilled regs? +; %2: spill regs? 
+%macro SA8D_SATD_ACCUM 2 +%if HIGH_BIT_DEPTH + pmaddwd m10, [pw_1] + HADDUWD m0, m1 +%if %1 + paddd m10, temp1 + paddd m0, temp0 +%endif +%if %2 + mova temp1, m10 + pxor m10, m10 +%endif +%elif %1 + paddw m0, temp0 +%endif +%if %2 + mova temp0, m0 +%endif +%endmacro + +%macro SA8D_SATD 0 +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) +cglobal pixel_sa8d_satd_8x8_internal + SA8D_SATD_8x4 vertical, 0, 1, 2, 3 + SA8D_SATD_8x4 vertical, 4, 5, 8, 9 + +%if vertical ; sse2-style + HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax + HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax +%else ; complete sa8d + SUMSUB_BADC w, 0, 4, 1, 5, 12 + HADAMARD 2, sumsub, 0, 4, 12, 11 + HADAMARD 2, sumsub, 1, 5, 12, 11 + SUMSUB_BADC w, 2, 8, 3, 9, 12 + HADAMARD 2, sumsub, 2, 8, 12, 11 + HADAMARD 2, sumsub, 3, 9, 12, 11 + HADAMARD 1, amax, 0, 4, 12, 11 + HADAMARD 1, amax, 1, 5, 12, 4 + HADAMARD 1, amax, 2, 8, 12, 4 + HADAMARD 1, amax, 3, 9, 12, 4 +%endif + + ; create sa8d sub results + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + + SAVE_MM_PERMUTATION + ret + +;------------------------------------------------------------------------------- +; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t ) +;------------------------------------------------------------------------------- +cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize + %define temp0 [rsp+0*mmsize] + %define temp1 [rsp+1*mmsize] + FIX_STRIDES r1, r3 +%if vertical==0 + mova m7, [hmul_8p] +%endif + lea r4, [3*r1] + lea r5, [3*r3] + pxor m10, m10 + +%if mmsize==32 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 0, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 0 + vextracti128 xm1, m0, 1 + vextracti128 xm2, m10, 1 + paddw xm0, xm1 + paddw xm10, xm2 +%else + lea r6, [r2+8*SIZEOF_PIXEL] + lea r7, [r0+8*SIZEOF_PIXEL] + + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 0, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 1 + + mov r0, r7 + mov r2, r6 + + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 0 +%endif + +; xop already has fast horizontal sums +%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0 + pmaddwd xm10, [pw_1] + HADDUWD xm0, xm1 + phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2 + pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1 + paddd xm0, xm1 ; sa8d sa8d satd satd + movd r0d, xm0 + pextrd eax, xm0, 2 +%else +%if HIGH_BIT_DEPTH + HADDD xm0, xm1 + HADDD xm10, xm2 +%else + HADDUW xm0, xm1 + HADDW xm10, xm2 +%endif + movd r0d, xm0 + movd eax, xm10 +%endif + add r0d, 1 + shl rax, 32 + shr r0d, 1 + or rax, r0 + RET +%endmacro ; SA8D_SATD + +;============================================================================= ; INTRA SATD ;============================================================================= @@ -1913,15 +2327,16 @@ ; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal intra_satd_x3_4x4, 3,3 -%if ARCH_X86_64 +%if UNIX64 ; stack is 16 byte aligned because abi says so %define top_1d rsp-8 ; size 8 %define left_1d rsp-16 ; size 8 %else - ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned - SUB esp, 16 - %define top_1d esp+8 - %define left_1d esp + ; WIN64: stack is 16 byte aligned because abi says so + ; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned + SUB rsp, 16 + %define 
top_1d rsp+8 + %define left_1d rsp %endif call hadamard_load @@ -1943,8 +2358,8 @@ movd [r2+0], m0 ; i4x4_v satd movd [r2+4], m4 ; i4x4_h satd movd [r2+8], m5 ; i4x4_dc satd -%if ARCH_X86_64 == 0 - ADD esp, 16 +%if UNIX64 == 0 + ADD rsp, 16 %endif RET @@ -2526,7 +2941,7 @@ psubw m1, m9 psubw m2, m10 psubw m3, m11 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 13, 14, 0, swap + SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap pmaddwd m0, [pw_1] %if cpuflag(sse4) pshufd m1, m0, q0032 @@ -2634,7 +3049,7 @@ psubw m2, [fenc_buf+0x20] .satd_8x4b: psubw m3, [fenc_buf+0x30] - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 0, swap + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap pmaddwd m0, [pw_1] %if cpuflag(sse4) pshufd m1, m0, q0032 @@ -3574,12 +3989,12 @@ %define spill2 m10 %else %define spill0 [rsp+gprsize] - %define spill1 [rsp+gprsize+16] - %define spill2 [rsp+gprsize+32] + %define spill1 [rsp+gprsize+mmsize] + %define spill2 [rsp+gprsize+mmsize*2] %endif %if HIGH_BIT_DEPTH %define vertical 1 -%elif cpuflag(ssse3) +%elif cpuflag(ssse3) && notcpuflag(atom) %define vertical 0 ;LOAD_INC loads sumsubs mova m7, [hmul_8p] @@ -3638,17 +4053,14 @@ AC_PADD m1, m2, [pw_1] ABSW m2, m7, m7 AC_PADD m1, m3, [pw_1] - mova m3, m7 AC_PADD m1, m2, [pw_1] - mova m2, m6 + paddw m3, m7, spill2 psubw m7, spill2 - paddw m3, spill2 - mova [rsp+gprsize+32], m1 ; save satd - mova m1, m5 + mova [rsp+gprsize+mmsize*2], m1 ; save satd + paddw m2, m6, spill1 psubw m6, spill1 - paddw m2, spill1 + paddw m1, m5, spill0 psubw m5, spill0 - paddw m1, spill0 %assign %%x 2 %if vertical %assign %%x 4 @@ -3672,15 +4084,17 @@ ABSW m0, m0, m7 AC_PADD m2, m4, [pw_1] AC_PADD m2, m0, [pw_1] - mova [rsp+gprsize+16], m2 ; save sa8d + mova [rsp+gprsize+mmsize], m2 ; save sa8d SWAP 0, 2 SAVE_MM_PERMUTATION ret HADAMARD_AC_WXH_SSE2 16, 16 -HADAMARD_AC_WXH_SSE2 8, 16 HADAMARD_AC_WXH_SSE2 16, 8 +%if mmsize <= 16 +HADAMARD_AC_WXH_SSE2 8, 16 HADAMARD_AC_WXH_SSE2 8, 8 +%endif %endmacro ; HADAMARD_AC_SSE2 %macro HADAMARD_AC_WXH_SUM_SSE2 2 @@ -3697,62 +4111,69 @@ paddd m1, [rsp+8*mmsize] psrld m0, 1 %endif - HADDD m0, m2 - HADDD m1, m3 + HADDD xm0, xm2 + HADDD xm1, xm3 %else ; !HIGH_BIT_DEPTH -%if %1*%2 >= 128 +%if %1*%2*16/mmsize >= 128 paddusw m0, [rsp+3*mmsize] paddusw m1, [rsp+4*mmsize] %endif -%if %1*%2 == 256 +%if %1*%2*16/mmsize == 256 paddusw m0, [rsp+5*mmsize] paddusw m1, [rsp+6*mmsize] paddusw m0, [rsp+7*mmsize] paddusw m1, [rsp+8*mmsize] psrlw m0, 1 %endif - HADDUW m0, m2 - HADDW m1, m3 +%if mmsize==32 + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + paddusw xm0, xm2 + paddusw xm1, xm3 +%endif + HADDUW xm0, xm2 + HADDW xm1, xm3 %endif ; HIGH_BIT_DEPTH %endmacro ; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride ) %macro HADAMARD_AC_WXH_SSE2 2 -cglobal pixel_hadamard_ac_%1x%2, 2,3,11 - %assign pad 16-gprsize-(stack_offset&15) +cglobal pixel_hadamard_ac_%1x%2, 2,4,11 %define ysub r1 FIX_STRIDES r1 - sub rsp, 48+pad - lea r2, [r1*3] + mov r3, rsp + and rsp, ~(mmsize-1) + sub rsp, mmsize*3 + lea r2, [r1*3] call hadamard_ac_8x8 %if %2==16 %define ysub r2 - lea r0, [r0+r1*4] - sub rsp, 32 + lea r0, [r0+r1*4] + sub rsp, mmsize*2 call hadamard_ac_8x8 %endif -%if %1==16 +%if %1==16 && mmsize <= 16 neg ysub - sub rsp, 32 - lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] + sub rsp, mmsize*2 + lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] neg ysub call hadamard_ac_8x8 %if %2==16 - lea r0, [r0+r1*4] - sub rsp, 32 + lea r0, [r0+r1*4] + sub rsp, mmsize*2 call hadamard_ac_8x8 %endif %endif HADAMARD_AC_WXH_SUM_SSE2 %1, %2 - movd edx, m0 - movd eax, m1 - shr edx, 2 - 
(%1*%2 >> 8) + movd edx, xm0 + movd eax, xm1 + shr edx, 2 - (%1*%2*16/mmsize >> 8) shr eax, 1 %if ARCH_X86_64 shl rdx, 32 add rax, rdx %endif - add rsp, 16+%1*%2/2+pad + mov rsp, r3 RET %endmacro ; HADAMARD_AC_WXH_SSE2 @@ -3775,6 +4196,9 @@ INIT_XMM sse2 SA8D SATDS_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif %if HIGH_BIT_DEPTH == 0 INTRA_SA8D_SSE2 %endif @@ -3783,6 +4207,16 @@ INIT_XMM sse2 HADAMARD_AC_SSE2 +%if HIGH_BIT_DEPTH == 0 +INIT_XMM ssse3,atom +SATDS_SSE2 +SA8D +HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif +%endif + %define DIFFOP DIFF_SUMSUB_SSSE3 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE %if HIGH_BIT_DEPTH == 0 @@ -3794,6 +4228,9 @@ SATDS_SSE2 SA8D HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif %if HIGH_BIT_DEPTH == 0 INTRA_X9 INTRA8_X9 @@ -3812,14 +4249,23 @@ SATDS_SSE2 SA8D HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif %if HIGH_BIT_DEPTH == 0 INTRA_X9 INTRA8_X9 %endif +; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so +; it's effectively free. +%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE INIT_XMM avx SATDS_SSE2 SA8D +%if ARCH_X86_64 +SA8D_SATD +%endif %if HIGH_BIT_DEPTH == 0 INTRA_X9 INTRA8_X9 @@ -3830,12 +4276,331 @@ INIT_XMM xop SATDS_SSE2 SA8D +%if ARCH_X86_64 +SA8D_SATD +%endif %if HIGH_BIT_DEPTH == 0 INTRA_X9 ; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why. %endif HADAMARD_AC_SSE2 + +%if HIGH_BIT_DEPTH == 0 +%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2 +%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2 +%define TRANS TRANS_SSE4 +INIT_YMM avx2 +HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif + +%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul] + movq xm%1, [r0] + movq xm%3, [r2] + movq xm%2, [r0+r1] + movq xm%4, [r2+r3] + vinserti128 m%1, m%1, [r0+4*r1], 1 + vinserti128 m%3, m%3, [r2+4*r3], 1 + vinserti128 m%2, m%2, [r0+r4], 1 + vinserti128 m%4, m%4, [r2+r5], 1 + punpcklqdq m%1, m%1 + punpcklqdq m%3, m%3 + punpcklqdq m%2, m%2 + punpcklqdq m%4, m%4 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + + movq xm%3, [r0] + movq xm%5, [r2] + movq xm%4, [r0+r1] + movq xm%6, [r2+r3] + vinserti128 m%3, m%3, [r0+4*r1], 1 + vinserti128 m%5, m%5, [r2+4*r3], 1 + vinserti128 m%4, m%4, [r0+r4], 1 + vinserti128 m%6, m%6, [r2+r5], 1 + punpcklqdq m%3, m%3 + punpcklqdq m%5, m%5 + punpcklqdq m%4, m%4 + punpcklqdq m%6, m%6 + DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7 +%endmacro + +%macro SATD_START_AVX2 2-3 0 + FIX_STRIDES r1, r3 +%if %3 + mova %2, [hmul_8p] + lea r4, [5*r1] + lea r5, [5*r3] +%else + mova %2, [hmul_16p] + lea r4, [3*r1] + lea r5, [3*r3] +%endif + pxor %1, %1 +%endmacro + +%define TRANS TRANS_SSE4 +INIT_YMM avx2 +cglobal pixel_satd_16x8_internal + LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_16x16, 4,6,8 + SATD_START_AVX2 m6, m7 + call pixel_satd_16x8_internal + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +pixel_satd_16x8_internal: + call pixel_satd_16x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_satd_16x8, 4,6,8 + SATD_START_AVX2 m6, m7 + jmp pixel_satd_16x8_internal + +cglobal pixel_satd_8x8_internal + LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_8x16, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_satd_8x8_internal + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call pixel_satd_8x8_internal + 
vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_satd_8x8, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_satd_8x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_sa8d_8x8_internal + LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 + HADAMARD4_V 0, 1, 2, 3, 4 + HADAMARD 8, sumsub, 0, 1, 4, 5 + HADAMARD 8, sumsub, 2, 3, 4, 5 + HADAMARD 2, sumsub, 0, 1, 4, 5 + HADAMARD 2, sumsub, 2, 3, 4, 5 + HADAMARD 1, amax, 0, 1, 4, 5 + HADAMARD 1, amax, 2, 3, 4, 5 + paddw m6, m0 + paddw m6, m2 + ret + +cglobal pixel_sa8d_8x8, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_sa8d_8x8_internal + vextracti128 xm1, m6, 1 + paddw xm6, xm1 + HADDW xm6, xm1 + movd eax, xm6 + add eax, 1 + shr eax, 1 + RET + +cglobal intra_sad_x9_8x8, 5,7,8 + %define pred(i,j) [rsp+i*0x40+j*0x20] + + mov r6, rsp + and rsp, ~31 + sub rsp, 0x240 + movu m5, [r0+0*FENC_STRIDE] + movu m6, [r0+4*FENC_STRIDE] + punpcklqdq m5, [r0+2*FENC_STRIDE] + punpcklqdq m6, [r0+6*FENC_STRIDE] + + ; save instruction size: avoid 4-byte memory offsets + lea r0, [intra8x9_h1+128] + %define off(m) (r0+m-(intra8x9_h1+128)) + + vpbroadcastq m0, [r2+16] + psadbw m4, m0, m5 + psadbw m2, m0, m6 + mova pred(0,0), m0 + mova pred(0,1), m0 + paddw m4, m2 + + vpbroadcastq m1, [r2+7] + pshufb m3, m1, [off(intra8x9_h1)] + pshufb m2, m1, [off(intra8x9_h3)] + mova pred(1,0), m3 + mova pred(1,1), m2 + psadbw m3, m5 + psadbw m2, m6 + paddw m3, m2 + + lea r5, [rsp+0x100] + %define pred(i,j) [r5+i*0x40+j*0x20-0x100] + + ; combine the first two + pslldq m3, 2 + por m4, m3 + + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + psrlw m0, 3 + pavgw m0, m2 + pshufb m0, m2 + mova pred(2,0), m0 + mova pred(2,1), m0 + psadbw m3, m0, m5 + psadbw m2, m0, m6 + paddw m3, m2 + + pslldq m3, 4 + por m4, m3 + + vbroadcasti128 m0, [r2+16] + vbroadcasti128 m2, [r2+17] + pslldq m1, m0, 1 + pavgb m3, m0, m2 + PRED4x4_LOWPASS m0, m1, m2, m0, m7 + pshufb m1, m0, [off(intra8x9_ddl1)] + pshufb m2, m0, [off(intra8x9_ddl3)] + mova pred(3,0), m1 + mova pred(3,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 6 + por m4, m1 + vextracti128 xm1, m4, 1 + paddw xm4, xm1 + mova [r4], xm4 + + ; for later + vinserti128 m7, m3, xm0, 1 + + vbroadcasti128 m2, [r2+8] + vbroadcasti128 m0, [r2+7] + vbroadcasti128 m1, [r2+6] + pavgb m3, m2, m0 + PRED4x4_LOWPASS m0, m1, m2, m0, m4 + pshufb m1, m0, [off(intra8x9_ddr1)] + pshufb m2, m0, [off(intra8x9_ddr3)] + mova pred(4,0), m1 + mova pred(4,1), m2 + psadbw m4, m1, m5 + psadbw m2, m6 + paddw m4, m2 + + add r0, 256 + add r5, 0xC0 + %define off(m) (r0+m-(intra8x9_h1+256+128)) + %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0] + + vpblendd m2, m3, m0, 11110011b + pshufb m1, m2, [off(intra8x9_vr1)] + pshufb m2, m2, [off(intra8x9_vr3)] + mova pred(5,0), m1 + mova pred(5,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 2 + por m4, m1 + + psrldq m2, m3, 4 + pblendw m2, m0, q3330 + punpcklbw m0, m3 + pshufb m1, m2, [off(intra8x9_hd1)] + pshufb m2, m0, [off(intra8x9_hd3)] + mova pred(6,0), m1 + mova pred(6,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 4 + por m4, m1 + + pshufb m1, m7, [off(intra8x9_vl1)] + pshufb m2, m7, [off(intra8x9_vl3)] + mova pred(7,0), m1 + mova pred(7,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 6 + por m4, m1 + vextracti128 xm1, m4, 1 + paddw xm4, xm1 + mova xm3, [r4] + SBUTTERFLY qdq, 3, 4, 7 + paddw xm3, xm4 + + pslldq m1, m0, 1 + vpbroadcastd m0, [r2+7] + palignr m0, m1, 1 + pshufb m1, 
m0, [off(intra8x9_hu1)] + pshufb m2, m0, [off(intra8x9_hu3)] + mova pred(8,0), m1 + mova pred(8,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + vextracti128 xm2, m1, 1 + paddw xm1, xm2 + movhlps xm2, xm1 + paddw xm1, xm2 + movd r2d, xm1 + + paddw xm3, [r3] + mova [r4], xm3 + add r2w, word [r3+16] + mov [r4+16], r2w + + phminposuw xm3, xm3 + movd r3d, xm3 + add r2d, 8<<16 + cmp r3w, r2w + cmovg r3d, r2d + + mov r2d, r3d + shr r3, 16 + shl r3, 6 + add r1, 4*FDEC_STRIDE + mova xm0, [rsp+r3+0x00] + mova xm1, [rsp+r3+0x10] + mova xm2, [rsp+r3+0x20] + mova xm3, [rsp+r3+0x30] + movq [r1+FDEC_STRIDE*-4], xm0 + movhps [r1+FDEC_STRIDE*-2], xm0 + movq [r1+FDEC_STRIDE*-3], xm1 + movhps [r1+FDEC_STRIDE*-1], xm1 + movq [r1+FDEC_STRIDE* 0], xm2 + movhps [r1+FDEC_STRIDE* 2], xm2 + movq [r1+FDEC_STRIDE* 1], xm3 + movhps [r1+FDEC_STRIDE* 3], xm3 + mov rsp, r6 + mov eax, r2d + RET +%endif ; HIGH_BIT_DEPTH + ;============================================================================= ; SSIM ;============================================================================= @@ -4074,13 +4839,13 @@ %macro ADS_START 0 %if UNIX64 - movsxd r5, r5d + movsxd r5, r5d %else - mov r5d, r5m + mov r5d, r5m %endif - mov r0d, r5d - lea r6, [r4+r5+15] - and r6, ~15; + mov r0d, r5d + lea r6, [r4+r5+(mmsize-1)] + and r6, ~(mmsize-1) shl r2d, 1 %endmacro @@ -4088,10 +4853,19 @@ add r1, 8*%1 add r3, 8*%1 add r6, 4*%1 - sub r0d, 4*%1 + sub r0d, 4*%1 jg .loop WIN64_RESTORE_XMM rsp - jmp ads_mvs +%if mmsize==32 + vzeroupper +%endif + lea r6, [r4+r5+(mmsize-1)] + and r6, ~(mmsize-1) +%if cpuflag(ssse3) + jmp ads_mvs_ssse3 +%else + jmp ads_mvs_mmx +%endif %endmacro ;----------------------------------------------------------------------------- @@ -4100,192 +4874,226 @@ ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal pixel_ads4, 5,7 - movq mm6, [r0] - movq mm4, [r0+8] - pshufw mm7, mm6, 0 - pshufw mm6, mm6, q2222 - pshufw mm5, mm4, 0 - pshufw mm4, mm4, q2222 + mova m6, [r0] + mova m4, [r0+8] + pshufw m7, m6, 0 + pshufw m6, m6, q2222 + pshufw m5, m4, 0 + pshufw m4, m4, q2222 ADS_START .loop: - movq mm0, [r1] - movq mm1, [r1+16] - psubw mm0, mm7 - psubw mm1, mm6 - ABSW mm0, mm0, mm2 - ABSW mm1, mm1, mm3 - movq mm2, [r1+r2] - movq mm3, [r1+r2+16] - psubw mm2, mm5 - psubw mm3, mm4 - paddw mm0, mm1 - ABSW mm2, mm2, mm1 - ABSW mm3, mm3, mm1 - paddw mm0, mm2 - paddw mm0, mm3 - pshufw mm1, r6m, 0 - paddusw mm0, [r3] - psubusw mm1, mm0 - packsswb mm1, mm1 - movd [r6], mm1 + movu m0, [r1] + movu m1, [r1+16] + psubw m0, m7 + psubw m1, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + movu m2, [r1+r2] + movu m3, [r1+r2+16] + psubw m2, m5 + psubw m3, m4 + paddw m0, m1 + ABSW m2, m2, m1 + ABSW m3, m3, m1 + paddw m0, m2 + paddw m0, m3 + pshufw m1, r6m, 0 + paddusw m0, [r3] + psubusw m1, m0 + packsswb m1, m1 + movd [r6], m1 ADS_END 1 cglobal pixel_ads2, 5,7 - movq mm6, [r0] - pshufw mm5, r6m, 0 - pshufw mm7, mm6, 0 - pshufw mm6, mm6, q2222 + mova m6, [r0] + pshufw m5, r6m, 0 + pshufw m7, m6, 0 + pshufw m6, m6, q2222 ADS_START .loop: - movq mm0, [r1] - movq mm1, [r1+r2] - psubw mm0, mm7 - psubw mm1, mm6 - ABSW mm0, mm0, mm2 - ABSW mm1, mm1, mm3 - paddw mm0, mm1 - paddusw mm0, [r3] - movq mm4, mm5 - psubusw mm4, mm0 - packsswb mm4, mm4 - movd [r6], mm4 + movu m0, [r1] + movu m1, [r1+r2] + psubw m0, m7 + psubw m1, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + paddw m0, m1 + paddusw m0, [r3] + mova m4, m5 + psubusw m4, m0 + packsswb m4, m4 + movd [r6], m4 ADS_END 1 cglobal pixel_ads1, 5,7 - pshufw mm7, [r0], 0 
- pshufw mm6, r6m, 0 + pshufw m7, [r0], 0 + pshufw m6, r6m, 0 ADS_START .loop: - movq mm0, [r1] - movq mm1, [r1+8] - psubw mm0, mm7 - psubw mm1, mm7 - ABSW mm0, mm0, mm2 - ABSW mm1, mm1, mm3 - paddusw mm0, [r3] - paddusw mm1, [r3+8] - movq mm4, mm6 - movq mm5, mm6 - psubusw mm4, mm0 - psubusw mm5, mm1 - packsswb mm4, mm5 - movq [r6], mm4 + movu m0, [r1] + movu m1, [r1+8] + psubw m0, m7 + psubw m1, m7 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + paddusw m0, [r3] + paddusw m1, [r3+8] + mova m4, m6 + mova m5, m6 + psubusw m4, m0 + psubusw m5, m1 + packsswb m4, m5 + mova [r6], m4 ADS_END 2 %macro ADS_XMM 0 +%if mmsize==32 +cglobal pixel_ads4, 5,7,8 + vpbroadcastw m7, [r0+ 0] + vpbroadcastw m6, [r0+ 4] + vpbroadcastw m5, [r0+ 8] + vpbroadcastw m4, [r0+12] +%else cglobal pixel_ads4, 5,7,12 - movdqa xmm4, [r0] - pshuflw xmm7, xmm4, 0 - pshuflw xmm6, xmm4, q2222 - pshufhw xmm5, xmm4, 0 - pshufhw xmm4, xmm4, q2222 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 - punpckhqdq xmm5, xmm5 - punpckhqdq xmm4, xmm4 -%if ARCH_X86_64 - pshuflw xmm8, r6m, 0 - punpcklqdq xmm8, xmm8 + mova m4, [r0] + pshuflw m7, m4, q0000 + pshuflw m6, m4, q2222 + pshufhw m5, m4, q0000 + pshufhw m4, m4, q2222 + punpcklqdq m7, m7 + punpcklqdq m6, m6 + punpckhqdq m5, m5 + punpckhqdq m4, m4 +%endif +%if ARCH_X86_64 && mmsize == 16 + movd m8, r6m + SPLATW m8, m8 ADS_START - movdqu xmm10, [r1] - movdqu xmm11, [r1+r2] + movu m10, [r1] + movu m11, [r1+r2] .loop: - psubw xmm0, xmm10, xmm7 - movdqu xmm10, [r1+16] - psubw xmm1, xmm10, xmm6 - ABSW xmm0, xmm0, xmm2 - ABSW xmm1, xmm1, xmm3 - psubw xmm2, xmm11, xmm5 - movdqu xmm11, [r1+r2+16] - paddw xmm0, xmm1 - psubw xmm3, xmm11, xmm4 - movdqu xmm9, [r3] - ABSW xmm2, xmm2, xmm1 - ABSW xmm3, xmm3, xmm1 - paddw xmm0, xmm2 - paddw xmm0, xmm3 - paddusw xmm0, xmm9 - psubusw xmm1, xmm8, xmm0 - packsswb xmm1, xmm1 - movq [r6], xmm1 + psubw m0, m10, m7 + movu m10, [r1+16] + psubw m1, m10, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + psubw m2, m11, m5 + movu m11, [r1+r2+16] + paddw m0, m1 + psubw m3, m11, m4 + movu m9, [r3] + ABSW m2, m2, m1 + ABSW m3, m3, m1 + paddw m0, m2 + paddw m0, m3 + paddusw m0, m9 + psubusw m1, m8, m0 %else ADS_START .loop: - movdqu xmm0, [r1] - movdqu xmm1, [r1+16] - psubw xmm0, xmm7 - psubw xmm1, xmm6 - ABSW xmm0, xmm0, xmm2 - ABSW xmm1, xmm1, xmm3 - movdqu xmm2, [r1+r2] - movdqu xmm3, [r1+r2+16] - psubw xmm2, xmm5 - psubw xmm3, xmm4 - paddw xmm0, xmm1 - ABSW xmm2, xmm2, xmm1 - ABSW xmm3, xmm3, xmm1 - paddw xmm0, xmm2 - paddw xmm0, xmm3 - movd xmm1, r6m - movdqu xmm2, [r3] - pshuflw xmm1, xmm1, 0 - punpcklqdq xmm1, xmm1 - paddusw xmm0, xmm2 - psubusw xmm1, xmm0 - packsswb xmm1, xmm1 - movq [r6], xmm1 + movu m0, [r1] + movu m1, [r1+16] + psubw m0, m7 + psubw m1, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + movu m2, [r1+r2] + movu m3, [r1+r2+16] + psubw m2, m5 + psubw m3, m4 + paddw m0, m1 + ABSW m2, m2, m1 + ABSW m3, m3, m1 + paddw m0, m2 + paddw m0, m3 + movu m2, [r3] +%if mmsize==32 + vpbroadcastw m1, r6m +%else + movd m1, r6m + pshuflw m1, m1, 0 + punpcklqdq m1, m1 +%endif + paddusw m0, m2 + psubusw m1, m0 %endif ; ARCH - ADS_END 2 + packsswb m1, m1 +%if mmsize==32 + vpermq m1, m1, q3120 + mova [r6], xm1 +%else + movh [r6], m1 +%endif + ADS_END mmsize/8 cglobal pixel_ads2, 5,7,8 - movq xmm6, [r0] - movd xmm5, r6m - pshuflw xmm7, xmm6, 0 - pshuflw xmm6, xmm6, q2222 - pshuflw xmm5, xmm5, 0 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 - punpcklqdq xmm5, xmm5 +%if mmsize==32 + vpbroadcastw m7, [r0+0] + vpbroadcastw m6, [r0+4] + vpbroadcastw m5, r6m +%else + movq m6, [r0] + movd m5, 
r6m + pshuflw m7, m6, 0 + pshuflw m6, m6, q2222 + pshuflw m5, m5, 0 + punpcklqdq m7, m7 + punpcklqdq m6, m6 + punpcklqdq m5, m5 +%endif ADS_START .loop: - movdqu xmm0, [r1] - movdqu xmm1, [r1+r2] - psubw xmm0, xmm7 - psubw xmm1, xmm6 - movdqu xmm4, [r3] - ABSW xmm0, xmm0, xmm2 - ABSW xmm1, xmm1, xmm3 - paddw xmm0, xmm1 - paddusw xmm0, xmm4 - psubusw xmm1, xmm5, xmm0 - packsswb xmm1, xmm1 - movq [r6], xmm1 - ADS_END 2 + movu m0, [r1] + movu m1, [r1+r2] + psubw m0, m7 + psubw m1, m6 + movu m4, [r3] + ABSW m0, m0, m2 + ABSW m1, m1, m3 + paddw m0, m1 + paddusw m0, m4 + psubusw m1, m5, m0 + packsswb m1, m1 +%if mmsize==32 + vpermq m1, m1, q3120 + mova [r6], xm1 +%else + movh [r6], m1 +%endif + ADS_END mmsize/8 cglobal pixel_ads1, 5,7,8 - movd xmm7, [r0] - movd xmm6, r6m - pshuflw xmm7, xmm7, 0 - pshuflw xmm6, xmm6, 0 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 +%if mmsize==32 + vpbroadcastw m7, [r0] + vpbroadcastw m6, r6m +%else + movd m7, [r0] + movd m6, r6m + pshuflw m7, m7, 0 + pshuflw m6, m6, 0 + punpcklqdq m7, m7 + punpcklqdq m6, m6 +%endif ADS_START .loop: - movdqu xmm0, [r1] - movdqu xmm1, [r1+16] - psubw xmm0, xmm7 - psubw xmm1, xmm7 - movdqu xmm2, [r3] - movdqu xmm3, [r3+16] - ABSW xmm0, xmm0, xmm4 - ABSW xmm1, xmm1, xmm5 - paddusw xmm0, xmm2 - paddusw xmm1, xmm3 - psubusw xmm4, xmm6, xmm0 - psubusw xmm5, xmm6, xmm1 - packsswb xmm4, xmm5 - movdqa [r6], xmm4 - ADS_END 4 + movu m0, [r1] + movu m1, [r1+mmsize] + psubw m0, m7 + psubw m1, m7 + movu m2, [r3] + movu m3, [r3+mmsize] + ABSW m0, m0, m4 + ABSW m1, m1, m5 + paddusw m0, m2 + paddusw m1, m3 + psubusw m4, m6, m0 + psubusw m5, m6, m1 + packsswb m4, m5 +%if mmsize==32 + vpermq m4, m4, q3120 +%endif + mova [r6], m4 + ADS_END mmsize/4 %endmacro INIT_XMM sse2 @@ -4294,6 +5102,8 @@ ADS_XMM INIT_XMM avx ADS_XMM +INIT_YMM avx2 +ADS_XMM ; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width ) ; { @@ -4318,11 +5128,9 @@ inc r1d %endmacro -INIT_MMX +INIT_MMX mmx cglobal pixel_ads_mvs, 0,7,0 -ads_mvs: - lea r6, [r4+r5+15] - and r6, ~15; +ads_mvs_mmx: ; mvs = r4 ; masks = r6 ; width = r5 @@ -4364,3 +5172,36 @@ .end: movifnidn eax, r0d RET + +INIT_XMM ssse3 +cglobal pixel_ads_mvs, 0,7,0 +ads_mvs_ssse3: + mova m3, [pw_8] + mova m4, [pw_76543210] + pxor m5, m5 + add r5, r6 + xor r0d, r0d ; nmv + mov [r5], r0d +%ifdef PIC + lea r1, [$$] + %define GLOBAL +r1-$$ +%else + %define GLOBAL +%endif +.loop: + movh m0, [r6] + pcmpeqb m0, m5 + pmovmskb r2d, m0 + xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions) + movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt + add r2d, r2d + ; shuffle counters based on mv mask + pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL] + movu [r4+r0*2], m2 + add r0d, r3d + paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7} + add r6, 8 + cmp r6, r5 + jl .loop + movifnidn eax, r0d + RET
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/pixel.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/pixel.h
Changed
@@ -52,10 +52,12 @@ DECL_X1( sad, sse2_aligned ) DECL_X1( sad, ssse3 ) DECL_X1( sad, ssse3_aligned ) +DECL_X1( sad, avx2 ) DECL_X4( sad, mmx2 ) DECL_X4( sad, sse2 ) DECL_X4( sad, sse3 ) DECL_X4( sad, ssse3 ) +DECL_X4( sad, avx2 ) DECL_X1( ssd, mmx ) DECL_X1( ssd, mmx2 ) DECL_X1( ssd, sse2slow ) @@ -63,18 +65,23 @@ DECL_X1( ssd, ssse3 ) DECL_X1( ssd, avx ) DECL_X1( ssd, xop ) +DECL_X1( ssd, avx2 ) DECL_X1( satd, mmx2 ) DECL_X1( satd, sse2 ) DECL_X1( satd, ssse3 ) +DECL_X1( satd, ssse3_atom ) DECL_X1( satd, sse4 ) DECL_X1( satd, avx ) DECL_X1( satd, xop ) +DECL_X1( satd, avx2 ) DECL_X1( sa8d, mmx2 ) DECL_X1( sa8d, sse2 ) DECL_X1( sa8d, ssse3 ) +DECL_X1( sa8d, ssse3_atom ) DECL_X1( sa8d, sse4 ) DECL_X1( sa8d, avx ) DECL_X1( sa8d, xop ) +DECL_X1( sa8d, avx2 ) DECL_X1( sad, cache32_mmx2 ); DECL_X1( sad, cache64_mmx2 ); DECL_X1( sad, cache64_sse2 ); @@ -88,12 +95,15 @@ DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, ssse3_atom, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, avx2, ( pixel *pix, intptr_t i_stride )) void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * ); @@ -106,16 +116,19 @@ void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8c_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8c_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8c_avx2 ( pixel *, pixel *, int * ); void x264_intra_satd_x3_16x16_mmx2 ( pixel *, pixel *, int * ); void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * ); void x264_intra_sad_x3_16x16_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_16x16_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_16x16_avx2 ( pixel *, pixel *, int * ); void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * ); void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8_avx2 ( uint16_t*, uint16_t*, int * ); int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * ); @@ -129,6 +142,7 @@ int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); +int x264_intra_sad_x9_8x8_avx2 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, 
intptr_t stride2, int width, @@ -139,6 +153,9 @@ void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); +void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1, + pixel *pixuv2, intptr_t stride2, int width, + int height, uint64_t *ssd_u, uint64_t *ssd_v ); void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, intptr_t stride1, const uint8_t *pix2, intptr_t stride2, int sums[2][4] ); void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, intptr_t stride1, @@ -151,17 +168,28 @@ int x264_pixel_var2_8x8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_var2_8x8_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x8_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_var2_8x16_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x16_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_xop ( pixel *src, intptr_t stride, int height ); +int x264_pixel_vsad_avx2 ( uint16_t *src, intptr_t stride, int height ); int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); int x264_pixel_asd8_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); +uint64_t x264_pixel_sa8d_satd_16x16_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_ssse3 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_ssse3_atom( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_avx ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_avx2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); + #define DECL_ADS( size, suffix ) \ int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\ @@ -178,6 +206,9 @@ DECL_ADS( 4, avx ) DECL_ADS( 2, avx ) DECL_ADS( 1, avx ) +DECL_ADS( 4, avx2 ) +DECL_ADS( 2, avx2 ) +DECL_ADS( 1, avx2 ) #undef DECL_PIXELS #undef DECL_X1
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/predict-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/predict-a.asm
Changed
@@ -6,6 +6,7 @@ ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org> ;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -28,13 +29,12 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 -pw_76543210: -pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7 -pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4 -pw_m3: times 8 dw -3 -pw_m7: times 8 dw -7 +pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4 +pw_m3: times 16 dw -3 +pw_m7: times 16 dw -7 pb_00s_ff: times 8 db 0 pb_0s_ff: times 7 db 0 db 0xff @@ -57,109 +57,106 @@ cextern pw_00ff cextern pw_pixel_max -%macro STORE8x8 2-4 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %2 - mova [r0 + 1*FDEC_STRIDEB], %2 - mova [r0 + 2*FDEC_STRIDEB], %2 - mova [r0 + 3*FDEC_STRIDEB], %2 +%macro STORE8 1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + mova [r0+2*FDEC_STRIDEB], %1 + mova [r0+3*FDEC_STRIDEB], %1 %endmacro -%macro STORE8x16 4 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %2 - mova [r0 + -3*FDEC_STRIDEB], %2 - mova [r0 + -2*FDEC_STRIDEB], %2 - mova [r0 + -1*FDEC_STRIDEB], %2 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %3 - mova [r0 + -3*FDEC_STRIDEB], %3 - mova [r0 + -2*FDEC_STRIDEB], %3 - mova [r0 + -1*FDEC_STRIDEB], %3 - mova [r0 + 0*FDEC_STRIDEB], %4 - mova [r0 + 1*FDEC_STRIDEB], %4 - mova [r0 + 2*FDEC_STRIDEB], %4 - mova [r0 + 3*FDEC_STRIDEB], %4 +%macro STORE16 1-4 +%if %0 > 1 + mov r1d, 2*%0 +.loop: + mova [r0+0*FDEC_STRIDEB+0*mmsize], %1 + mova [r0+0*FDEC_STRIDEB+1*mmsize], %2 + mova [r0+1*FDEC_STRIDEB+0*mmsize], %1 + mova [r0+1*FDEC_STRIDEB+1*mmsize], %2 +%ifidn %0, 4 + mova [r0+0*FDEC_STRIDEB+2*mmsize], %3 + mova [r0+0*FDEC_STRIDEB+3*mmsize], %4 + mova [r0+1*FDEC_STRIDEB+2*mmsize], %3 + mova [r0+1*FDEC_STRIDEB+3*mmsize], %4 + add r0, 2*FDEC_STRIDEB +%else ; %0 == 2 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB+0*mmsize], %1 + mova [r0-2*FDEC_STRIDEB+1*mmsize], %2 + mova [r0-1*FDEC_STRIDEB+0*mmsize], %1 + mova [r0-1*FDEC_STRIDEB+1*mmsize], %2 +%endif + dec r1d + jg .loop +%else ; %0 == 1 + STORE8 %1 +%if HIGH_BIT_DEPTH ; Different code paths to reduce code size + add r0, 6*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 +%else + add r0, 8*FDEC_STRIDE + mova [r0-4*FDEC_STRIDE], %1 + mova [r0-3*FDEC_STRIDE], %1 + mova [r0-2*FDEC_STRIDE], %1 + mova [r0-1*FDEC_STRIDE], %1 + mova [r0+0*FDEC_STRIDE], %1 + mova [r0+1*FDEC_STRIDE], %1 + mova [r0+2*FDEC_STRIDE], %1 + mova [r0+3*FDEC_STRIDE], %1 +%endif ; HIGH_BIT_DEPTH +%endif %endmacro -%macro STORE16x16 2-4 -%ifidn %0, 4 - mov r1d, 8 -.loop: - mova [r0 + 0*FDEC_STRIDEB + 0], %1 - 
mova [r0 + 1*FDEC_STRIDEB + 0], %1 - mova [r0 + 0*FDEC_STRIDEB + 8], %2 - mova [r0 + 1*FDEC_STRIDEB + 8], %2 - mova [r0 + 0*FDEC_STRIDEB +16], %3 - mova [r0 + 1*FDEC_STRIDEB +16], %3 - mova [r0 + 0*FDEC_STRIDEB +24], %4 - mova [r0 + 1*FDEC_STRIDEB +24], %4 - add r0, 2*FDEC_STRIDEB - dec r1d - jg .loop +%macro PRED_H_LOAD 2 ; reg, offset +%if cpuflag(avx2) + vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL] +%elif HIGH_BIT_DEPTH + movd %1, [r0+(%2)*FDEC_STRIDEB-4] + SPLATW %1, %1, 1 %else - mov r1d, 4 -.loop: - mova [r0 + 0*FDEC_STRIDE], %1 - mova [r0 + 1*FDEC_STRIDE], %1 - mova [r0 + 2*FDEC_STRIDE], %1 - mova [r0 + 3*FDEC_STRIDE], %1 - mova [r0 + 0*FDEC_STRIDE + 8], %2 - mova [r0 + 1*FDEC_STRIDE + 8], %2 - mova [r0 + 2*FDEC_STRIDE + 8], %2 - mova [r0 + 3*FDEC_STRIDE + 8], %2 - add r0, 4*FDEC_STRIDE - dec r1d - jg .loop + SPLATB_LOAD %1, r0+(%2)*FDEC_STRIDE-1, m2 %endif %endmacro -%macro STORE16x16_SSE2 1-2 -%ifidn %0,2 - mov r1d, 4 -.loop - mova [r0+0*FDEC_STRIDEB+ 0], %1 - mova [r0+0*FDEC_STRIDEB+16], %2 - mova [r0+1*FDEC_STRIDEB+ 0], %1 - mova [r0+1*FDEC_STRIDEB+16], %2 - mova [r0+2*FDEC_STRIDEB+ 0], %1 - mova [r0+2*FDEC_STRIDEB+16], %2 - mova [r0+3*FDEC_STRIDEB+ 0], %1 - mova [r0+3*FDEC_STRIDEB+16], %2 - add r0, 4*FDEC_STRIDEB - dec r1d - jg .loop +%macro PRED_H_STORE 3 ; reg, offset, width +%assign %%w %3*SIZEOF_PIXEL +%if %%w == 8 + movq [r0+(%2)*FDEC_STRIDEB], %1 %else - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %1 - mova [r0 + 1*FDEC_STRIDEB], %1 - mova [r0 + 2*FDEC_STRIDEB], %1 - mova [r0 + 3*FDEC_STRIDEB], %1 - add r0, 8*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %1 - mova [r0 + 1*FDEC_STRIDEB], %1 - mova [r0 + 2*FDEC_STRIDEB], %1 - mova [r0 + 3*FDEC_STRIDEB], %1 + %assign %%i 0 + %rep %%w/mmsize + mova [r0+(%2)*FDEC_STRIDEB+%%i], %1 + %assign %%i %%i+mmsize + %endrep %endif %endmacro +%macro PRED_H_4ROWS 2 ; width, inc_ptr + PRED_H_LOAD m0, 0 + PRED_H_LOAD m1, 1 + PRED_H_STORE m0, 0, %1 + PRED_H_STORE m1, 1, %1 + PRED_H_LOAD m0, 2 +%if %2 + add r0, 4*FDEC_STRIDEB +%endif + PRED_H_LOAD m1, 3-4*%2 + PRED_H_STORE m0, 2-4*%2, %1 + PRED_H_STORE m1, 3-4*%2, %1 +%endmacro + ; dest, left, right, src, tmp ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 %macro PRED8x8_LOWPASS 4-5 @@ -178,6 +175,16 @@ %endmacro ;----------------------------------------------------------------------------- +; void predict_4x4_h( pixel *src ) +;----------------------------------------------------------------------------- +%if HIGH_BIT_DEPTH +INIT_XMM avx2 +cglobal predict_4x4_h, 1,1 + PRED_H_4ROWS 4, 0 + RET +%endif + +;----------------------------------------------------------------------------- ; void predict_4x4_ddl( pixel *src ) ;----------------------------------------------------------------------------- %macro PREDICT_4x4_DDL 0 @@ -755,12 +762,12 @@ %macro PREDICT_8x8_V 0 cglobal predict_8x8_v, 2,2 mova m0, [r1+16*SIZEOF_PIXEL] - STORE8x8 m0, m0 + STORE8 m0 RET %endmacro %if HIGH_BIT_DEPTH -INIT_XMM sse2 +INIT_XMM sse PREDICT_8x8_V %else INIT_MMX mmx2 @@ -806,7 +813,7 @@ paddw m0, [pw_8] psrlw m0, 4 SPLATW m0, m0 - STORE8x8 m0, m0 + STORE8 m0 RET %else ; !HIGH_BIT_DEPTH @@ -821,7 +828,7 @@ psrlw mm0, 4 pshufw mm0, mm0, 0 packuswb mm0, mm0 - STORE8x8 mm0, mm0 + STORE8 mm0 RET %endif ; HIGH_BIT_DEPTH @@ -837,7 +844,7 
@@ paddw m0, [pw_4] psrlw m0, 3 SPLATW m0, m0 - STORE8x8 m0, m0 + STORE8 m0 RET %endmacro INIT_XMM sse2 @@ -853,7 +860,7 @@ psrlw mm0, 3 pshufw mm0, mm0, 0 packuswb mm0, mm0 - STORE8x8 mm0, mm0 + STORE8 mm0 RET %endmacro INIT_MMX @@ -1062,17 +1069,21 @@ %endif %macro LOAD_PLANE_ARGS 0 -%if ARCH_X86_64 - movd mm0, r1d - movd mm2, r2d - movd mm4, r3d - pshufw mm0, mm0, 0 - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 +%if cpuflag(avx2) && ARCH_X86_64 == 0 + vpbroadcastw m0, r1m + vpbroadcastw m2, r2m + vpbroadcastw m4, r3m +%elif mmsize == 8 ; MMX is only used on x86_32 + SPLATW m0, r1m + SPLATW m2, r2m + SPLATW m4, r3m %else - pshufw mm0, r1m, 0 - pshufw mm2, r2m, 0 - pshufw mm4, r3m, 0 + movd xm0, r1m + movd xm2, r2m + movd xm4, r3m + SPLATW m0, xm0 + SPLATW m2, xm2 + SPLATW m4, xm4 %endif %endmacro @@ -1084,7 +1095,7 @@ cglobal predict_8x%1c_p_core, 1,2 LOAD_PLANE_ARGS movq m1, m2 - pmullw m2, [pw_3210] + pmullw m2, [pw_0to15] psllw m1, 2 paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b} paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b} @@ -1111,17 +1122,12 @@ PREDICT_CHROMA_P_MMX 16 %endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH -%macro PREDICT_CHROMA_P_XMM 1 +%macro PREDICT_CHROMA_P 1 %if HIGH_BIT_DEPTH cglobal predict_8x%1c_p_core, 1,2,7 - movd m0, r1m - movd m2, r2m - movd m4, r3m + LOAD_PLANE_ARGS mova m3, [pw_pixel_max] pxor m1, m1 - SPLATW m0, m0, 0 - SPLATW m2, m2, 0 - SPLATW m4, m4, 0 pmullw m2, [pw_43210123] ; b %if %1 == 16 pmullw m5, m4, [pw_m7] ; c @@ -1129,70 +1135,88 @@ pmullw m5, m4, [pw_m3] %endif paddw m5, [pw_16] - mov r1d, %1 +%if mmsize == 32 + mova xm6, xm4 + paddw m4, m4 + paddw m5, m6 +%endif + mov r1d, %1/(mmsize/16) .loop: paddsw m6, m2, m5 paddsw m6, m0 psraw m6, 5 CLIPW m6, m1, m3 - mova [r0], m6 paddw m5, m4 +%if mmsize == 32 + vextracti128 [r0], m6, 1 + mova [r0+FDEC_STRIDEB], xm6 + add r0, 2*FDEC_STRIDEB +%else + mova [r0], m6 add r0, FDEC_STRIDEB +%endif dec r1d jg .loop RET %else ; !HIGH_BIT_DEPTH cglobal predict_8x%1c_p_core, 1,2 - movd m0, r1m - movd m2, r2m - movd m4, r3m - SPLATW m0, m0, 0 - SPLATW m2, m2, 0 - SPLATW m4, m4, 0 - pmullw m2, [pw_76543210] - paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b} - paddsw m3, m0, m4 + LOAD_PLANE_ARGS +%if mmsize == 32 + vbroadcasti128 m1, [pw_0to15] ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 + pmullw m2, m1 + mova xm1, xm4 ; zero upper half paddsw m4, m4 - mov r1d, %1/4 + paddsw m0, m1 +%else + pmullw m2, [pw_0to15] +%endif + paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b} + paddsw m1, m0, m4 + paddsw m4, m4 + mov r1d, %1/(mmsize/8) .loop: - paddsw m1, m3, m4 - paddsw m5, m0, m4 - psraw m3, 5 - psraw m0, 5 - packuswb m0, m3 - movq [r0+FDEC_STRIDE*0], m0 - movhps [r0+FDEC_STRIDE*1], m0 - paddsw m0, m5, m4 - paddsw m3, m1, m4 - psraw m5, 5 - psraw m1, 5 - packuswb m5, m1 - movq [r0+FDEC_STRIDE*2], m5 - movhps [r0+FDEC_STRIDE*3], m5 - add r0, FDEC_STRIDE*4 + psraw m2, m0, 5 + psraw m3, m1, 5 + paddsw m0, m4 + paddsw m1, m4 + packuswb m2, m3 +%if mmsize == 32 + movq [r0+FDEC_STRIDE*1], xm2 + movhps [r0+FDEC_STRIDE*3], xm2 + vextracti128 xm2, m2, 1 + movq [r0+FDEC_STRIDE*0], xm2 + movhps [r0+FDEC_STRIDE*2], xm2 +%else + movq [r0+FDEC_STRIDE*0], xm2 + movhps [r0+FDEC_STRIDE*1], xm2 +%endif + add r0, FDEC_STRIDE*mmsize/8 dec r1d jg .loop RET %endif ; HIGH_BIT_DEPTH -%endmacro ; PREDICT_CHROMA_P_XMM +%endmacro ; PREDICT_CHROMA_P INIT_XMM sse2 -PREDICT_CHROMA_P_XMM 8 -PREDICT_CHROMA_P_XMM 16 +PREDICT_CHROMA_P 8 +PREDICT_CHROMA_P 16 INIT_XMM avx -PREDICT_CHROMA_P_XMM 8 
-PREDICT_CHROMA_P_XMM 16 +PREDICT_CHROMA_P 8 +PREDICT_CHROMA_P 16 +INIT_YMM avx2 +PREDICT_CHROMA_P 8 +PREDICT_CHROMA_P 16 ;----------------------------------------------------------------------------- ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- -%if ARCH_X86_64 == 0 +%if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0 INIT_MMX mmx2 cglobal predict_16x16_p_core, 1,2 LOAD_PLANE_ARGS movq mm5, mm2 movq mm1, mm2 - pmullw mm5, [pw_3210] + pmullw mm5, [pw_0to15] psllw mm2, 3 psllw mm1, 2 movq mm3, mm2 @@ -1226,7 +1250,7 @@ dec r1d jg .loop RET -%endif ; !ARCH_X86_64 +%endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64 %macro PREDICT_16x16_P 0 cglobal predict_16x16_p_core, 1,2,8 @@ -1236,7 +1260,7 @@ SPLATW m0, m0, 0 SPLATW m1, m1, 0 SPLATW m2, m2, 0 - pmullw m3, m1, [pw_76543210] + pmullw m3, m1, [pw_0to15] psllw m1, 3 %if HIGH_BIT_DEPTH pxor m6, m6 @@ -1257,8 +1281,6 @@ mova [r0+16], m5 add r0, FDEC_STRIDEB paddw m6, m2 - dec r1d - jg .loop %else ; !HIGH_BIT_DEPTH paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b} @@ -1279,9 +1301,9 @@ paddsw m0, m7 paddsw m1, m7 add r0, FDEC_STRIDE*2 - dec r1d - jg .loop %endif ; !HIGH_BIT_DEPTH + dec r1d + jg .loop RET %endmacro ; PREDICT_16x16_P @@ -1292,6 +1314,60 @@ PREDICT_16x16_P %endif +INIT_YMM avx2 +cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH + LOAD_PLANE_ARGS +%if HIGH_BIT_DEPTH + pmullw m2, [pw_0to15] + pxor m5, m5 + pxor m6, m6 + mova m7, [pw_pixel_max] + mov r1d, 8 +.loop: + paddsw m1, m2, m5 + paddw m5, m4 + paddsw m1, m0 + paddsw m3, m2, m5 + psraw m1, 5 + paddsw m3, m0 + psraw m3, 5 + CLIPW m1, m6, m7 + mova [r0+0*FDEC_STRIDEB], m1 + CLIPW m3, m6, m7 + mova [r0+1*FDEC_STRIDEB], m3 + paddw m5, m4 + add r0, 2*FDEC_STRIDEB +%else ; !HIGH_BIT_DEPTH + vbroadcasti128 m1, [pw_0to15] + mova xm3, xm4 ; zero high bits + pmullw m1, m2 + psllw m2, 3 + paddsw m0, m3 + paddsw m0, m1 ; X+1*C X+0*C + paddsw m1, m0, m2 ; Y+1*C Y+0*C + paddsw m4, m4 + mov r1d, 4 +.loop: + psraw m2, m0, 5 + psraw m3, m1, 5 + paddsw m0, m4 + paddsw m1, m4 + packuswb m2, m3 ; X+1*C Y+1*C X+0*C Y+0*C + vextracti128 [r0+0*FDEC_STRIDE], m2, 1 + mova [r0+1*FDEC_STRIDE], xm2 + psraw m2, m0, 5 + psraw m3, m1, 5 + paddsw m0, m4 + paddsw m1, m4 + packuswb m2, m3 ; X+3*C Y+3*C X+2*C Y+2*C + vextracti128 [r0+2*FDEC_STRIDE], m2, 1 + mova [r0+3*FDEC_STRIDE], xm2 + add r0, FDEC_STRIDE*4 +%endif ; !HIGH_BIT_DEPTH + dec r1d + jg .loop + RET + %if HIGH_BIT_DEPTH == 0 %macro PREDICT_8x8 0 ;----------------------------------------------------------------------------- @@ -1625,12 +1701,12 @@ %macro PREDICT_8x8C_V 0 cglobal predict_8x8c_v, 1,1 mova m0, [r0 - FDEC_STRIDEB] - STORE8x8 m0, m0 + STORE8 m0 RET %endmacro %if HIGH_BIT_DEPTH -INIT_XMM sse2 +INIT_XMM sse PREDICT_8x8C_V %else INIT_MMX mmx @@ -1659,12 +1735,12 @@ %macro PREDICT_8x16C_V 0 cglobal predict_8x16c_v, 1,1 mova m0, [r0 - FDEC_STRIDEB] - STORE8x16 m0, m0, m0, m0 + STORE16 m0 RET %endmacro %if HIGH_BIT_DEPTH -INIT_XMM sse2 +INIT_XMM sse PREDICT_8x16C_V %else INIT_MMX mmx @@ -1674,71 +1750,42 @@ ;----------------------------------------------------------------------------- ; void predict_8x8c_h( uint8_t *src ) ;----------------------------------------------------------------------------- -%if HIGH_BIT_DEPTH - -%macro PREDICT_C_H 1 -cglobal predict_8x%1c_h, 1,1 - add r0, FDEC_STRIDEB*4 -%assign Y -4 -%rep %1 - movd m0, 
[r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2] - SPLATW m0, m0, 1 - mova [r0+FDEC_STRIDEB*Y], m0 -%if mmsize == 8 - mova [r0+FDEC_STRIDEB*Y+8], m0 +%macro PREDICT_C_H 0 +cglobal predict_8x8c_h, 1,1 +%if cpuflag(ssse3) && notcpuflag(avx2) + mova m2, [pb_3] %endif -%assign Y Y+1 -%endrep + PRED_H_4ROWS 8, 1 + PRED_H_4ROWS 8, 0 RET -%endmacro - -INIT_MMX mmx2 -PREDICT_C_H 8 -PREDICT_C_H 16 -INIT_XMM sse2 -PREDICT_C_H 8 -PREDICT_C_H 16 - -%else ; !HIGH_BIT_DEPTH - -%macro PREDICT_C_H_CORE 1 -%assign Y %1 -%rep 4 - SPLATB_LOAD m0, r0+FDEC_STRIDE*Y-1, m1 - mova [r0+FDEC_STRIDE*Y], m0 -%assign Y Y+1 -%endrep -%endmacro -%macro PREDICT_C_H 1 -cglobal predict_8x%1c_h, 1,1 -%if cpuflag(ssse3) - mova m1, [pb_3] +cglobal predict_8x16c_h, 1,2 +%if cpuflag(ssse3) && notcpuflag(avx2) + mova m2, [pb_3] %endif -%if %1==16 - add r0, FDEC_STRIDE*4 - PREDICT_C_H_CORE -4 - add r0, FDEC_STRIDE*4 - PREDICT_C_H_CORE -4 -%endif - add r0, FDEC_STRIDE*4 - PREDICT_C_H_CORE -4 - PREDICT_C_H_CORE 0 + mov r1d, 4 +.loop: + PRED_H_4ROWS 8, 1 + dec r1d + jg .loop RET %endmacro INIT_MMX mmx2 -PREDICT_C_H 8 -PREDICT_C_H 16 +PREDICT_C_H +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_C_H +INIT_XMM avx2 +PREDICT_C_H +%else INIT_MMX ssse3 -PREDICT_C_H 8 -PREDICT_C_H 16 - +PREDICT_C_H %endif + ;----------------------------------------------------------------------------- ; void predict_8x8c_dc( pixel *src ) ;----------------------------------------------------------------------------- - %macro LOAD_LEFT 1 movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL] movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL] @@ -1958,7 +2005,7 @@ paddw m0, m1 psrlw m0, 1 pavgw m0, m2 - STORE8x%1 m0, m0, m0, m0 + STORE%1 m0 RET %else ; !HIGH_BIT_DEPTH INIT_MMX @@ -1977,7 +2024,7 @@ pshufw mm1, mm1, 0 pshufw mm0, mm0, 0 ; dc0 (w) packuswb mm0, mm1 ; dc0,dc1 (b) - STORE8x%1 mm0, mm0, mm0, mm0 + STORE%1 mm0 RET %endif %endmacro @@ -1988,33 +2035,31 @@ ;----------------------------------------------------------------------------- ; void predict_16x16_v( pixel *src ) ;----------------------------------------------------------------------------- -%if HIGH_BIT_DEPTH -INIT_MMX -cglobal predict_16x16_v_mmx2, 1,2 - mova m0, [r0 - FDEC_STRIDEB+ 0] - mova m1, [r0 - FDEC_STRIDEB+ 8] - mova m2, [r0 - FDEC_STRIDEB+16] - mova m3, [r0 - FDEC_STRIDEB+24] - STORE16x16 m0, m1, m2, m3 - RET -INIT_XMM -cglobal predict_16x16_v_sse2, 2,2 - mova m0, [r0 - FDEC_STRIDEB+ 0] - mova m1, [r0 - FDEC_STRIDEB+16] - STORE16x16_SSE2 m0, m1 - RET -%else ; !HIGH_BIT_DEPTH -INIT_MMX -cglobal predict_16x16_v_mmx2, 1,2 - movq m0, [r0 - FDEC_STRIDE + 0] - movq m1, [r0 - FDEC_STRIDE + 8] - STORE16x16 m0, m1 - RET -INIT_XMM -cglobal predict_16x16_v_sse2, 1,1 - movdqa xmm0, [r0 - FDEC_STRIDE] - STORE16x16_SSE2 xmm0 + +%macro PREDICT_16x16_V 0 +cglobal predict_16x16_v, 1,2 +%assign %%i 0 +%rep 16*SIZEOF_PIXEL/mmsize + mova m %+ %%i, [r0-FDEC_STRIDEB+%%i*mmsize] +%assign %%i %%i+1 +%endrep +%if 16*SIZEOF_PIXEL/mmsize == 4 + STORE16 m0, m1, m2, m3 +%elif 16*SIZEOF_PIXEL/mmsize == 2 + STORE16 m0, m1 +%else + STORE16 m0 +%endif RET +%endmacro + +INIT_MMX mmx2 +PREDICT_16x16_V +INIT_XMM sse +PREDICT_16x16_V +%if HIGH_BIT_DEPTH +INIT_YMM avx +PREDICT_16x16_V %endif ;----------------------------------------------------------------------------- @@ -2022,46 +2067,23 @@ ;----------------------------------------------------------------------------- %macro PREDICT_16x16_H 0 cglobal predict_16x16_h, 1,2 - mov r1, 12*FDEC_STRIDEB -%if HIGH_BIT_DEPTH -.vloop: -%assign Y 0 -%rep 4 - movd m0, 
[r0+r1+Y*FDEC_STRIDEB-2*SIZEOF_PIXEL] - SPLATW m0, m0, 1 - mova [r0+r1+Y*FDEC_STRIDEB+ 0], m0 - mova [r0+r1+Y*FDEC_STRIDEB+16], m0 -%if mmsize==8 - mova [r0+r1+Y*FDEC_STRIDEB+ 8], m0 - mova [r0+r1+Y*FDEC_STRIDEB+24], m0 -%endif -%assign Y Y+1 -%endrep - -%else ; !HIGH_BIT_DEPTH -%if cpuflag(ssse3) - mova m1, [pb_3] -%endif -.vloop: -%assign Y 0 -%rep 4 - SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*Y-1, m1 - mova [r0+r1+FDEC_STRIDE*Y], m0 -%if mmsize==8 - mova [r0+r1+FDEC_STRIDE*Y+8], m0 +%if cpuflag(ssse3) && notcpuflag(avx2) + mova m2, [pb_3] %endif -%assign Y Y+1 -%endrep -%endif ; HIGH_BIT_DEPTH - sub r1, 4*FDEC_STRIDEB - jge .vloop + mov r1d, 4 +.loop: + PRED_H_4ROWS 16, 1 + dec r1d + jg .loop RET %endmacro INIT_MMX mmx2 PREDICT_16x16_H -INIT_XMM sse2 %if HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_16x16_H +INIT_YMM avx2 PREDICT_16x16_H %else ;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3 @@ -2072,8 +2094,7 @@ ;----------------------------------------------------------------------------- ; void predict_16x16_dc_core( pixel *src, int i_dc_left ) ;----------------------------------------------------------------------------- - -%macro PRED16x16_DC 2 +%macro PRED16x16_DC_MMX 2 %if HIGH_BIT_DEPTH mova m0, [r0 - FDEC_STRIDEB+ 0] paddw m0, [r0 - FDEC_STRIDEB+ 8] @@ -2083,7 +2104,7 @@ paddw m0, %1 psrlw m0, %2 SPLATW m0, m0 - STORE16x16 m0, m0, m0, m0 + STORE16 m0, m0, m0, m0 %else ; !HIGH_BIT_DEPTH pxor m0, m0 pxor m1, m1 @@ -2094,7 +2115,7 @@ psrlw m0, %2 ; dc pshufw m0, m0, 0 packuswb m0, m0 ; dc in bytes - STORE16x16 m0, m0 + STORE16 m0, m0 %endif %endmacro @@ -2102,15 +2123,15 @@ cglobal predict_16x16_dc_core, 1,2 %if ARCH_X86_64 movd m6, r1d - PRED16x16_DC m6, 5 + PRED16x16_DC_MMX m6, 5 %else - PRED16x16_DC r1m, 5 + PRED16x16_DC_MMX r1m, 5 %endif RET INIT_MMX mmx2 cglobal predict_16x16_dc_top, 1,2 - PRED16x16_DC [pw_8], 4 + PRED16x16_DC_MMX [pw_8], 4 RET INIT_MMX mmx2 @@ -2118,30 +2139,30 @@ cglobal predict_16x16_dc_left_core, 1,2 movd m0, r1m SPLATW m0, m0 - STORE16x16 m0, m0, m0, m0 + STORE16 m0, m0, m0, m0 RET %else ; !HIGH_BIT_DEPTH cglobal predict_16x16_dc_left_core, 1,1 movd m0, r1m pshufw m0, m0, 0 packuswb m0, m0 - STORE16x16 m0, m0 + STORE16 m0, m0 RET %endif -;----------------------------------------------------------------------------- -; void predict_16x16_dc_core( pixel *src, int i_dc_left ) -;----------------------------------------------------------------------------- - -%macro PRED16x16_DC_SSE2 2 +%macro PRED16x16_DC 2 %if HIGH_BIT_DEPTH - mova m0, [r0 - FDEC_STRIDEB+ 0] - paddw m0, [r0 - FDEC_STRIDEB+16] - HADDW m0, m2 - paddw m0, %1 - psrlw m0, %2 - SPLATW m0, m0 - STORE16x16_SSE2 m0, m0 + mova xm0, [r0 - FDEC_STRIDEB+ 0] + paddw xm0, [r0 - FDEC_STRIDEB+16] + HADDW xm0, xm2 + paddw xm0, %1 + psrlw xm0, %2 + SPLATW m0, xm0 +%if mmsize == 32 + STORE16 m0 +%else + STORE16 m0, m0 +%endif %else ; !HIGH_BIT_DEPTH pxor m0, m0 psadbw m0, [r0 - FDEC_STRIDE] @@ -2151,32 +2172,40 @@ psrlw m0, %2 ; dc SPLATW m0, m0 packuswb m0, m0 ; dc in bytes - STORE16x16_SSE2 m0 + STORE16 m0 %endif %endmacro -INIT_XMM sse2 +%macro PREDICT_16x16_DC_CORE 0 cglobal predict_16x16_dc_core, 2,2,4 - movd m3, r1m - PRED16x16_DC_SSE2 m3, 5 + movd xm3, r1m + PRED16x16_DC xm3, 5 RET cglobal predict_16x16_dc_top, 1,2 - PRED16x16_DC_SSE2 [pw_8], 4 + PRED16x16_DC [pw_8], 4 RET -INIT_XMM sse2 -%if HIGH_BIT_DEPTH cglobal predict_16x16_dc_left_core, 1,2 - movd m0, r1m - SPLATW m0, m0 - STORE16x16_SSE2 m0, m0 - RET -%else ; !HIGH_BIT_DEPTH -cglobal predict_16x16_dc_left_core, 1,1 - movd m0, r1m 
- SPLATW m0, m0 + movd xm0, r1m + SPLATW m0, xm0 +%if HIGH_BIT_DEPTH && mmsize == 16 + STORE16 m0, m0 +%else +%if HIGH_BIT_DEPTH == 0 packuswb m0, m0 - STORE16x16_SSE2 m0 +%endif + STORE16 m0 +%endif RET +%endmacro + +INIT_XMM sse2 +PREDICT_16x16_DC_CORE +%if HIGH_BIT_DEPTH +INIT_YMM avx2 +PREDICT_16x16_DC_CORE +%else +INIT_XMM avx2 +PREDICT_16x16_DC_CORE %endif
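Much of the predict-a.asm hunk above folds the old per-width horizontal-prediction loops into the PRED_H_LOAD / PRED_H_STORE / PRED_H_4ROWS helpers: each row's left neighbour is loaded, splatted across a register (vpbroadcastpix on AVX2, SPLATW or SPLATB_LOAD otherwise) and stored over the whole row. As an illustration only (not code from the diff), the scalar behaviour being vectorized is roughly:

#include <stdint.h>
#include <string.h>

/* Sketch of 16x16 horizontal prediction: the pixel to the left of each row
 * fills that row. x264 uses the compile-time constant FDEC_STRIDE; it is a
 * parameter here only so the sketch stands alone. */
static void predict_16x16_h_sketch( uint8_t *src, int stride )
{
    for( int y = 0; y < 16; y++ )
        memset( &src[y*stride], src[y*stride - 1], 16 );
}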
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/predict-c.c -> x264-snapshot-20130723-2245.tar.bz2/common/x86/predict-c.c
Changed
@@ -43,6 +43,7 @@ PREDICT_16x16_DC( mmx2 ) PREDICT_16x16_DC( sse2 ) +PREDICT_16x16_DC( avx2 ) #define PREDICT_16x16_DC_LEFT(name)\ static void x264_predict_16x16_dc_left_##name( pixel *src )\ @@ -58,10 +59,11 @@ PREDICT_16x16_DC_LEFT( mmx2 ) PREDICT_16x16_DC_LEFT( sse2 ) +PREDICT_16x16_DC_LEFT( avx2 ) #define PREDICT_P_SUM(j,i)\ H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\ - V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\ + V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] ); ALIGNED_16( static const int16_t pw_12345678[8] ) = {1,2,3,4,5,6,7,8}; ALIGNED_16( static const int16_t pw_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1}; @@ -70,178 +72,181 @@ ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1}; ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4}; -#if !HIGH_BIT_DEPTH -#define PREDICT_16x16_P(name)\ -static void x264_predict_16x16_p_##name( pixel *src )\ -{\ - int a, b, c;\ +#define PREDICT_16x16_P_CORE\ int H = 0;\ int V = 0;\ - int i00;\ - PREDICT_P_SUM(7,1) \ - PREDICT_P_SUM(7,2) \ - PREDICT_P_SUM(7,3) \ - PREDICT_P_SUM(7,4) \ - PREDICT_P_SUM(7,5) \ - PREDICT_P_SUM(7,6) \ - PREDICT_P_SUM(7,7) \ - PREDICT_P_SUM(7,8) \ - a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\ - b = ( 5 * H + 32 ) >> 6;\ - c = ( 5 * V + 32 ) >> 6;\ - i00 = a - b * 7 - c * 7 + 16;\ - x264_predict_16x16_p_core_##name( src, i00, b, c );\ -} -#ifndef ARCH_X86_64 -PREDICT_16x16_P( mmx2 ) -#endif -PREDICT_16x16_P( sse2 ) -PREDICT_16x16_P( avx ) -#endif //!HIGH_BIT_DEPTH + PREDICT_P_SUM(7,1)\ + PREDICT_P_SUM(7,2)\ + PREDICT_P_SUM(7,3)\ + PREDICT_P_SUM(7,4)\ + PREDICT_P_SUM(7,5)\ + PREDICT_P_SUM(7,6)\ + PREDICT_P_SUM(7,7)\ + PREDICT_P_SUM(7,8) -#define PREDICT_8x16C_P_CORE \ - int H = 0, V = 0;\ - for( int i = 0; i < 4; i++ )\ - H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\ - for( int i = 0; i < 8; i++ )\ - V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );\ - int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\ - int b = ( 17 * H + 16 ) >> 5;\ - int c = ( 5 * V + 32 ) >> 6; - -#if HIGH_BIT_DEPTH -#define PREDICT_8x16_P(name)\ -static void x264_predict_8x16c_p_##name( uint16_t *src )\ -{\ - PREDICT_8x16C_P_CORE \ - x264_predict_8x16c_p_core_##name( src, a, b, c );\ -} +#define PREDICT_16x16_P_END(name)\ + int a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\ + int b = ( 5 * H + 32 ) >> 6;\ + int c = ( 5 * V + 32 ) >> 6;\ + int i00 = a - b * 7 - c * 7 + 16;\ + /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case + * than to try to consider it in the asm. 
*/\ + if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )\ + x264_predict_16x16_p_c( src );\ + else\ + x264_predict_16x16_p_core_##name( src, i00, b, c ); -PREDICT_8x16_P(sse2) -PREDICT_8x16_P(avx) -#else -#define PREDICT_8x16_P(name)\ -static void x264_predict_8x16c_p_##name( uint8_t *src )\ +#define PREDICT_16x16_P(name, name2)\ +static void x264_predict_16x16_p_##name( pixel *src )\ {\ - PREDICT_8x16C_P_CORE \ - int i00 = a -3*b -7*c + 16;\ - x264_predict_8x16c_p_core_##name( src, i00, b, c );\ + PREDICT_16x16_P_CORE\ + PREDICT_16x16_P_END(name2)\ } -#ifndef ARCH_X86_64 -PREDICT_8x16_P(mmx2) -#endif -PREDICT_8x16_P(sse2) -PREDICT_8x16_P(avx) -#endif #if HAVE_X86_INLINE_ASM #if HIGH_BIT_DEPTH -static void x264_predict_16x16_p_sse2( uint16_t *src ) -#else -static void x264_predict_16x16_p_ssse3( uint8_t *src ) -#endif -{ - int a, b, c, i00; - int H, V; -#if HIGH_BIT_DEPTH - asm ( - "movdqu %1, %%xmm1 \n" - "movdqa %2, %%xmm0 \n" - "pmaddwd %3, %%xmm0 \n" - "pmaddwd %4, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "movhlps %%xmm0, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "pshuflw $14, %%xmm0, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "movd %%xmm0, %0 \n" - :"=r"(H) - :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]), - "m"(*pw_12345678), "m"(*pw_m87654321) +#define PREDICT_16x16_P_ASM\ + asm (\ + "movdqu %1, %%xmm1 \n"\ + "movdqa %2, %%xmm0 \n"\ + "pmaddwd %3, %%xmm0 \n"\ + "pmaddwd %4, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "movhlps %%xmm0, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "pshuflw $14, %%xmm0, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "movd %%xmm0, %0 \n"\ + :"=r"(H)\ + :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]),\ + "m"(*pw_12345678), "m"(*pw_m87654321)\ ); -#else - asm ( - "movq %1, %%mm1 \n" - "movq %2, %%mm0 \n" - "palignr $7, %3, %%mm1 \n" - "pmaddubsw %4, %%mm0 \n" - "pmaddubsw %5, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "pshufw $14, %%mm0, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "pshufw $1, %%mm0, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "movd %%mm0, %0 \n" - "movswl %w0, %0 \n" - :"=r"(H) - :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]), - "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321) +#else // !HIGH_BIT_DEPTH +#define PREDICT_16x16_P_ASM\ + asm (\ + "movq %1, %%mm1 \n"\ + "movq %2, %%mm0 \n"\ + "palignr $7, %3, %%mm1 \n"\ + "pmaddubsw %4, %%mm0 \n"\ + "pmaddubsw %5, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "pshufw $14, %%mm0, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "pshufw $1, %%mm0, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "movd %%mm0, %0 \n"\ + "movswl %w0, %0 \n"\ + :"=r"(H)\ + :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]),\ + "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321)\ ); -#endif - V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] ) - + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] ) - + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] ) - + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] ) - + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] ) - + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] ) - + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] ) +#endif // HIGH_BIT_DEPTH + +#define PREDICT_16x16_P_CORE_INLINE\ + int H, V;\ + PREDICT_16x16_P_ASM\ + V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )\ + + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )\ + + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )\ + + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )\ + + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )\ + + 3 * ( 
src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )\ + + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )\ + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] ); - a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] ); - b = ( 5 * H + 32 ) >> 6; - c = ( 5 * V + 32 ) >> 6; - i00 = a - b * 7 - c * 7 + 16; - /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case - * than to try to consider it in the asm. */ - if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) ) - x264_predict_16x16_p_c( src ); - else - x264_predict_16x16_p_core_sse2( src, i00, b, c ); + +#define PREDICT_16x16_P_INLINE(name, name2)\ +static void x264_predict_16x16_p_##name( pixel *src )\ +{\ + PREDICT_16x16_P_CORE_INLINE\ + PREDICT_16x16_P_END(name2)\ } -#endif +#else // !HAVE_X86_INLINE_ASM +#define PREDICT_16x16_P_INLINE(name, name2) PREDICT_16x16_P(name, name2) +#endif // HAVE_X86_INLINE_ASM + +#if HIGH_BIT_DEPTH +PREDICT_16x16_P_INLINE( sse2, sse2 ) +#else // !HIGH_BIT_DEPTH +#if !ARCH_X86_64 +PREDICT_16x16_P( mmx2, mmx2 ) +#endif // !ARCH_X86_64 +PREDICT_16x16_P( sse2, sse2 ) +#if HAVE_X86_INLINE_ASM +PREDICT_16x16_P_INLINE( ssse3, sse2 ) +#endif // HAVE_X86_INLINE_ASM +PREDICT_16x16_P_INLINE( avx, avx ) +#endif // HIGH_BIT_DEPTH +PREDICT_16x16_P_INLINE( avx2, avx2 ) + +#define PREDICT_8x16C_P_CORE\ + int H = 0, V = 0;\ + for( int i = 0; i < 4; i++ )\ + H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\ + for( int i = 0; i < 8; i++ )\ + V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] ); -#if !HIGH_BIT_DEPTH +#if HIGH_BIT_DEPTH +#define PREDICT_8x16C_P_END(name)\ + int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\ + int b = ( 17 * H + 16 ) >> 5;\ + int c = ( 5 * V + 32 ) >> 6;\ + x264_predict_8x16c_p_core_##name( src, a, b, c ); +#else // !HIGH_BIT_DEPTH +#define PREDICT_8x16C_P_END(name)\ + int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\ + int b = ( 17 * H + 16 ) >> 5;\ + int c = ( 5 * V + 32 ) >> 6;\ + int i00 = a -3*b -7*c + 16;\ + x264_predict_8x16c_p_core_##name( src, i00, b, c ); +#endif // HIGH_BIT_DEPTH -#define PREDICT_8x8_P(name)\ -static void x264_predict_8x8c_p_##name( uint8_t *src )\ +#define PREDICT_8x16C_P(name)\ +static void x264_predict_8x16c_p_##name( pixel *src )\ {\ - int a, b, c;\ + PREDICT_8x16C_P_CORE\ + PREDICT_8x16C_P_END(name)\ +} + +#if !ARCH_X86_64 && !HIGH_BIT_DEPTH +PREDICT_8x16C_P( mmx2 ) +#endif // !ARCH_X86_64 && !HIGH_BIT_DEPTH +PREDICT_8x16C_P( sse2 ) +PREDICT_8x16C_P( avx ) +PREDICT_8x16C_P( avx2 ) + +#define PREDICT_8x8C_P_CORE\ int H = 0;\ int V = 0;\ - int i00;\ PREDICT_P_SUM(3,1)\ PREDICT_P_SUM(3,2)\ PREDICT_P_SUM(3,3)\ - PREDICT_P_SUM(3,4)\ - a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\ - b = ( 17 * H + 16 ) >> 5;\ - c = ( 17 * V + 16 ) >> 5;\ - i00 = a -3*b -3*c + 16;\ - x264_predict_8x8c_p_core_##name( src, i00, b, c );\ -} -#ifndef ARCH_X86_64 -PREDICT_8x8_P( mmx2 ) -#endif -PREDICT_8x8_P( sse2 ) + PREDICT_P_SUM(3,4) -#endif //!HIGH_BIT_DEPTH - -#if HAVE_X86_INLINE_ASM - -#define PREDICT_8x8C_P_CORE\ - V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )\ - + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )\ - + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )\ - + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );\ - H += -4 * src[-1*FDEC_STRIDE -1];\ +#if HIGH_BIT_DEPTH +#define PREDICT_8x8C_P_END(name)\ int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\ int b = ( 17 * H + 16 ) >> 5;\ - 
int c = ( 17 * V + 16 ) >> 5; + int c = ( 17 * V + 16 ) >> 5;\ + x264_predict_8x8c_p_core_##name( src, a, b, c ); +#else // !HIGH_BIT_DEPTH +#define PREDICT_8x8C_P_END(name)\ + int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\ + int b = ( 17 * H + 16 ) >> 5;\ + int c = ( 17 * V + 16 ) >> 5;\ + int i00 = a -3*b -3*c + 16;\ + x264_predict_8x8c_p_core_##name( src, i00, b, c ); +#endif // HIGH_BIT_DEPTH -#if HIGH_BIT_DEPTH -#define PREDICT_8x8_P2(cpu1, cpu2)\ -static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\ +#define PREDICT_8x8C_P(name, name2)\ +static void x264_predict_8x8c_p_##name( pixel *src )\ {\ - int H, V;\ + PREDICT_8x8C_P_CORE\ + PREDICT_8x8C_P_END(name2)\ +} + +#if HAVE_X86_INLINE_ASM +#if HIGH_BIT_DEPTH +#define PREDICT_8x8C_P_ASM\ asm (\ "movdqa %1, %%xmm0 \n"\ "pmaddwd %2, %%xmm0 \n"\ @@ -252,19 +257,9 @@ "movd %%xmm0, %0 \n"\ :"=r"(H)\ :"m"(src[-FDEC_STRIDE]), "m"(*pw_m32101234)\ - );\ - PREDICT_8x8C_P_CORE\ - x264_predict_8x8c_p_core_ ## cpu2( src, a, b, c );\ -} - -PREDICT_8x8_P2(sse2, sse2) -PREDICT_8x8_P2( avx, avx) - -#else //!HIGH_BIT_DEPTH -#define PREDICT_8x8_P2(cpu1, cpu2)\ -static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\ -{\ - int H, V;\ + ); +#else // !HIGH_BIT_DEPTH +#define PREDICT_8x8C_P_ASM\ asm (\ "movq %1, %%mm0 \n"\ "pmaddubsw %2, %%mm0 \n"\ @@ -276,16 +271,41 @@ "movswl %w0, %0 \n"\ :"=r"(H)\ :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)\ - );\ - PREDICT_8x8C_P_CORE\ - int i00 = a -3*b -3*c + 16;\ - x264_predict_8x8c_p_core_ ## cpu2( src, i00, b, c );\ + ); +#endif // HIGH_BIT_DEPTH + +#define PREDICT_8x8C_P_CORE_INLINE\ + int H, V;\ + PREDICT_8x8C_P_ASM\ + V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )\ + + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )\ + + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )\ + + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );\ + H += -4 * src[-1*FDEC_STRIDE -1]; + +#define PREDICT_8x8C_P_INLINE(name, name2)\ +static void x264_predict_8x8c_p_##name( pixel *src )\ +{\ + PREDICT_8x8C_P_CORE_INLINE\ + PREDICT_8x8C_P_END(name2)\ } +#else // !HAVE_X86_INLINE_ASM +#define PREDICT_8x8C_P_INLINE(name, name2) PREDICT_8x8C_P(name, name2) +#endif // HAVE_X86_INLINE_ASM -PREDICT_8x8_P2(ssse3, sse2) -PREDICT_8x8_P2( avx, avx) -#endif -#endif +#if HIGH_BIT_DEPTH +PREDICT_8x8C_P_INLINE( sse2, sse2 ) +#else //!HIGH_BIT_DEPTH +#if !ARCH_X86_64 +PREDICT_8x8C_P( mmx2, mmx2 ) +#endif // !ARCH_X86_64 +PREDICT_8x8C_P( sse2, sse2 ) +#if HAVE_X86_INLINE_ASM +PREDICT_8x8C_P_INLINE( ssse3, sse2 ) +#endif // HAVE_X86_INLINE_ASM +#endif // HIGH_BIT_DEPTH +PREDICT_8x8C_P_INLINE( avx, avx ) +PREDICT_8x8C_P_INLINE( avx2, avx2 ) #if ARCH_X86_64 && !HIGH_BIT_DEPTH static void x264_predict_8x8c_dc_left( uint8_t *src ) @@ -312,7 +332,6 @@ M64( src ) = dc1; src += FDEC_STRIDE; } - } #endif // ARCH_X86_64 && !HIGH_BIT_DEPTH @@ -329,24 +348,32 @@ pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx2; pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmx2; #if HIGH_BIT_DEPTH + if( !(cpu&X264_CPU_SSE) ) + return; + pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2; pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2; pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2; - pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2; pf[I_PRED_16x16_H] = x264_predict_16x16_h_sse2; -#if HAVE_X86_INLINE_ASM pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2; -#endif + if( !(cpu&X264_CPU_AVX) ) + return; + pf[I_PRED_16x16_V] = x264_predict_16x16_v_avx; 
+ if( !(cpu&X264_CPU_AVX2) ) + return; + pf[I_PRED_16x16_H] = x264_predict_16x16_h_avx2; #else #if !ARCH_X86_64 pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmx2; #endif + if( !(cpu&X264_CPU_SSE) ) + return; + pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2; - pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2; if( cpu&X264_CPU_SSE2_IS_SLOW ) return; pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2; @@ -354,7 +381,8 @@ pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; - pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3; + if( !(cpu&X264_CPU_SLOW_PSHUFB) ) + pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3; #if HAVE_X86_INLINE_ASM pf[I_PRED_16x16_P] = x264_predict_16x16_p_ssse3; #endif @@ -362,6 +390,14 @@ return; pf[I_PRED_16x16_P] = x264_predict_16x16_p_avx; #endif // HIGH_BIT_DEPTH + + if( cpu&X264_CPU_AVX2 ) + { + pf[I_PRED_16x16_P] = x264_predict_16x16_p_avx2; + pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_avx2; + pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_avx2; + pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_avx2; + } } void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] ) @@ -374,18 +410,21 @@ return; pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmx2; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmx2; + if( !(cpu&X264_CPU_SSE) ) + return; + pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; - pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_sse2; pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_sse2; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_sse2; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_sse2; -#if HAVE_X86_INLINE_ASM pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_sse2; if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx; -#endif + if( !(cpu&X264_CPU_AVX2) ) + return; + pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_avx2; #else #if ARCH_X86_64 pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left; @@ -407,11 +446,16 @@ pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_ssse3; #if HAVE_X86_INLINE_ASM pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_ssse3; +#endif if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx; -#endif #endif // HIGH_BIT_DEPTH + + if( cpu&X264_CPU_AVX2 ) + { + pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx2; + } } void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] ) @@ -423,9 +467,11 @@ return; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2; + if( !(cpu&X264_CPU_SSE) ) + return; + pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; - pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse2; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_sse2; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_sse2; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_sse2; @@ -433,6 +479,9 @@ if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx; + if( !(cpu&X264_CPU_AVX2) ) + return; + pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_avx2; #else pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_mmx; if( !(cpu&X264_CPU_MMX2) ) @@ -440,7 +489,7 @@ pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_mmx2; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2; -#ifndef ARCH_X86_64 +#if !ARCH_X86_64 pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_mmx2; #endif if( !(cpu&X264_CPU_SSE2) ) @@ -453,6 +502,11 @@ return; pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx; 
#endif // HIGH_BIT_DEPTH + + if( cpu&X264_CPU_AVX2 ) + { + pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx2; + } } void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ) @@ -460,9 +514,11 @@ if( !(cpu&X264_CPU_MMX2) ) return; #if HIGH_BIT_DEPTH + if( !(cpu&X264_CPU_SSE) ) + return; + pf[I_PRED_8x8_V] = x264_predict_8x8_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; - pf[I_PRED_8x8_V] = x264_predict_8x8_v_sse2; pf[I_PRED_8x8_H] = x264_predict_8x8_h_sse2; pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_sse2; pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_sse2; @@ -520,8 +576,11 @@ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; - pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3; - pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3; + if( !(cpu&X264_CPU_SLOW_PALIGNR) ) + { + pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3; + pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3; + } pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3; *predict_8x8_filter = x264_predict_8x8_filter_ssse3; if( !(cpu&X264_CPU_AVX) ) @@ -564,6 +623,9 @@ pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_avx; pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_avx; pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_avx; + if( !(cpu&X264_CPU_AVX2) ) + return; + pf[I_PRED_4x4_H] = x264_predict_4x4_h_avx2; #else pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmx2; if( !(cpu&X264_CPU_SSSE3) )
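The reorganized predict-c.c wrappers above keep the planar-mode arithmetic unchanged: PREDICT_16x16_P_END turns the H/V gradients into the b, c and i00 parameters passed to the asm core, and at bit depths above 8 it branches to the plain C predictor when those values could overflow. A standalone rendering of just that step (illustrative; the helper name and struct are invented, the formulas are taken from the macro):

#include <stdlib.h>

typedef struct { int i00, b, c, use_c_fallback; } plane16x16_params;

/* Same arithmetic as PREDICT_16x16_P_END: 'corner' stands for
 * src[15*FDEC_STRIDE-1] + src[15-FDEC_STRIDE]; H and V are the gradients
 * accumulated by PREDICT_16x16_P_CORE. */
static plane16x16_params plane16x16_from_gradients( int H, int V, int corner, int bit_depth )
{
    plane16x16_params p;
    int a = 16 * corner;
    p.b   = ( 5 * H + 32 ) >> 6;
    p.c   = ( 5 * V + 32 ) >> 6;
    p.i00 = a - p.b * 7 - p.c * 7 + 16;
    /* b*15 + c*15 can overflow above 8-bit depth, hence the fallback test
     * copied from the macro above. */
    p.use_c_fallback = bit_depth > 8 &&
                       ( p.i00 > 0x7fff || abs( p.b ) > 1092 || abs( p.c ) > 1092 );
    return p;
}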
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/predict.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/predict.h
Changed
@@ -34,48 +34,57 @@ void x264_predict_8x8_init_mmx ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ); void x264_predict_16x16_v_mmx2( pixel *src ); -void x264_predict_16x16_v_sse2( pixel *src ); +void x264_predict_16x16_v_sse ( pixel *src ); +void x264_predict_16x16_v_avx ( uint16_t *src ); void x264_predict_16x16_h_mmx2( pixel *src ); void x264_predict_16x16_h_sse2( uint16_t *src ); void x264_predict_16x16_h_ssse3( uint8_t *src ); +void x264_predict_16x16_h_avx2( uint16_t *src ); void x264_predict_16x16_dc_mmx2( pixel *src ); void x264_predict_16x16_dc_sse2( pixel *src ); void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left ); +void x264_predict_16x16_dc_core_avx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left ); +void x264_predict_16x16_dc_left_core_avx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_top_mmx2( pixel *src ); void x264_predict_16x16_dc_top_sse2( pixel *src ); -void x264_predict_16x16_dc_top_ssse3( uint16_t *src ); +void x264_predict_16x16_dc_top_avx2( pixel *src ); void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c ); void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_16x16_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x16c_dc_mmx2( pixel *src ); void x264_predict_8x16c_dc_sse2( uint16_t *src ); void x264_predict_8x16c_dc_top_mmx2( uint8_t *src ); void x264_predict_8x16c_dc_top_sse2( uint16_t *src ); void x264_predict_8x16c_v_mmx( uint8_t *src ); -void x264_predict_8x16c_v_sse2( uint16_t *src ); +void x264_predict_8x16c_v_sse( uint16_t *src ); void x264_predict_8x16c_h_mmx2( pixel *src ); -void x264_predict_8x16c_h_sse2( pixel *src ); +void x264_predict_8x16c_h_sse2( uint16_t *src ); void x264_predict_8x16c_h_ssse3( uint8_t *src ); +void x264_predict_8x16c_h_avx2( uint16_t *src ); void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c ); -void x264_predict_8x16c_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_8x16c_p_core_avx ( pixel *src, int i00, int b, int c ); +void x264_predict_8x16c_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c ); -void x264_predict_8x8c_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_8x8c_p_core_avx ( pixel *src, int i00, int b, int c ); +void x264_predict_8x8c_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x8c_dc_mmx2( pixel *src ); void x264_predict_8x8c_dc_sse2( uint16_t *src ); void x264_predict_8x8c_dc_top_mmx2( uint8_t *src ); void x264_predict_8x8c_dc_top_sse2( uint16_t *src ); void x264_predict_8x8c_v_mmx( pixel *src ); -void x264_predict_8x8c_v_sse2( uint16_t *src ); +void x264_predict_8x8c_v_sse( uint16_t *src ); void x264_predict_8x8c_h_mmx2( pixel *src ); -void x264_predict_8x8c_h_sse2( pixel *src ); +void x264_predict_8x8c_h_sse2( uint16_t *src ); void x264_predict_8x8c_h_ssse3( uint8_t *src ); +void x264_predict_8x8c_h_avx2( uint16_t *src ); void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_v_sse2( 
uint16_t *src, uint16_t edge[36] ); +void x264_predict_8x8_v_sse ( uint16_t *src, uint16_t edge[36] ); void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] ); void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] ); @@ -114,6 +123,7 @@ void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters ); void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[36], int i_neighbor, int i_filters ); void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters ); +void x264_predict_4x4_h_avx2( uint16_t *src ); void x264_predict_4x4_ddl_mmx2( pixel *src ); void x264_predict_4x4_ddl_sse2( uint16_t *src ); void x264_predict_4x4_ddl_avx( uint16_t *src );
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/quant-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/quant-a.asm
Changed
@@ -7,7 +7,7 @@ ;* Jason Garrett-Glaser <darkshikari@gmail.com> ;* Christian Heine <sennindemokrit@gmx.net> ;* Oskar Arvidsson <oskar@irock.se> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -30,7 +30,7 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 %macro DQM4 3 dw %1, %2, %1, %2, %2, %3, %2, %3 @@ -39,8 +39,7 @@ dw %1, %4, %5, %4, %1, %4, %5, %4 dw %4, %2, %6, %2, %4, %2, %6, %2 dw %5, %6, %3, %6, %5, %6, %3, %6 - ; last line not used, just padding for power-of-2 stride - times 8 dw 0 + dw %4, %2, %6, %2, %4, %2, %6, %2 %endmacro dequant4_scale: @@ -75,27 +74,55 @@ chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1 chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1 +%if HIGH_BIT_DEPTH==0 +dct_coef_shuffle: +%macro DCT_COEF_SHUFFLE 8 + %assign y x + %rep 8 + %rep 7 + %rotate (~(y>>7))&1 + %assign y y<<((~(y>>7))&1) + %endrep + db %1*2 + %rotate 1 + %assign y y<<1 + %endrep +%endmacro +%assign x 0 +%rep 256 + DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0 +%assign x x+1 +%endrep +%endif + SECTION .text cextern pb_1 cextern pw_1 +cextern pw_2 +cextern pw_256 cextern pd_1 cextern pb_01 cextern pd_1024 - -%macro QUANT_DC_START 0 - movd m6, r1m ; mf - movd m7, r2m ; bias -%if HIGH_BIT_DEPTH - SPLATD m6, m6 - SPLATD m7, m7 +cextern deinterleave_shufd +cextern popcnt_table + +%macro QUANT_DC_START 2 + movd xm%1, r1m ; mf + movd xm%2, r2m ; bias +%if cpuflag(avx2) + vpbroadcastdct m%1, xm%1 + vpbroadcastdct m%2, xm%2 +%elif HIGH_BIT_DEPTH + SPLATD m%1, m%1 + SPLATD m%2, m%2 %elif cpuflag(sse4) ; ssse3, but not faster on conroe mova m5, [pb_01] - pshufb m6, m5 - pshufb m7, m5 + pshufb m%1, m5 + pshufb m%2, m5 %else - SPLATW m6, m6 - SPLATW m7, m7 + SPLATW m%1, m%1 + SPLATW m%2, m%2 %endif %endmacro @@ -175,7 +202,7 @@ %endif ; cpuflag %endmacro -%macro QUANT_ONE_AC_MMX 4 +%macro QUANT_ONE_AC_MMX 5 mova m0, [%1] mova m2, [%2] ABSD m1, m0 @@ -191,10 +218,10 @@ psrad m1, 16 PSIGND m1, m0 mova [%1], m1 - ACCUM por, 5, 1, %4 + ACCUM por, %5, 1, %4 %endmacro -%macro QUANT_TWO_AC 4 +%macro QUANT_TWO_AC 5 %if cpuflag(sse4) mova m0, [%1 ] mova m1, [%1+mmsize] @@ -210,11 +237,11 @@ PSIGND m3, m1 mova [%1 ], m2 mova [%1+mmsize], m3 - ACCUM por, 5, 2, %4 - por m5, m3 + ACCUM por, %5, 2, %4 + por m%5, m3 %else ; !sse4 - QUANT_ONE_AC_MMX %1, %2, %3, %4 - QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize + QUANT_ONE_AC_MMX %1, %2, %3, %4, %5 + QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, 1, %5 %endif ; cpuflag %endmacro @@ -223,7 +250,7 @@ ;----------------------------------------------------------------------------- %macro QUANT_DC 2 cglobal quant_%1x%2_dc, 3,3,8 - QUANT_DC_START + QUANT_DC_START 6,7 %if %1*%2 <= mmsize/4 QUANT_ONE_DC r0, m6, m7, 0 %else @@ -244,35 +271,87 @@ cglobal quant_%1x%2, 3,3,8 %assign x 0 %rep %1*%2/(mmsize/2) - QUANT_TWO_AC r0+x, r1+x, r2+x, x + QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5 %assign x x+mmsize*2 %endrep QUANT_END RET %endmacro +%macro QUANT_4x4 2 + QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, 0, %2 + QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, 1, %2 +%endmacro + +%macro QUANT_4x4x4 0 +cglobal quant_4x4x4, 3,3,8 + QUANT_4x4 0, 5 + QUANT_4x4 64, 6 + add r0, 128 + packssdw m5, m6 + QUANT_4x4 0, 6 + QUANT_4x4 64, 7 + packssdw m6, m7 + packssdw m5, m6 + packssdw m5, m5 ; AA BB CC DD + packsswb m5, m5 ; A B C D + pxor m4, m4 + pcmpeqb m5, m4 + 
pmovmskb eax, m5 + not eax + and eax, 0xf + RET +%endmacro + INIT_XMM sse2 QUANT_DC 2, 2 QUANT_DC 4, 4 QUANT_AC 4, 4 QUANT_AC 8, 8 +QUANT_4x4x4 INIT_XMM ssse3 QUANT_DC 2, 2 QUANT_DC 4, 4 QUANT_AC 4, 4 QUANT_AC 8, 8 +QUANT_4x4x4 INIT_XMM sse4 QUANT_DC 2, 2 QUANT_DC 4, 4 QUANT_AC 4, 4 QUANT_AC 8, 8 +QUANT_4x4x4 + +INIT_YMM avx2 +QUANT_DC 4, 4 +QUANT_AC 4, 4 +QUANT_AC 8, 8 + +INIT_YMM avx2 +cglobal quant_4x4x4, 3,3,6 + QUANT_TWO_AC r0, r1, r2, 0, 4 + QUANT_TWO_AC r0+64, r1, r2, 0, 5 + add r0, 128 + packssdw m4, m5 + QUANT_TWO_AC r0, r1, r2, 0, 5 + QUANT_TWO_AC r0+64, r1, r2, 0, 1 + packssdw m5, m1 + packssdw m4, m5 + pxor m3, m3 + pcmpeqd m4, m3 + movmskps eax, m4 + mov edx, eax + shr eax, 4 + and eax, edx + xor eax, 0xf + RET %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 -%macro QUANT_ONE 4 +%macro QUANT_ONE 5 ;;; %1 (m64) dct[y][x] ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t) ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t) @@ -282,10 +361,10 @@ pmulhuw m0, %2 ; divide PSIGNW m0, m1 ; restore sign mova %1, m0 ; store - ACCUM por, 5, 0, %4 + ACCUM por, %5, 0, %4 %endmacro -%macro QUANT_TWO 7 +%macro QUANT_TWO 8 mova m1, %1 mova m3, %2 ABSW m0, m1, sign @@ -298,8 +377,8 @@ PSIGNW m2, m3 mova %1, m0 mova %2, m2 - ACCUM por, 5, 0, %7 - por m5, m2 + ACCUM por, %8, 0, %7 + ACCUM por, %8, 2, %7+mmsize %endmacro ;----------------------------------------------------------------------------- @@ -307,13 +386,14 @@ ;----------------------------------------------------------------------------- %macro QUANT_DC 2-3 0 cglobal %1, 1,1,%3 - QUANT_DC_START %if %2==1 - QUANT_ONE [r0], m6, m7, 0 + QUANT_DC_START 2,3 + QUANT_ONE [r0], m2, m3, 0, 5 %else + QUANT_DC_START 4,6 %assign x 0 %rep %2/2 - QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x + QUANT_TWO [r0+x], [r0+x+mmsize], m4, m4, m6, m6, x, 5 %assign x x+mmsize*2 %endrep %endif @@ -326,15 +406,57 @@ ;----------------------------------------------------------------------------- %macro QUANT_AC 2 cglobal %1, 3,3 +%if %2==1 + QUANT_ONE [r0], [r1], [r2], 0, 5 +%else %assign x 0 %rep %2/2 - QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x + QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x, 5 %assign x x+mmsize*2 %endrep +%endif QUANT_END RET %endmacro +%macro QUANT_4x4 2 +%if UNIX64 + QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], m8, m9, m10, m11, mmsize*0, %2 +%else + QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], [r1+mmsize*0], [r1+mmsize*1], [r2+mmsize*0], [r2+mmsize*1], mmsize*0, %2 +%if mmsize==8 + QUANT_TWO [r0+%1+mmsize*2], [r0+%1+mmsize*3], [r1+mmsize*2], [r1+mmsize*3], [r2+mmsize*2], [r2+mmsize*3], mmsize*2, %2 +%endif +%endif +%endmacro + +%macro QUANT_4x4x4 0 +cglobal quant_4x4x4, 3,3,7 +%if UNIX64 + mova m8, [r1+mmsize*0] + mova m9, [r1+mmsize*1] + mova m10, [r2+mmsize*0] + mova m11, [r2+mmsize*1] +%endif + QUANT_4x4 0, 4 + QUANT_4x4 32, 5 + packssdw m4, m5 + QUANT_4x4 64, 5 + QUANT_4x4 96, 6 + packssdw m5, m6 + packssdw m4, m5 +%if mmsize == 16 + packssdw m4, m4 ; AA BB CC DD +%endif + packsswb m4, m4 ; A B C D + pxor m3, m3 + pcmpeqb m4, m3 + pmovmskb eax, m4 + not eax + and eax, 0xf + RET +%endmacro + INIT_MMX mmx2 QUANT_DC quant_2x2_dc, 1 %if ARCH_X86_64 == 0 ; not needed because sse2 is faster @@ -342,26 +464,54 @@ INIT_MMX mmx QUANT_AC quant_4x4, 4 QUANT_AC quant_8x8, 16 +QUANT_4x4x4 %endif INIT_XMM sse2 -QUANT_DC quant_4x4_dc, 2, 8 +QUANT_DC quant_4x4_dc, 2, 7 QUANT_AC quant_4x4, 2 QUANT_AC quant_8x8, 8 +QUANT_4x4x4 INIT_XMM ssse3 -QUANT_DC quant_4x4_dc, 2, 8 
+QUANT_DC quant_4x4_dc, 2, 7 QUANT_AC quant_4x4, 2 QUANT_AC quant_8x8, 8 +QUANT_4x4x4 INIT_MMX ssse3 QUANT_DC quant_2x2_dc, 1 INIT_XMM sse4 ;Not faster on Conroe, so only used in SSE4 versions -QUANT_DC quant_4x4_dc, 2, 8 +QUANT_DC quant_4x4_dc, 2, 7 QUANT_AC quant_4x4, 2 QUANT_AC quant_8x8, 8 + +INIT_YMM avx2 +QUANT_AC quant_4x4, 1 +QUANT_AC quant_8x8, 4 +QUANT_DC quant_4x4_dc, 1, 6 + +INIT_YMM avx2 +cglobal quant_4x4x4, 3,3,6 + mova m2, [r1] + mova m3, [r2] + QUANT_ONE [r0+ 0], m2, m3, 0, 4 + QUANT_ONE [r0+32], m2, m3, 0, 5 + packssdw m4, m5 + QUANT_ONE [r0+64], m2, m3, 0, 5 + QUANT_ONE [r0+96], m2, m3, 0, 1 + packssdw m5, m1 + packssdw m4, m5 + pxor m3, m3 + pcmpeqd m4, m3 + movmskps eax, m4 + mov edx, eax + shr eax, 4 + and eax, edx + xor eax, 0xf + RET %endif ; !HIGH_BIT_DEPTH @@ -370,56 +520,81 @@ ; dequant ;============================================================================= -%macro DEQUANT16_L 3 +%macro DEQUANT16_L 4 ;;; %1 dct[y][x] ;;; %2,%3 dequant_mf[i_mf][y][x] ;;; m2 i_qbits - mova m0, %2 %if HIGH_BIT_DEPTH - pmaddwd m0, %1 - pslld m0, m2 + mova m0, %1 + mova m1, %4 + pmaddwd m0, %2 + pmaddwd m1, %3 + pslld m0, xm2 + pslld m1, xm2 + mova %1, m0 + mova %4, m1 %else + mova m0, %2 packssdw m0, %3 - pmullw m0, %1 - psllw m0, m2 +%if mmsize==32 + vpermq m0, m0, q3120 %endif + pmullw m0, %1 + psllw m0, xm2 mova %1, m0 +%endif %endmacro -%macro DEQUANT32_R 3 +%macro DEQUANT32_R 4 ;;; %1 dct[y][x] ;;; %2,%3 dequant_mf[i_mf][y][x] ;;; m2 -i_qbits ;;; m3 f ;;; m4 0 - mova m0, %1 %if HIGH_BIT_DEPTH + mova m0, %1 + mova m1, %4 pmadcswd m0, m0, %2, m3 - psrad m0, m2 + pmadcswd m1, m1, %3, m3 + psrad m0, xm2 + psrad m1, xm2 + mova %1, m0 + mova %4, m1 %else +%if mmsize == 32 + pmovzxwd m0, %1 + pmovzxwd m1, %4 +%else + mova m0, %1 punpckhwd m1, m0, m4 punpcklwd m0, m4 +%endif pmadcswd m0, m0, %2, m3 pmadcswd m1, m1, %3, m3 - psrad m0, m2 - psrad m1, m2 + psrad m0, xm2 + psrad m1, xm2 packssdw m0, m1 +%if mmsize == 32 + vpermq m0, m0, q3120 %endif mova %1, m0 +%endif %endmacro %macro DEQUANT_LOOP 3 -%if 8*(%2-2*%3) +%if 8*(%2-2*%3) > 0 mov t0d, 8*(%2-2*%3) %%loop: - %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3] - %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3] + %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3], [r0+(t0+ 4*%3)*SIZEOF_PIXEL] + %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3], [r0+(t0+12*%3)*SIZEOF_PIXEL] sub t0d, 16*%3 jge %%loop RET %else - %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3] - %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3] +%if mmsize < 32 + %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3], [r0+(12*%3)*SIZEOF_PIXEL] +%endif + %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3], [r0+( 4*%3)*SIZEOF_PIXEL] RET %endif %endmacro @@ -441,10 +616,8 @@ %endrep %endmacro -%if WIN64 +%if ARCH_X86_64 DECLARE_REG_TMP 6,3,2 -%elif ARCH_X86_64 - DECLARE_REG_TMP 4,3,2 %else DECLARE_REG_TMP 2,0,1 %endif @@ -453,8 +626,8 @@ movifnidn t2d, r2m imul t0d, t2d, 0x2b shr t0d, 8 ; i_qbits = i_qp / 6 - lea t1, [t0*3] - sub t2d, t1d + lea t1d, [t0*5] + sub t2d, t0d sub t2d, t1d ; i_mf = i_qp % 6 shl t2d, %1 %if ARCH_X86_64 @@ -476,19 +649,19 @@ DEQUANT_START %2+2, %2 .lshift: - movd m2, t0d + movd xm2, t0d DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3 .rshift32: neg t0d - movd m2, t0d mova m3, [pd_1] + movd xm2, t0d + pslld m3, xm2 pxor m4, m4 - pslld m3, m2 psrld m3, 1 DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3 -%if HIGH_BIT_DEPTH == 0 && notcpuflag(avx) +%if HIGH_BIT_DEPTH == 0 && (notcpuflag(avx) || mmsize == 32) cglobal 
dequant_%1x%1_flat16, 0,3 movifnidn t2d, r2m %if %1 == 8 @@ -498,8 +671,8 @@ %endif imul t0d, t2d, 0x2b shr t0d, 8 ; i_qbits = i_qp / 6 - lea t1, [t0*3] - sub t2d, t1d + lea t1d, [t0*5] + sub t2d, t0d sub t2d, t1d ; i_mf = i_qp % 6 shl t2d, %2 %ifdef PIC @@ -509,23 +682,41 @@ lea r1, [dequant%1_scale + t2] %endif movifnidn r0, r0mp - movd m4, t0d + movd xm4, t0d %if %1 == 4 %if mmsize == 8 DEQUANT16_FLAT [r1], 0, 16 DEQUANT16_FLAT [r1+8], 8, 24 -%else +%elif mmsize == 16 DEQUANT16_FLAT [r1], 0, 16 +%else + vbroadcasti128 m0, [r1] + psllw m0, xm4 + pmullw m0, [r0] + mova [r0], m0 %endif %elif mmsize == 8 DEQUANT16_FLAT [r1], 0, 8, 64, 72 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104 -%else +%elif mmsize == 16 DEQUANT16_FLAT [r1], 0, 64 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112 DEQUANT16_FLAT [r1+32], 32, 96 +%else + mova m1, [r1+ 0] + mova m2, [r1+32] + psllw m1, xm4 + psllw m2, xm4 + pmullw m0, m1, [r0+ 0] + pmullw m3, m2, [r0+32] + pmullw m4, m1, [r0+64] + pmullw m5, m2, [r0+96] + mova [r0+ 0], m0 + mova [r0+32], m3 + mova [r0+64], m4 + mova [r0+96], m5 %endif RET %endif ; !HIGH_BIT_DEPTH && !AVX @@ -533,11 +724,14 @@ %if HIGH_BIT_DEPTH INIT_XMM sse2 -DEQUANT 4, 4, 1 -DEQUANT 8, 6, 1 +DEQUANT 4, 4, 2 +DEQUANT 8, 6, 2 INIT_XMM xop -DEQUANT 4, 4, 1 -DEQUANT 8, 6, 1 +DEQUANT 4, 4, 2 +DEQUANT 8, 6, 2 +INIT_YMM avx2 +DEQUANT 4, 4, 4 +DEQUANT 8, 6, 4 %else %if ARCH_X86_64 == 0 INIT_MMX mmx @@ -553,6 +747,9 @@ INIT_XMM xop DEQUANT 4, 4, 2 DEQUANT 8, 6, 2 +INIT_YMM avx2 +DEQUANT 4, 4, 4 +DEQUANT 8, 6, 4 %endif %macro DEQUANT_DC 2 @@ -560,55 +757,62 @@ DEQUANT_START 6, 6 .lshift: - movd m3, [r1] - movd m2, t0d - pslld m3, m2 - SPLAT%1 m3, m3, 0 -%assign x 0 -%rep SIZEOF_PIXEL*16/mmsize - mova m0, [r0+mmsize*0+x] - mova m1, [r0+mmsize*1+x] - %2 m0, m3 - %2 m1, m3 - mova [r0+mmsize*0+x], m0 - mova [r0+mmsize*1+x], m1 -%assign x x+mmsize*2 +%if cpuflag(avx2) + vpbroadcastdct m3, [r1] +%else + movd xm3, [r1] + SPLAT%1 m3, xm3 +%endif + movd xm2, t0d + pslld m3, xm2 +%assign %%x 0 +%rep SIZEOF_PIXEL*32/mmsize + %2 m0, m3, [r0+%%x] + mova [r0+%%x], m0 +%assign %%x %%x+mmsize %endrep RET .rshift32: - neg t0d - movd m3, t0d - mova m4, [p%1_1] - mova m5, m4 - pslld m4, m3 - psrld m4, 1 - movd m2, [r1] -%assign x 0 + neg t0d +%if cpuflag(avx2) + vpbroadcastdct m2, [r1] +%else + movd xm2, [r1] +%endif + mova m5, [p%1_1] + movd xm3, t0d + pslld m4, m5, xm3 + psrld m4, 1 %if HIGH_BIT_DEPTH - pshufd m2, m2, 0 +%if notcpuflag(avx2) + pshufd m2, m2, 0 +%endif +%assign %%x 0 %rep SIZEOF_PIXEL*32/mmsize - mova m0, [r0+x] - pmadcswd m0, m0, m2, m4 - psrad m0, m3 - mova [r0+x], m0 -%assign x x+mmsize + pmadcswd m0, m2, [r0+%%x], m4 + psrad m0, xm3 + mova [r0+%%x], m0 +%assign %%x %%x+mmsize %endrep %else ; !HIGH_BIT_DEPTH +%if notcpuflag(avx2) PSHUFLW m2, m2, 0 +%endif punpcklwd m2, m4 +%assign %%x 0 %rep SIZEOF_PIXEL*32/mmsize - mova m0, [r0+x] + mova m0, [r0+%%x] punpckhwd m1, m0, m5 punpcklwd m0, m5 pmaddwd m0, m2 pmaddwd m1, m2 - psrad m0, m3 - psrad m1, m3 + psrad m0, xm3 + psrad m1, xm3 packssdw m0, m1 - mova [r0+x], m0 -%assign x x+mmsize + mova [r0+%%x], m0 +%assign %%x %%x+mmsize %endrep %endif ; !HIGH_BIT_DEPTH RET @@ -619,6 +823,8 @@ DEQUANT_DC d, pmaddwd INIT_XMM xop DEQUANT_DC d, pmaddwd +INIT_YMM avx2 +DEQUANT_DC d, pmaddwd %else %if ARCH_X86_64 == 0 INIT_MMX mmx2 @@ -628,6 +834,8 @@ DEQUANT_DC w, pmullw INIT_XMM avx DEQUANT_DC w, pmullw +INIT_YMM avx2 +DEQUANT_DC w, pmullw %endif ; t4 is eax for return value. 
@@ -757,31 +965,29 @@ ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size ) ;----------------------------------------------------------------------------- %macro DENOISE_DCT 0 -cglobal denoise_dct, 4,4,8 - pxor m6, m6 +cglobal denoise_dct, 4,4,6 + pxor m5, m5 movsxdifnidn r3, r3d .loop: mova m2, [r0+r3*4-2*mmsize] mova m3, [r0+r3*4-1*mmsize] ABSD m0, m2 ABSD m1, m3 - mova m4, m0 - mova m5, m1 + paddd m4, m0, [r1+r3*4-2*mmsize] psubd m0, [r2+r3*4-2*mmsize] + mova [r1+r3*4-2*mmsize], m4 + paddd m4, m1, [r1+r3*4-1*mmsize] psubd m1, [r2+r3*4-1*mmsize] - pcmpgtd m7, m0, m6 - pand m0, m7 - pcmpgtd m7, m1, m6 - pand m1, m7 + mova [r1+r3*4-1*mmsize], m4 + pcmpgtd m4, m0, m5 + pand m0, m4 + pcmpgtd m4, m1, m5 + pand m1, m4 PSIGND m0, m2 PSIGND m1, m3 mova [r0+r3*4-2*mmsize], m0 mova [r0+r3*4-1*mmsize], m1 - paddd m4, [r1+r3*4-2*mmsize] - paddd m5, [r1+r3*4-1*mmsize] - mova [r1+r3*4-2*mmsize], m4 - mova [r1+r3*4-1*mmsize], m5 - sub r3, mmsize/2 + sub r3d, mmsize/2 jg .loop RET %endmacro @@ -796,6 +1002,8 @@ DENOISE_DCT INIT_XMM avx DENOISE_DCT +INIT_YMM avx2 +DENOISE_DCT %else ; !HIGH_BIT_DEPTH @@ -845,6 +1053,27 @@ INIT_XMM avx DENOISE_DCT +INIT_YMM avx2 +cglobal denoise_dct, 4,4,4 + pxor m3, m3 + movsxdifnidn r3, r3d +.loop: + mova m1, [r0+r3*2-mmsize] + pabsw m0, m1 + psubusw m2, m0, [r2+r3*2-mmsize] + vpermq m0, m0, q3120 + psignw m2, m1 + mova [r0+r3*2-mmsize], m2 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m1, [r1+r3*4-2*mmsize] + paddd m0, [r1+r3*4-1*mmsize] + mova [r1+r3*4-2*mmsize], m1 + mova [r1+r3*4-1*mmsize], m0 + sub r3, mmsize/2 + jg .loop + RET + %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- @@ -854,56 +1083,55 @@ %macro DECIMATE_MASK 5 %if mmsize==16 %if HIGH_BIT_DEPTH - movdqa xmm0, [%3+ 0] - movdqa xmm1, [%3+32] - packssdw xmm0, [%3+16] - packssdw xmm1, [%3+48] - ABSW2 xmm0, xmm1, xmm0, xmm1, xmm3, xmm4 + movdqa m0, [%3+ 0] + movdqa m1, [%3+32] + packssdw m0, [%3+16] + packssdw m1, [%3+48] + ABSW2 m0, m1, m0, m1, m3, m4 %else - ABSW xmm0, [%3+ 0], xmm3 - ABSW xmm1, [%3+16], xmm4 + ABSW m0, [%3+ 0], m3 + ABSW m1, [%3+16], m4 %endif - packsswb xmm0, xmm1 - pxor xmm2, xmm2 - pcmpeqb xmm2, xmm0 - pcmpgtb xmm0, %4 - pmovmskb %1, xmm2 - pmovmskb %2, xmm0 - + packsswb m0, m1 + pxor m2, m2 + pcmpeqb m2, m0 + pcmpgtb m0, %4 + pmovmskb %1, m2 + pmovmskb %2, m0 %else ; mmsize==8 %if HIGH_BIT_DEPTH - movq mm0, [%3+ 0] - movq mm1, [%3+16] - movq mm2, [%3+32] - movq mm3, [%3+48] - packssdw mm0, [%3+ 8] - packssdw mm1, [%3+24] - packssdw mm2, [%3+40] - packssdw mm3, [%3+56] + movq m0, [%3+ 0] + movq m1, [%3+16] + movq m2, [%3+32] + movq m3, [%3+48] + packssdw m0, [%3+ 8] + packssdw m1, [%3+24] + packssdw m2, [%3+40] + packssdw m3, [%3+56] %else - movq mm0, [%3+ 0] - movq mm1, [%3+ 8] - movq mm2, [%3+16] - movq mm3, [%3+24] -%endif - ABSW2 mm0, mm1, mm0, mm1, mm6, mm7 - ABSW2 mm2, mm3, mm2, mm3, mm6, mm7 - packsswb mm0, mm1 - packsswb mm2, mm3 - pxor mm4, mm4 - pxor mm6, mm6 - pcmpeqb mm4, mm0 - pcmpeqb mm6, mm2 - pcmpgtb mm0, %4 - pcmpgtb mm2, %4 - pmovmskb %5, mm4 - pmovmskb %1, mm6 - shl %1, 8 - or %1, %5 - pmovmskb %5, mm0 - pmovmskb %2, mm2 - shl %2, 8 - or %2, %5 + movq m0, [%3+ 0] + movq m1, [%3+ 8] + movq m2, [%3+16] + movq m3, [%3+24] +%endif + ABSW2 m0, m1, m0, m1, m6, m7 + ABSW2 m2, m3, m2, m3, m6, m7 + packsswb m0, m1 + packsswb m2, m3 + pxor m4, m4 + pxor m6, m6 + pcmpeqb m4, m0 + pcmpeqb m6, m2 + pcmpgtb m0, %4 + pcmpgtb m2, %4 + pmovmskb %5, m4 + pmovmskb %1, m6 + shl %1, 8 + or %1, %5 + pmovmskb 
%5, m0 + pmovmskb %2, m2 + shl %2, 8 + or %2, %5 %endif %endmacro @@ -912,8 +1140,6 @@ %macro DECIMATE4x4 1 -;A LUT is faster than bsf on older AMD processors. -;This is not true for score64. cglobal decimate_score%1, 1,3 %ifdef PIC lea r4, [decimate_table4] @@ -932,7 +1158,6 @@ %if %1==15 shr edx, 1 %endif -%if cpuflag(slowctz) movzx ecx, dl movzx eax, byte [mask_table + rcx] cmp edx, ecx @@ -940,19 +1165,11 @@ bsr ecx, ecx shr edx, 1 shr edx, cl - bsf ecx, edx + tzcnt ecx, edx shr edx, 1 shr edx, cl add al, byte [table + rcx] add al, byte [mask_table + rdx] -%else -.loop: - tzcnt ecx, edx - shr edx, cl - add al, byte [table + rcx] - shr edx, 1 - jne .loop -%endif .ret: REP_RET .ret9: @@ -965,22 +1182,36 @@ INIT_MMX mmx2 DECIMATE4x4 15 DECIMATE4x4 16 -INIT_MMX mmx2, slowctz -DECIMATE4x4 15 -DECIMATE4x4 16 %endif INIT_XMM sse2 DECIMATE4x4 15 DECIMATE4x4 16 -INIT_XMM sse2, slowctz -DECIMATE4x4 15 -DECIMATE4x4 16 INIT_XMM ssse3 DECIMATE4x4 15 DECIMATE4x4 16 -INIT_XMM ssse3, slowctz -DECIMATE4x4 15 -DECIMATE4x4 16 + +; 2x gt1 output, 2x nz output, 1x mask +%macro DECIMATE_MASK64_AVX2 5 + pabsw m0, [r0+ 0] + pabsw m2, [r0+32] + pabsw m1, [r0+64] + pabsw m3, [r0+96] + packsswb m0, m2 + packsswb m1, m3 + pcmpgtb m2, m0, %5 ; the > 1 checks don't care about order, so + pcmpgtb m3, m1, %5 ; we can save latency by doing them here + pmovmskb %1, m2 + pmovmskb %2, m3 + or %1, %2 + jne .ret9 + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + pxor m4, m4 + pcmpeqb m0, m4 + pcmpeqb m1, m4 + pmovmskb %3, m0 + pmovmskb %4, m1 +%endmacro %macro DECIMATE8x8 0 @@ -993,33 +1224,44 @@ %define table decimate_table8 %endif mova m5, [pb_1] +%if mmsize==32 + DECIMATE_MASK64_AVX2 eax, r2d, r1d, r3d, m5 + shl r3, 32 + or r1, r3 + xor r1, -1 + je .ret +%else DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null - test eax, eax + test eax, eax jne .ret9 DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null - shl r2d, 16 - or r1d, r2d + shl r2d, 16 + or r1d, r2d DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null shl r2, 32 - or eax, r3d + or eax, r3d or r1, r2 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null shl r2, 48 or r1, r2 xor r1, -1 je .ret - add eax, r3d + add eax, r3d jne .ret9 +%endif + mov al, -6 .loop: tzcnt rcx, r1 shr r1, cl add al, byte [table + rcx] + jge .ret9 shr r1, 1 jne .loop + add al, 6 .ret: REP_RET .ret9: - mov eax, 9 + mov eax, 9 RET %else ; ARCH @@ -1029,6 +1271,13 @@ cglobal decimate_score64, 1,5 %endif mova m5, [pb_1] +%if mmsize==32 + DECIMATE_MASK64_AVX2 r0, r2, r3, r4, m5 + xor r3, -1 + je .tryret + xor r4, -1 +.cont: +%else DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5 test r2, r2 jne .ret9 @@ -1045,22 +1294,24 @@ xor r4, -1 .cont: add r0, r2 - jne .ret9 ;r0 is zero at this point, so we don't need to zero it + jne .ret9 +%endif + mov al, -6 .loop: tzcnt ecx, r3 test r3, r3 je .largerun shrd r3, r4, cl shr r4, cl - add r0b, byte [decimate_table8 + ecx] + add al, byte [decimate_table8 + ecx] + jge .ret9 shrd r3, r4, 1 shr r4, 1 - cmp r0, 6 ;score64's threshold is never higher than 6 - jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd test r3, r3 jne .loop test r4, r4 jne .loop + add al, 6 .ret: REP_RET .tryret: @@ -1077,6 +1328,7 @@ shr r3, cl shr r3, 1 jne .loop + add al, 6 RET %endif ; ARCH @@ -1090,6 +1342,8 @@ DECIMATE8x8 INIT_XMM ssse3 DECIMATE8x8 +INIT_YMM avx2 +DECIMATE8x8 ;----------------------------------------------------------------------------- ; int coeff_last( dctcoef *dct ) @@ -1281,38 +1535,38 @@ RET 
%if ARCH_X86_64 == 0 -cglobal coeff_last64, 1, 5-mmsize/16 +cglobal coeff_last64, 1, 4-mmsize/16 pxor m2, m2 - LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d - LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF* 48, r4d - shl r3d, 16 - or r2d, r3d - xor r2d, -1 + LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 32, r3d + LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 48, r3d + shl r2d, 16 + or r1d, r2d + xor r1d, -1 jne .secondhalf - LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r4d - LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*16, r4d - shl r3d, 16 - or r1d, r3d + LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r3d + LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16, r3d + shl r2d, 16 + or r1d, r2d not r1d BSR eax, r1d, 0x1f RET .secondhalf: - BSR eax, r2d, 0x1f + BSR eax, r1d, 0x1f add eax, 32 RET %else -cglobal coeff_last64, 1,4 +cglobal coeff_last64, 1,3 pxor m2, m2 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16 - LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*32 - LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48 shl r2d, 16 - shl r0d, 16 or r1d, r2d - or r3d, r0d - shl r3, 32 - or r1, r3 + LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*32 + LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48 + shl r0d, 16 + or r2d, r0d + shl r2, 32 + or r1, r2 not r1 BSR rax, r1, 0x3f RET @@ -1328,10 +1582,63 @@ INIT_XMM sse2, lzcnt COEFF_LAST +%macro LAST_MASK_AVX2 2 +%if HIGH_BIT_DEPTH + mova m0, [%2+ 0] + packssdw m0, [%2+32] + mova m1, [%2+64] + packssdw m1, [%2+96] + packsswb m0, m1 + mova m1, [deinterleave_shufd] + vpermd m0, m1, m0 +%else + mova m0, [%2+ 0] + packsswb m0, [%2+32] + vpermq m0, m0, q3120 +%endif + pcmpeqb m0, m2 + pmovmskb %1, m0 +%endmacro + +%if ARCH_X86_64 == 0 +INIT_YMM avx2,lzcnt +cglobal coeff_last64, 1,2 + pxor m2, m2 + LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32 + xor r1d, -1 + jne .secondhalf + LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0 + not r1d + BSR eax, r1d, 0x1f + RET +.secondhalf: + BSR eax, r1d, 0x1f + add eax, 32 + RET +%else +INIT_YMM avx2,lzcnt +cglobal coeff_last64, 1,3 + pxor m2, m2 + LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0 + LAST_MASK_AVX2 r2d, r0+SIZEOF_DCTCOEF*32 + shl r2, 32 + or r1, r2 + not r1 + BSR rax, r1, 0x3f + RET +%endif + ;----------------------------------------------------------------------------- ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel ) ;----------------------------------------------------------------------------- +struc levelrun + .last: resd 1 + .mask: resd 1 + align 16, resb 1 + .level: resw 16 +endstruc + ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args %if WIN64 DECLARE_REG_TMP 3,1,2,0,4,5,6 @@ -1346,6 +1653,7 @@ movifnidn t0, r0mp movifnidn t1, r1mp pxor m2, m2 + xor t3d, t3d LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d %if %1==15 shr t5d, 1 @@ -1355,7 +1663,7 @@ and t5d, 0xf %endif xor t5d, (1<<%1)-1 - mov [t1+4], t5d + mov [t1+levelrun.mask], t5d shl t5d, 32-%1 mov t4d, %1-1 LZCOUNT t3d, t5d, 0x1f @@ -1363,7 +1671,7 @@ add t5d, t5d sub t4d, t3d shl t5d, t3b - mov [t1], t4d + mov [t1+levelrun.last], t4d .loop: LZCOUNT t3d, t5d, 0x1f %if HIGH_BIT_DEPTH @@ -1374,9 +1682,9 @@ inc t3d shl t5d, t3b %if HIGH_BIT_DEPTH - mov [t1+t6*4+ 8], t2d + mov [t1+t6*4+levelrun.level], t2d %else - mov [t1+t6*2+ 8], t2w + mov [t1+t6*2+levelrun.level], t2w %endif inc t6d sub t4d, t3d @@ -1406,3 +1714,133 @@ INIT_MMX mmx2, lzcnt COEFF_LEVELRUN 4 COEFF_LEVELRUN 8 + +; Similar to the one above, but saves the DCT +; coefficients in m0/m1 so we don't have to load +; them later. 
+%macro LAST_MASK_LUT 3 + pxor xm5, xm5 +%if %1 <= 8 + mova m0, [%3] + packsswb m2, m0, m0 +%else + mova xm0, [%3+ 0] + mova xm1, [%3+16] + packsswb xm2, xm0, xm1 +%if mmsize==32 + vinserti128 m0, m0, xm1, 1 +%endif +%endif + pcmpeqb xm2, xm5 + pmovmskb %2, xm2 +%endmacro + +%macro COEFF_LEVELRUN_LUT 1 +cglobal coeff_level_run%1,2,4+(%1/9) +%ifdef PIC + lea r5, [$$] + %define GLOBAL +r5-$$ +%else + %define GLOBAL +%endif + LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF +%if %1==15 + shr eax, 1 +%elif %1==8 + and eax, 0xff +%elif %1==4 + and eax, 0xf +%endif + xor eax, (1<<%1)-1 + mov [r1+levelrun.mask], eax +%if %1==15 + add eax, eax +%endif +%if %1 > 8 +%if ARCH_X86_64 + mov r4d, eax + shr r4d, 8 +%else + movzx r4d, ah ; first 8 bits +%endif +%endif + movzx r2d, al ; second 8 bits + shl eax, 32-%1-(%1&1) + LZCOUNT eax, eax, 0x1f + mov r3d, %1-1 + sub r3d, eax + mov [r1+levelrun.last], r3d +; Here we abuse pshufb, combined with a lookup table, to do a gather +; operation based on a bitmask. For example: +; +; dct 15-8 (input): 0 0 4 0 0 -2 1 0 +; dct 7-0 (input): 0 0 -1 0 0 0 0 15 +; bitmask 1: 0 0 1 0 0 1 1 0 +; bitmask 2: 0 0 1 0 0 0 0 1 +; gather 15-8: 4 -2 1 __ __ __ __ __ +; gather 7-0: -1 15 __ __ __ __ __ __ +; levels (output): 4 -2 1 -1 15 __ __ __ __ __ __ __ __ __ __ __ +; +; The overlapping, dependent stores almost surely cause a mess of +; forwarding issues, but it's still enormously faster. +%if %1 > 8 + movzx eax, byte [popcnt_table+r4 GLOBAL] + movzx r3d, byte [popcnt_table+r2 GLOBAL] +%if mmsize==16 + movh m3, [dct_coef_shuffle+r4*8 GLOBAL] + movh m2, [dct_coef_shuffle+r2*8 GLOBAL] + mova m4, [pw_256] +; Storing 8 bytes of shuffle constant and converting it (unpack + or) +; is neutral to slightly faster in local speed measurements, but it +; cuts the table size in half, which is surely a big cache win. + punpcklbw m3, m3 + punpcklbw m2, m2 + por m3, m4 + por m2, m4 + pshufb m1, m3 + pshufb m0, m2 + mova [r1+levelrun.level], m1 +; This obnoxious unaligned store messes with store forwarding and +; stalls the CPU to no end, but merging the two registers before +; storing requires a variable 128-bit shift. Emulating this does +; work, but requires a lot of ops and the gain is tiny and +; inconsistent, so we'll err on the side of fewer instructions. + movu [r1+rax*2+levelrun.level], m0 +%else ; mmsize==32 + movq xm2, [dct_coef_shuffle+r4*8 GLOBAL] + vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1 + punpcklbw m2, m2 + por m2, [pw_256] + pshufb m0, m2 + vextracti128 [r1+levelrun.level], m0, 1 + movu [r1+rax*2+levelrun.level], xm0 +%endif + add eax, r3d +%else + movzx eax, byte [popcnt_table+r2 GLOBAL] + movh m1, [dct_coef_shuffle+r2*8 GLOBAL] + punpcklbw m1, m1 + por m1, [pw_256] + pshufb m0, m1 + mova [r1+levelrun.level], m0 +%endif + RET +%endmacro + +%if HIGH_BIT_DEPTH==0 +INIT_MMX ssse3 +COEFF_LEVELRUN_LUT 4 +INIT_XMM ssse3 +COEFF_LEVELRUN_LUT 8 +COEFF_LEVELRUN_LUT 15 +COEFF_LEVELRUN_LUT 16 +INIT_MMX ssse3, lzcnt +COEFF_LEVELRUN_LUT 4 +INIT_XMM ssse3, lzcnt +COEFF_LEVELRUN_LUT 8 +COEFF_LEVELRUN_LUT 15 +COEFF_LEVELRUN_LUT 16 +INIT_XMM avx2, lzcnt +COEFF_LEVELRUN_LUT 15 +COEFF_LEVELRUN_LUT 16 +%endif
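Reviewer note: the pshufb/lookup-table gather above is dense, so here is a rough scalar model of what coeff_level_run appears to compute, using the field names from the levelrun struc. The struct layout, the highest-index-first level order and the return value are read off the assembly and should be treated as assumptions, not a documented contract.

#include <stdint.h>

typedef struct {
    int      last;      /* index of the last (highest) nonzero coefficient */
    uint32_t mask;      /* bit i set when dct[i] != 0                      */
    int16_t  level[16]; /* nonzero levels, gathered highest index first    */
} levelrun_ref_t;       /* hypothetical mirror of the levelrun struc       */

static int coeff_level_run_ref( const int16_t *dct, int n, levelrun_ref_t *rl )
{
    int total = 0;
    rl->mask = 0;
    rl->last = 0;
    for( int i = n-1; i >= 0; i-- )
        if( dct[i] )
        {
            if( !total )
                rl->last = i;            /* first hit while scanning backwards */
            rl->mask |= 1u << i;
            rl->level[total++] = dct[i]; /* the gather step the pshufb trick vectorizes */
        }
    return total;                        /* number of nonzero levels */
}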
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/quant.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/quant.h
Changed
@@ -31,19 +31,27 @@ int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias ); int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_mmx( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_sse2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_sse2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_ssse3( dctcoef dct[4], int mf, int bias ); int x264_quant_4x4_dc_ssse3( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_ssse3( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_ssse3( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_ssse3( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_sse4( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_dc_sse4( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_sse4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); +int x264_quant_4x4_avx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4_dc_avx2( dctcoef dct[16], int mf, int bias ); +int x264_quant_8x8_avx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); +int x264_quant_4x4x4_avx2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); @@ -56,10 +64,15 @@ void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf ); int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf ); int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf ); @@ -68,21 +81,17 @@ void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_avx ( dctcoef 
*dct, uint32_t *sum, udctcoef *offset, int size ); +void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); int x264_decimate_score15_mmx2( dctcoef *dct ); int x264_decimate_score15_sse2( dctcoef *dct ); int x264_decimate_score15_ssse3( dctcoef *dct ); int x264_decimate_score16_mmx2( dctcoef *dct ); int x264_decimate_score16_sse2( dctcoef *dct ); int x264_decimate_score16_ssse3( dctcoef *dct ); -int x264_decimate_score15_mmx2_slowctz( dctcoef *dct ); -int x264_decimate_score15_sse2_slowctz( dctcoef *dct ); -int x264_decimate_score15_ssse3_slowctz( dctcoef *dct ); -int x264_decimate_score16_mmx2_slowctz( dctcoef *dct ); -int x264_decimate_score16_sse2_slowctz( dctcoef *dct ); -int x264_decimate_score16_ssse3_slowctz( dctcoef *dct ); int x264_decimate_score64_mmx2( dctcoef *dct ); int x264_decimate_score64_sse2( dctcoef *dct ); int x264_decimate_score64_ssse3( dctcoef *dct ); +int x264_decimate_score64_avx2( int16_t *dct ); int x264_coeff_last4_mmx2( dctcoef *dct ); int x264_coeff_last8_mmx2( dctcoef *dct ); int x264_coeff_last15_mmx2( dctcoef *dct ); @@ -98,18 +107,29 @@ int x264_coeff_last15_sse2_lzcnt( dctcoef *dct ); int x264_coeff_last16_sse2_lzcnt( dctcoef *dct ); int x264_coeff_last64_sse2_lzcnt( dctcoef *dct ); +int x264_coeff_last64_avx2_lzcnt( dctcoef *dct ); int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac ); int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac ); int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced );
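For context on the new quant_4x4x4 entry points declared above: they quantize four 4x4 blocks in one call, and judging from the pmovmskb/not/and 0xf sequence in the assembly they return a 4-bit mask with bit b set when block b keeps at least one nonzero level. A hedged scalar sketch of that contract follows; the formula mirrors the existing 8-bit quant_4x4 path, and the return-value semantics are an inference, not documentation.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical reference for quant_4x4x4: quantize dct[0..3] in place and
 * return a per-block nonzero mask. */
static int quant_4x4x4_ref( int16_t dct[4][16], const uint16_t mf[16],
                            const uint16_t bias[16] )
{
    int nz_mask = 0;
    for( int b = 0; b < 4; b++ )
        for( int i = 0; i < 16; i++ )
        {
            int coef  = dct[b][i];
            int level = (int)(( (uint64_t)(abs(coef) + bias[i]) * mf[i] ) >> 16);
            dct[b][i] = coef < 0 ? -level : level;
            if( level )
                nz_mask |= 1 << b;       /* bit b: block b has a nonzero level */
        }
    return nz_mask;
}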
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/sad-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/sad-a.asm
Changed
@@ -29,6 +29,12 @@ %include "x86inc.asm" %include "x86util.asm" +SECTION_RODATA 32 + +pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1 +deinterleave_sadx4: dd 0,4,2,6 +hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11 + SECTION .text cextern pb_3 @@ -556,6 +562,65 @@ INIT_MMX ssse3 INTRA_SAD_8x8C +INIT_YMM avx2 +cglobal intra_sad_x3_8x8c, 3,3,7 + vpbroadcastq m2, [r1 - FDEC_STRIDE] ; V pred + add r1, FDEC_STRIDE*4-1 + pxor xm5, xm5 + punpckldq xm3, xm2, xm5 ; V0 _ V1 _ + movd xm0, [r1 + FDEC_STRIDE*-1 - 3] + movd xm1, [r1 + FDEC_STRIDE* 3 - 3] + pinsrb xm0, [r1 + FDEC_STRIDE*-4], 0 + pinsrb xm1, [r1 + FDEC_STRIDE* 0], 0 + pinsrb xm0, [r1 + FDEC_STRIDE*-3], 1 + pinsrb xm1, [r1 + FDEC_STRIDE* 1], 1 + pinsrb xm0, [r1 + FDEC_STRIDE*-2], 2 + pinsrb xm1, [r1 + FDEC_STRIDE* 2], 2 + punpcklqdq xm0, xm1 ; H0 _ H1 _ + vinserti128 m3, m3, xm0, 1 ; V0 V1 H0 H1 + pshufb xm0, [hpred_shuf] ; H00224466 H11335577 + psadbw m3, m5 ; s0 s1 s2 s3 + vpermq m4, m3, q3312 ; s2 s1 s3 s3 + vpermq m3, m3, q1310 ; s0 s1 s3 s1 + paddw m3, m4 + psrlw m3, 2 + pavgw m3, m5 ; s0+s2 s1 s3 s1+s3 + pshufb m3, [pb_shuf8x8c2] ; DC0 _ DC1 _ + vpblendd m3, m3, m2, 11001100b ; DC0 V DC1 V + vinserti128 m1, m3, xm3, 1 ; DC0 V DC0 V + vperm2i128 m6, m3, m3, q0101 ; DC1 V DC1 V + vpermq m0, m0, q3120 ; H00224466 _ H11335577 _ + movddup m2, [r0+FENC_STRIDE*0] + movddup m4, [r0+FENC_STRIDE*2] + pshuflw m3, m0, q0000 + psadbw m3, m2 + psadbw m2, m1 + pshuflw m5, m0, q1111 + psadbw m5, m4 + psadbw m4, m1 + paddw m2, m4 + paddw m3, m5 + movddup m4, [r0+FENC_STRIDE*4] + pshuflw m5, m0, q2222 + psadbw m5, m4 + psadbw m4, m6 + paddw m2, m4 + paddw m3, m5 + movddup m4, [r0+FENC_STRIDE*6] + pshuflw m5, m0, q3333 + psadbw m5, m4 + psadbw m4, m6 + paddw m2, m4 + paddw m3, m5 + vextracti128 xm0, m2, 1 + vextracti128 xm1, m3, 1 + paddw xm2, xm0 ; DC V + paddw xm3, xm1 ; H + pextrd [r2+8], xm2, 2 ; V + movd [r2+4], xm3 ; H + movd [r2+0], xm2 ; DC + RET + ;----------------------------------------------------------------------------- ; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] ); @@ -648,7 +713,50 @@ INIT_XMM ssse3 INTRA_SAD16 - +INIT_YMM avx2 +cglobal intra_sad_x3_16x16, 3,5,6 + pxor xm0, xm0 + psadbw xm0, [r1-FDEC_STRIDE] + movhlps xm1, xm0 + paddw xm0, xm1 + movd r3d, xm0 +%assign x 0 +%rep 16 + movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)] +%if (x&3)==3 && x!=15 + add r1, FDEC_STRIDE*4 +%endif + add r3d, r4d +%assign x x+1 +%endrep + sub r1, FDEC_STRIDE*12 + add r3d, 16 + shr r3d, 5 + movd xm5, r3d + vpbroadcastb xm5, xm5 + vinserti128 m5, m5, [r1-FDEC_STRIDE], 1 ; m5 contains DC and V prediction + + pxor m4, m4 ; DC / V accumulator + pxor xm3, xm3 ; H accumulator + mov r3d, 15*FENC_STRIDE +.vloop: + vpbroadcastb xm2, [r1+r3*2-1] + vbroadcasti128 m0, [r0+r3] + psadbw m1, m0, m5 + psadbw xm0, xm2 + paddw m4, m1 + paddw xm3, xm0 + add r3d, -FENC_STRIDE + jge .vloop + punpckhqdq m5, m4, m4 + movhlps xm2, xm3 + paddw m4, m5 ; DC / V + paddw xm3, xm2 ; H + vextracti128 xm2, m4, 1 + movd [r2+0], xm2 + movd [r2+4], xm3 + movd [r2+8], xm4 + RET ;============================================================================= ; SAD x3/x4 MMX @@ -944,17 +1052,27 @@ %endif %endmacro -%macro SAD_X3_2x16P_SSE2 1 -%if %1 +%macro SAD_X3_4x16P_SSE2 2 +%if %1==0 +%if UNIX64 + mov r6, r5 +%endif + lea r5, [r4*3] SAD_X3_START_1x16P_SSE2 %else - SAD_X3_1x16P_SSE2 0, 0 + SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0 +%endif + SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1 + SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2 + 
SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r5 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] %endif - SAD_X3_1x16P_SSE2 FENC_STRIDE, r4 - add r0, 2*FENC_STRIDE - lea r1, [r1+2*r4] - lea r2, [r2+2*r4] - lea r3, [r3+2*r4] %endmacro %macro SAD_X3_START_2x8P_SSE2 0 @@ -971,15 +1089,15 @@ psadbw xmm2, xmm7 %endmacro -%macro SAD_X3_2x8P_SSE2 0 - movq xmm7, [r0] - movq xmm3, [r1] - movq xmm4, [r2] - movq xmm5, [r3] - movhps xmm7, [r0+FENC_STRIDE] - movhps xmm3, [r1+r4] - movhps xmm4, [r2+r4] - movhps xmm5, [r3+r4] +%macro SAD_X3_2x8P_SSE2 4 + movq xmm7, [r0+%1] + movq xmm3, [r1+%2] + movq xmm4, [r2+%2] + movq xmm5, [r3+%2] + movhps xmm7, [r0+%3] + movhps xmm3, [r1+%4] + movhps xmm4, [r2+%4] + movhps xmm5, [r3+%4] psadbw xmm3, xmm7 psadbw xmm4, xmm7 psadbw xmm5, xmm7 @@ -1005,18 +1123,18 @@ psadbw xmm3, xmm7 %endmacro -%macro SAD_X4_2x8P_SSE2 0 - movq xmm7, [r0] - movq xmm4, [r1] - movq xmm5, [r2] +%macro SAD_X4_2x8P_SSE2 4 + movq xmm7, [r0+%1] + movq xmm4, [r1+%2] + movq xmm5, [r2+%2] %if ARCH_X86_64 - movq xmm6, [r3] - movq xmm8, [r4] - movhps xmm7, [r0+FENC_STRIDE] - movhps xmm4, [r1+r5] - movhps xmm5, [r2+r5] - movhps xmm6, [r3+r5] - movhps xmm8, [r4+r5] + movq xmm6, [r3+%2] + movq xmm8, [r4+%2] + movhps xmm7, [r0+%3] + movhps xmm4, [r1+%4] + movhps xmm5, [r2+%4] + movhps xmm6, [r3+%4] + movhps xmm8, [r4+%4] psadbw xmm4, xmm7 psadbw xmm5, xmm7 psadbw xmm6, xmm7 @@ -1026,17 +1144,17 @@ paddw xmm2, xmm6 paddw xmm3, xmm8 %else - movhps xmm7, [r0+FENC_STRIDE] - movhps xmm4, [r1+r5] - movhps xmm5, [r2+r5] + movhps xmm7, [r0+%3] + movhps xmm4, [r1+%4] + movhps xmm5, [r2+%4] psadbw xmm4, xmm7 psadbw xmm5, xmm7 paddw xmm0, xmm4 paddw xmm1, xmm5 - movq xmm6, [r3] - movq xmm4, [r4] - movhps xmm6, [r3+r5] - movhps xmm4, [r4+r5] + movq xmm6, [r3+%2] + movq xmm4, [r4+%2] + movhps xmm6, [r3+%4] + movhps xmm4, [r4+%4] psadbw xmm6, xmm7 psadbw xmm4, xmm7 paddw xmm2, xmm6 @@ -1110,43 +1228,65 @@ %endif %endmacro -%macro SAD_X4_2x16P_SSE2 1 -%if %1 +%macro SAD_X4_4x16P_SSE2 2 +%if %1==0 + lea r6, [r5*3] SAD_X4_START_1x16P_SSE2 %else - SAD_X4_1x16P_SSE2 0, 0 + SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0 +%endif + SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1 + SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2 + SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] %endif - SAD_X4_1x16P_SSE2 FENC_STRIDE, r5 - add r0, 2*FENC_STRIDE - lea r1, [r1+2*r5] - lea r2, [r2+2*r5] - lea r3, [r3+2*r5] - lea r4, [r4+2*r5] %endmacro -%macro SAD_X3_2x8P_SSE2 1 -%if %1 +%macro SAD_X3_4x8P_SSE2 2 +%if %1==0 +%if UNIX64 + mov r6, r5 +%endif + lea r5, [r4*3] SAD_X3_START_2x8P_SSE2 %else - SAD_X3_2x8P_SSE2 + SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1 +%endif + SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), r5 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] %endif - add r0, 2*FENC_STRIDE - lea r1, [r1+2*r4] - lea r2, [r2+2*r4] - lea r3, [r3+2*r4] %endmacro -%macro SAD_X4_2x8P_SSE2 1 -%if %1 +%macro SAD_X4_4x8P_SSE2 2 +%if %1==0 + lea r6, [r5*3] SAD_X4_START_2x8P_SSE2 %else - SAD_X4_2x8P_SSE2 + SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 +%endif + SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 +%if %1 != %2-1 
+%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] %endif - add r0, 2*FENC_STRIDE - lea r1, [r1+2*r5] - lea r2, [r2+2*r5] - lea r3, [r3+2*r5] - lea r4, [r4+2*r5] %endmacro %macro SAD_X3_END_SSE2 0 @@ -1157,9 +1297,9 @@ paddw xmm1, xmm5 paddw xmm2, xmm6 %if UNIX64 - movd [r5+0], xmm0 - movd [r5+4], xmm1 - movd [r5+8], xmm2 + movd [r6+0], xmm0 + movd [r6+4], xmm1 + movd [r6+8], xmm2 %else mov r0, r5mp movd [r0+0], xmm0 @@ -1184,15 +1324,230 @@ RET %endmacro +%macro SAD_X4_START_2x8P_SSSE3 0 + movddup xmm4, [r0] + movq xmm0, [r1] + movq xmm1, [r3] + movhps xmm0, [r2] + movhps xmm1, [r4] + movddup xmm5, [r0+FENC_STRIDE] + movq xmm2, [r1+r5] + movq xmm3, [r3+r5] + movhps xmm2, [r2+r5] + movhps xmm3, [r4+r5] + psadbw xmm0, xmm4 + psadbw xmm1, xmm4 + psadbw xmm2, xmm5 + psadbw xmm3, xmm5 + paddw xmm0, xmm2 + paddw xmm1, xmm3 +%endmacro + +%macro SAD_X4_2x8P_SSSE3 4 + movddup xmm6, [r0+%1] + movq xmm2, [r1+%2] + movq xmm3, [r3+%2] + movhps xmm2, [r2+%2] + movhps xmm3, [r4+%2] + movddup xmm7, [r0+%3] + movq xmm4, [r1+%4] + movq xmm5, [r3+%4] + movhps xmm4, [r2+%4] + movhps xmm5, [r4+%4] + psadbw xmm2, xmm6 + psadbw xmm3, xmm6 + psadbw xmm4, xmm7 + psadbw xmm5, xmm7 + paddw xmm0, xmm2 + paddw xmm1, xmm3 + paddw xmm0, xmm4 + paddw xmm1, xmm5 +%endmacro + +%macro SAD_X4_4x8P_SSSE3 2 +%if %1==0 + lea r6, [r5*3] + SAD_X4_START_2x8P_SSSE3 +%else + SAD_X4_2x8P_SSSE3 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 +%endif + SAD_X4_2x8P_SSSE3 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] +%endif +%endmacro + +%macro SAD_X4_END_SSSE3 0 + mov r0, r6mp + packssdw xmm0, xmm1 + movdqa [r0], xmm0 + RET +%endmacro + +%macro SAD_X3_START_2x16P_AVX2 0 + movu m3, [r0] ; assumes FENC_STRIDE == 16 + movu xm0, [r1] + movu xm1, [r2] + movu xm2, [r3] + vinserti128 m0, m0, [r1+r4], 1 + vinserti128 m1, m1, [r2+r4], 1 + vinserti128 m2, m2, [r3+r4], 1 + psadbw m0, m3 + psadbw m1, m3 + psadbw m2, m3 +%endmacro + +%macro SAD_X3_2x16P_AVX2 3 + movu m3, [r0+%1] ; assumes FENC_STRIDE == 16 + movu xm4, [r1+%2] + movu xm5, [r2+%2] + movu xm6, [r3+%2] + vinserti128 m4, m4, [r1+%3], 1 + vinserti128 m5, m5, [r2+%3], 1 + vinserti128 m6, m6, [r3+%3], 1 + psadbw m4, m3 + psadbw m5, m3 + psadbw m6, m3 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 +%endmacro + +%macro SAD_X3_4x16P_AVX2 2 +%if %1==0 +%if UNIX64 + mov r6, r5 +%endif + lea r5, [r4*3] + SAD_X3_START_2x16P_AVX2 +%else + SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1 +%endif + SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, r5 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] +%endif +%endmacro + +%macro SAD_X4_START_2x16P_AVX2 0 + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0+FENC_STRIDE] + movu xm0, [r1] + movu xm1, [r3] + movu xm2, [r1+r5] + movu xm3, [r3+r5] + vinserti128 m0, m0, [r2], 1 + vinserti128 m1, m1, [r4], 1 + vinserti128 m2, m2, [r2+r5], 1 + vinserti128 m3, m3, [r4+r5], 1 + psadbw m0, m4 + psadbw m1, m4 + psadbw m2, m5 + psadbw m3, m5 + paddw m0, m2 + paddw m1, m3 +%endmacro + +%macro SAD_X4_2x16P_AVX2 4 + vbroadcasti128 m6, [r0+%1] + vbroadcasti128 m7, [r0+%3] + movu xm2, [r1+%2] + movu xm3, [r3+%2] + movu xm4, [r1+%4] + movu xm5, [r3+%4] + vinserti128 m2, m2, [r2+%2], 1 + vinserti128 m3, m3, [r4+%2], 1 + vinserti128 m4, m4, 
[r2+%4], 1 + vinserti128 m5, m5, [r4+%4], 1 + psadbw m2, m6 + psadbw m3, m6 + psadbw m4, m7 + psadbw m5, m7 + paddw m0, m2 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 +%endmacro + +%macro SAD_X4_4x16P_AVX2 2 +%if %1==0 + lea r6, [r5*3] + SAD_X4_START_2x16P_AVX2 +%else + SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 +%endif + SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] +%endif +%endmacro + +%macro SAD_X3_END_AVX2 0 + vextracti128 xm4, m0, 1 + vextracti128 xm5, m1, 1 + vextracti128 xm6, m2, 1 + paddw xm0, xm4 + paddw xm1, xm5 + paddw xm2, xm6 + movhlps xm4, xm0 + movhlps xm5, xm1 + movhlps xm6, xm2 + paddw xm0, xm4 + paddw xm1, xm5 + paddw xm2, xm6 +%if UNIX64 + movd [r6+0], xm0 + movd [r6+4], xm1 + movd [r6+8], xm2 +%else + mov r0, r5mp + movd [r0+0], xm0 + movd [r0+4], xm1 + movd [r0+8], xm2 +%endif + RET +%endmacro + +%macro SAD_X4_END_AVX2 0 + mov r0, r6mp + punpckhqdq m2, m0, m0 + punpckhqdq m3, m1, m1 + paddw m0, m2 + paddw m1, m3 + packssdw m0, m1 + mova xm2, [deinterleave_sadx4] + vpermd m0, m2, m0 + mova [r0], xm0 + RET +%endmacro + ;----------------------------------------------------------------------------- ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, ; uint8_t *pix2, intptr_t i_stride, int scores[3] ) ;----------------------------------------------------------------------------- %macro SAD_X_SSE2 3 -cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9 - SAD_X%1_2x%2P_SSE2 1 -%rep %3/2-1 - SAD_X%1_2x%2P_SSE2 0 +cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,9 +%assign x 0 +%rep %3/4 + SAD_X%1_4x%2P_SSE2 x, %3/4 +%assign x x+1 %endrep SAD_X%1_END_SSE2 %endmacro @@ -1221,7 +1576,36 @@ SAD_X_SSE2 4, 16, 16 SAD_X_SSE2 4, 16, 8 +%macro SAD_X_SSSE3 3 +cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8 +%assign x 0 +%rep %3/4 + SAD_X%1_4x%2P_SSSE3 x, %3/4 +%assign x x+1 +%endrep + SAD_X%1_END_SSSE3 +%endmacro + +INIT_XMM ssse3 +SAD_X_SSSE3 4, 8, 16 +SAD_X_SSSE3 4, 8, 8 +SAD_X_SSSE3 4, 8, 4 + +%macro SAD_X_AVX2 4 +cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4 +%assign x 0 +%rep %3/4 + SAD_X%1_4x%2P_AVX2 x, %3/4 +%assign x x+1 +%endrep + SAD_X%1_END_AVX2 +%endmacro +INIT_YMM avx2 +SAD_X_AVX2 3, 16, 16, 7 +SAD_X_AVX2 3, 16, 8, 7 +SAD_X_AVX2 4, 16, 16, 8 +SAD_X_AVX2 4, 16, 8, 8 ;============================================================================= ; SAD cacheline split @@ -1410,12 +1794,12 @@ .split: %if ARCH_X86_64 PROLOGUE 6,9 + push r3 + push r2 %if WIN64 movsxd r4, r4d - sub rsp, 8 + sub rsp, 40 ; shadow space and alignment %endif - push r3 - push r2 mov r2, r1 mov r1, FENC_STRIDE mov r3, r4 @@ -1424,7 +1808,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8], eax %if WIN64 - mov r2, [rsp] + mov r2, [rsp+40+0*8] %else pop r2 %endif @@ -1432,7 +1816,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8+4], eax %if WIN64 - mov r2, [rsp+8] + mov r2, [rsp+40+1*8] %else pop r2 %endif @@ -1440,7 +1824,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8+8], eax %if WIN64 - add rsp, 24 + add rsp, 40+2*8 %endif RET %else @@ -1480,6 +1864,9 @@ push r4 push r3 push r2 +%if WIN64 + sub rsp, 32 ; shadow space +%endif mov r2, r1 mov r1, FENC_STRIDE mov r3, r5 @@ -1487,7 +1874,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8], eax %if WIN64 - mov r2, [rsp] + mov r2, [rsp+32+0*8] %else pop r2 %endif @@ -1495,7 +1882,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8+4], eax %if WIN64 - mov r2, [rsp+8] + mov r2, 
[rsp+32+1*8] %else pop r2 %endif @@ -1503,7 +1890,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8+8], eax %if WIN64 - mov r2, [rsp+16] + mov r2, [rsp+32+2*8] %else pop r2 %endif @@ -1511,7 +1898,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8+12], eax %if WIN64 - add rsp, 24 + add rsp, 32+3*8 %endif RET %else
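The SAD_X3/X4 rework above mostly changes addressing (four rows per iteration, one pointer update per group) without changing what the functions compute. As a reminder of the contract that the SSE2/SSSE3/AVX2 variants all implement, here is a plain-C model of pixel_sad_x4; FENC_STRIDE is x264's fixed 16-byte encode-plane stride, and the rest of the argument order follows the cglobal declarations.

#include <stdint.h>
#include <stdlib.h>

#define FENC_STRIDE 16

/* Scalar model: SAD of one encode block against four candidate blocks that
 * share a stride; results land in scores[0..3]. */
static void pixel_sad_x4_ref( int w, int h, const uint8_t *fenc,
                              const uint8_t *pix[4], intptr_t stride,
                              int scores[4] )
{
    for( int k = 0; k < 4; k++ )
    {
        int sad = 0;
        for( int y = 0; y < h; y++ )
            for( int x = 0; x < w; x++ )
                sad += abs( fenc[y*FENC_STRIDE + x] - pix[k][y*stride + x] );
        scores[k] = sad;
    }
}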
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/sad16-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/sad16-a.asm
Changed
@@ -4,6 +4,7 @@ ;* Copyright (C) 2010-2013 x264 project ;* ;* Authors: Oskar Arvidsson <oskar@irock.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -90,11 +91,18 @@ ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SAD_MMX 3 -cglobal pixel_sad_%1x%2, 4,4 +cglobal pixel_sad_%1x%2, 4,5-(%2&4/4) pxor m0, m0 -%rep %2/%3 +%if %2 == 4 SAD_INC_%3x%1P_MMX -%endrep + SAD_INC_%3x%1P_MMX +%else + mov r4d, %2/%3 +.loop: + SAD_INC_%3x%1P_MMX + dec r4d + jg .loop +%endif %if %1*%2 == 256 HADDUW m0, m1 %else @@ -120,7 +128,8 @@ ; SAD XMM ;============================================================================= -%macro SAD_INC_2x16P_XMM 0 +%macro SAD_INC_2ROW 1 +%if 2*%1 > mmsize movu m1, [r2+ 0] movu m2, [r2+16] movu m3, [r2+2*r3+ 0] @@ -137,9 +146,7 @@ paddw m3, m4 paddw m0, m1 paddw m0, m3 -%endmacro - -%macro SAD_INC_2x8P_XMM 0 +%else movu m1, [r2] movu m2, [r2+2*r3] psubw m1, [r0] @@ -149,44 +156,55 @@ lea r2, [r2+4*r3] paddw m0, m1 paddw m0, m2 +%endif %endmacro ;----------------------------------------------------------------------------- ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- -%macro SAD_XMM 2 -cglobal pixel_sad_%1x%2, 4,4,8 +%macro SAD 2 +cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize) pxor m0, m0 -%rep %2/2 - SAD_INC_2x%1P_XMM -%endrep +%if %2 == 4 + SAD_INC_2ROW %1 + SAD_INC_2ROW %1 +%else + mov r4d, %2/2 +.loop: + SAD_INC_2ROW %1 + dec r4d + jg .loop +%endif HADDW m0, m1 - movd eax, m0 + movd eax, xm0 RET %endmacro INIT_XMM sse2 -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 -SAD_XMM 8, 4 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +SAD 8, 4 INIT_XMM sse2, aligned -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 INIT_XMM ssse3 -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 -SAD_XMM 8, 4 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +SAD 8, 4 INIT_XMM ssse3, aligned -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +INIT_YMM avx2 +SAD 16, 16 +SAD 16, 8 ;============================================================================= ; SAD x3/x4 @@ -237,14 +255,14 @@ HADDW m2, m5 %endif %if UNIX64 - movd [r5+0], m0 - movd [r5+4], m1 - movd [r5+8], m2 + movd [r5+0], xm0 + movd [r5+4], xm1 + movd [r5+8], xm2 %else mov r0, r5mp - movd [r0+0], m0 - movd [r0+4], m1 - movd [r0+8], m2 + movd [r0+0], xm0 + movd [r0+4], xm1 + movd [r0+8], xm2 %endif RET %endmacro @@ -333,10 +351,10 @@ HADDW m3, m7 %endif mov r0, r6mp - movd [r0+ 0], m0 - movd [r0+ 4], m1 - movd [r0+ 8], m2 - movd [r0+12], m3 + movd [r0+ 0], xm0 + movd [r0+ 4], xm1 + movd [r0+ 8], xm2 + movd [r0+12], xm3 RET %endmacro @@ -400,8 +418,39 @@ INIT_XMM xop PIXEL_VSAD +INIT_YMM avx2 +cglobal pixel_vsad, 3,3 + mova m0, [r0] + mova m1, [r0+2*r1] + lea r0, [r0+4*r1] + psubw m0, m1 + pabsw m0, m0 + sub r2d, 2 + je .end +.loop: + mova m2, [r0] + mova m3, [r0+2*r1] + lea r0, [r0+4*r1] + psubw m1, m2 + psubw m2, m3 + pabsw m1, m1 + pabsw m2, m2 + paddw m0, m1 + paddw m0, m2 + mova m1, m3 + sub r2d, 2 + jg .loop +.end: +%if BIT_DEPTH == 9 + HADDW m0, m1 +%else + HADDUW m0, m1 +%endif + movd eax, xm0 + RET + 
;----------------------------------------------------------------------------- -; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1, +; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1, ; uint16_t *pix2, intptr_t i_stride, int scores[3] ) ;----------------------------------------------------------------------------- %macro SAD_X 3 @@ -445,29 +494,38 @@ SAD_X 4, 4, 8 SAD_X 4, 4, 4 INIT_XMM ssse3 -%define XMM_REGS 9 +%define XMM_REGS 7 SAD_X 3, 16, 16 SAD_X 3, 16, 8 SAD_X 3, 8, 16 SAD_X 3, 8, 8 SAD_X 3, 8, 4 +%define XMM_REGS 9 SAD_X 4, 16, 16 SAD_X 4, 16, 8 SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 INIT_XMM sse2 -%define XMM_REGS 11 +%define XMM_REGS 8 SAD_X 3, 16, 16 SAD_X 3, 16, 8 SAD_X 3, 8, 16 SAD_X 3, 8, 8 SAD_X 3, 8, 4 +%define XMM_REGS 11 SAD_X 4, 16, 16 SAD_X 4, 16, 8 SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 +INIT_YMM avx2 +%define XMM_REGS 7 +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +%define XMM_REGS 9 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 ;----------------------------------------------------------------------------- ; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] ); @@ -475,52 +533,57 @@ %macro INTRA_SAD_X3_4x4 0 cglobal intra_sad_x3_4x4, 3,3,7 - movq m0, [r1-1*FDEC_STRIDEB] + movddup m0, [r1-1*FDEC_STRIDEB] movq m1, [r0+0*FENC_STRIDEB] movq m2, [r0+2*FENC_STRIDEB] pshuflw m6, m0, q1032 paddw m6, m0 pshuflw m5, m6, q2301 paddw m6, m5 - punpcklqdq m6, m6 ;A+B+C+D 8 times - punpcklqdq m0, m0 + punpcklqdq m6, m6 ; A+B+C+D 8 times movhps m1, [r0+1*FENC_STRIDEB] movhps m2, [r0+3*FENC_STRIDEB] psubw m3, m1, m0 psubw m0, m2 - ABSW m3, m3, m5 - ABSW m0, m0, m5 + ABSW2 m3, m0, m3, m0, m4, m5 paddw m0, m3 - HADDW m0, m5 - movd [r2], m0 ;V prediction cost movd m3, [r1+0*FDEC_STRIDEB-4] - movhps m3, [r1+1*FDEC_STRIDEB-8] movd m4, [r1+2*FDEC_STRIDEB-4] + movhps m3, [r1+1*FDEC_STRIDEB-8] movhps m4, [r1+3*FDEC_STRIDEB-8] pshufhw m3, m3, q3333 pshufhw m4, m4, q3333 pshuflw m3, m3, q1111 ; FF FF EE EE pshuflw m4, m4, q1111 ; HH HH GG GG paddw m5, m3, m4 - pshufd m0, m5, q1032 + paddw m6, [pw_4] + paddw m6, m5 + pshufd m5, m5, q1032 paddw m5, m6 - paddw m5, m0 - paddw m5, [pw_4] psrlw m5, 3 psubw m6, m5, m2 psubw m5, m1 psubw m1, m3 psubw m2, m4 - ABSW m5, m5, m0 - ABSW m6, m6, m0 - ABSW m1, m1, m0 - ABSW m2, m2, m0 + ABSW2 m5, m6, m5, m6, m3, m4 + ABSW2 m1, m2, m1, m2, m3, m4 paddw m5, m6 paddw m1, m2 - HADDW m5, m0 - HADDW m1, m2 - movd [r2+8], m5 ;DC prediction cost - movd [r2+4], m1 ;H prediction cost +%if cpuflag(ssse3) + phaddw m0, m1 + movhlps m3, m5 + paddw m5, m3 + phaddw m0, m5 + pmaddwd m0, [pw_1] + mova [r2], m0 +%else + HADDW m0, m3 + HADDW m1, m3 + HADDW m5, m3 + movd [r2], m0 ; V prediction cost + movd [r2+4], m1 ; H prediction cost + movd [r2+8], m5 ; DC prediction cost +%endif RET %endmacro @@ -581,12 +644,21 @@ INTRA_SAD_HVDC_ITER 5, q2222 INTRA_SAD_HVDC_ITER 6, q1111 INTRA_SAD_HVDC_ITER 7, q0000 +%if cpuflag(ssse3) + phaddw m2, m3 ; 2 2 2 2 3 3 3 3 + movhlps m3, m1 + paddw m1, m3 ; 1 1 1 1 _ _ _ _ + phaddw m2, m1 ; 2 2 3 3 1 1 _ _ + pmaddwd m2, [pw_1] ; 2 3 1 _ + mova [r2], m2 +%else HADDW m2, m4 HADDW m3, m4 HADDW m1, m4 movd [r2+0], m2 movd [r2+4], m3 movd [r2+8], m1 +%endif RET %endmacro @@ -594,3 +666,44 @@ INTRA_SAD_X3_8x8 INIT_XMM ssse3 INTRA_SAD_X3_8x8 + +%macro INTRA_SAD_HVDC_ITER_YMM 2 + mova xm4, [r0+(%1-4)*FENC_STRIDEB] + vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1 + pshufd m5, m7, %2 + psubw m5, m4 + pabsw m5, m5 + ACCUM paddw, 2, 5, %1 ; H + psubw m5, m4, m6 + psubw m4, m0 + pabsw m5, m5 + pabsw m4, m4 + ACCUM paddw, 
1, 5, %1 ; V + ACCUM paddw, 3, 4, %1 ; DC +%endmacro + +INIT_YMM avx2 +cglobal intra_sad_x3_8x8, 3,3,8 + add r0, 4*FENC_STRIDEB + movu xm0, [r1+7*SIZEOF_PIXEL] + vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction + vpermq m7, m0, q0011 + paddw xm0, xm6 + paddw xm0, [pw_1] ; equal to +8 after HADDW + HADDW xm0, xm4 + psrld xm0, 4 + vpbroadcastw m0, xm0 + punpcklwd m7, m7 + INTRA_SAD_HVDC_ITER_YMM 0, q3333 + INTRA_SAD_HVDC_ITER_YMM 1, q2222 + INTRA_SAD_HVDC_ITER_YMM 2, q1111 + INTRA_SAD_HVDC_ITER_YMM 3, q0000 + phaddw m1, m2 ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2 + punpckhqdq m2, m3, m3 + paddw m3, m2 ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _ + phaddw m1, m3 ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _ + vextracti128 xm2, m1, 1 + paddw xm1, xm2 ; 1 1 2 2 3 3 _ _ + pmaddwd xm1, [pw_1] ; 1 2 3 _ + mova [r2], xm1 + RET
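The AVX2 intra_sad_x3_8x8 above is fairly opaque, so here is a scalar sketch of what the x3 intra SAD functions compute in the high-bit-depth build: the cost of the vertical, horizontal and DC predictions against the encode block, written to res[] in that order. The separate top/left pointers and strides are a simplification for illustration; the real code reads both borders from the fdec plane.

#include <stdint.h>
#include <stdlib.h>

static void intra_sad_x3_8x8_ref( const uint16_t *fenc, int fenc_stride,
                                  const uint16_t *top, const uint16_t *left,
                                  int left_stride, int res[3] )
{
    int dc = 8;                                  /* rounding term */
    for( int i = 0; i < 8; i++ )
        dc += top[i] + left[i*left_stride];
    dc >>= 4;

    int sad_v = 0, sad_h = 0, sad_dc = 0;
    for( int y = 0; y < 8; y++ )
        for( int x = 0; x < 8; x++ )
        {
            int p = fenc[y*fenc_stride + x];
            sad_v  += abs( p - top[x] );                 /* vertical prediction   */
            sad_h  += abs( p - left[y*left_stride] );    /* horizontal prediction */
            sad_dc += abs( p - dc );                     /* DC prediction         */
        }
    res[0] = sad_v;
    res[1] = sad_h;
    res[2] = sad_dc;
}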
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/trellis-64.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/trellis-64.asm
Changed
@@ -96,6 +96,15 @@ %endif %endmacro +%macro LOAD_DUP 2 ; dst, src +%if cpuflag(ssse3) + movddup %1, %2 +%else + movd %1, %2 + punpcklqdq %1, %1 +%endif +%endmacro + ;----------------------------------------------------------------------------- ; int trellis_cabac_4x4_psy( ; const int *unquant_mf, const uint8_t *zigzag, int lambda2, @@ -186,12 +195,11 @@ mov dword levelgt1_ctxm, 9 %endif %if psy - movd m6, psy_trellism + LOAD_DUP m6, psy_trellism %define psy_trellis m6 %elif dc - movd m6, [unquant_mfq] + LOAD_DUP m6, [unquant_mfq] paddd m6, m6 - punpcklqdq m6, m6 %define unquant_mf m6 %endif %ifdef PIC @@ -333,13 +341,12 @@ movd m0, abs_leveld mov r6, orig_coefsm %if HIGH_BIT_DEPTH - movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF] + LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF] %else - movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] - psrad m1, 16 + LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] + psrad m1, 16 ; sign_coef %endif punpcklqdq m0, m0 ; quant_coef - punpcklqdq m1, m1 ; sign_coef %if cpuflag(ssse3) pabsd m0, m0 pabsd m2, m1 ; abs_coef @@ -403,11 +410,10 @@ %else %ifdef PIC mov r10, unquant_mfm - movd m3, [r10 + zigzagiq*4] + LOAD_DUP m3, [r10 + zigzagiq*4] %else - movd m3, [unquant_mfq + zigzagiq*4] + LOAD_DUP m3, [unquant_mfq + zigzagiq*4] %endif - punpcklqdq m3, m3 pmuludq m0, m3 %endif paddd m0, [pq_128] @@ -420,8 +426,7 @@ %if dc psllq m0, 8 %else - movd m5, [dct_weight2_tab + zigzagiq*4 GLOBAL] - punpcklqdq m5, m5 + LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL] pmuludq m0, m5 %endif @@ -434,12 +439,11 @@ ; ssd1[k] -= psy_weight * psy_value; mov r6, fenc_dctm %if HIGH_BIT_DEPTH - movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF] + LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF] %else - movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] + LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] psrad m3, 16 ; orig_coef %endif - punpcklqdq m3, m3 %if cpuflag(ssse3) psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef) %else @@ -453,9 +457,8 @@ ABSD m3, m4 SWAP 4, 3 %endif - movd m1, [dct_weight1_tab + zigzagiq*4 GLOBAL] + LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL] pmuludq m1, psy_trellis - punpcklqdq m1, m1 pmuludq m4, m1 psubq m0, m4 %if %1
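Small illustration of the LOAD_DUP helper introduced above (the expansion below is a sketch, not part of the patch): it folds the movd + punpcklqdq pattern into a single movddup on SSSE3-and-up targets.

; LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
; expands under INIT_XMM ssse3 (or newer) to:
;     movddup    m3, [unquant_mfq + zigzagiq*4]
; and under plain sse2 to:
;     movd       m3, [unquant_mfq + zigzagiq*4]
;     punpcklqdq m3, m3
; movddup copies a full qword, so the high dword of each lane can differ from
; the zero-extended movd path; the callers appear to consume only the low
; dword of each qword (pmuludq) or shift the wanted bits into place (psrad),
; which is presumably why the substitution is safe.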
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/util.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/util.h
Changed
@@ -121,42 +121,132 @@ return amvd; } +#define x264_predictor_clip x264_predictor_clip_mmx2 +static int ALWAYS_INLINE x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) +{ + static const uint32_t pd_32 = 0x20; + intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; + + asm( + "movq (%2), %%mm5 \n" + "movd %6, %%mm3 \n" + "psllw $2, %%mm5 \n" // Convert to subpel + "pshufw $0xEE, %%mm5, %%mm6 \n" + "dec %k3 \n" + "jz 2f \n" // if( i_mvc == 1 ) {do the last iteration} + "punpckldq %%mm3, %%mm3 \n" + "punpckldq %%mm5, %%mm5 \n" + "movd %7, %%mm4 \n" + "lea (%0,%3,4), %3 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "add $8, %0 \n" + "movq %%mm3, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm1 \n" // mv == pmv + "pcmpeqd %%mm0, %%mm2 \n" // mv == 0 + "por %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1 + "pmovmskb %%mm2, %k2 \n" // (mv == pmv || mv == 0) * 0xf + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "pand %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32 + "psrlq %%mm2, %%mm0 \n" // drop mv0 if it's skipped + "movq %%mm0, (%5,%4,4) \n" + "and $24, %k2 \n" + "add $2, %4 \n" + "add $8, %k2 \n" + "shr $4, %k2 \n" // (4-val)>>1 + "sub %2, %4 \n" // +1 for each valid motion vector + "cmp %3, %0 \n" + "jl 1b \n" + "jg 3f \n" // if( i == i_mvc - 1 ) {do the last iteration} + + /* Do the last iteration */ + "2: \n" + "movd (%0), %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm3 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm3, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "movd %%mm0, (%5,%4,4) \n" + "inc %4 \n" + "and $1, %k2 \n" + "sub %2, %4 \n" // output += !(mv == pmv || mv == 0) + "3: \n" + :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i) + :"r"(dst), "g"(pmv), "m"(pd_32), "m"(M64( mvc )) + ); + return i; +} + +/* Same as the above, except we do (mv + 2) >> 2 on the input. 
*/ #define x264_predictor_roundclip x264_predictor_roundclip_mmx2 -static void ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max ) +static int ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) { - uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min ); - uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max ); static const uint64_t pw_2 = 0x0002000200020002ULL; - intptr_t i = i_mvc; + static const uint32_t pd_32 = 0x20; + intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; + asm( - "movd %2, %%mm5 \n" - "movd %3, %%mm6 \n" - "movq %4, %%mm7 \n" - "punpckldq %%mm5, %%mm5 \n" - "punpckldq %%mm6, %%mm6 \n" - "test $1, %0 \n" - "jz 1f \n" - "movd -4(%6,%0,4), %%mm0 \n" - "paddw %%mm7, %%mm0 \n" - "psraw $2, %%mm0 \n" - "pmaxsw %%mm5, %%mm0 \n" - "pminsw %%mm6, %%mm0 \n" - "movd %%mm0, -4(%5,%0,4) \n" - "dec %0 \n" - "jz 2f \n" - "1: \n" - "movq -8(%6,%0,4), %%mm0 \n" - "paddw %%mm7, %%mm0 \n" - "psraw $2, %%mm0 \n" - "pmaxsw %%mm5, %%mm0 \n" - "pminsw %%mm6, %%mm0 \n" - "movq %%mm0, -8(%5,%0,4) \n" - "sub $2, %0 \n" - "jnz 1b \n" - "2: \n" - :"+r"(i), "=m"(M64( dst )) - :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(dst), "r"(mvc), "m"(M64( mvc )) + "movq (%2), %%mm5 \n" + "movq %6, %%mm7 \n" + "movd %7, %%mm3 \n" + "pshufw $0xEE, %%mm5, %%mm6 \n" + "dec %k3 \n" + "jz 2f \n" + "punpckldq %%mm3, %%mm3 \n" + "punpckldq %%mm5, %%mm5 \n" + "movd %8, %%mm4 \n" + "lea (%0,%3,4), %3 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "add $8, %0 \n" + "paddw %%mm7, %%mm0 \n" + "psraw $2, %%mm0 \n" + "movq %%mm3, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm1 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm1, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "pand %%mm4, %%mm2 \n" + "psrlq %%mm2, %%mm0 \n" + "movq %%mm0, (%5,%4,4) \n" + "and $24, %k2 \n" + "add $2, %4 \n" + "add $8, %k2 \n" + "shr $4, %k2 \n" + "sub %2, %4 \n" + "cmp %3, %0 \n" + "jl 1b \n" + "jg 3f \n" + + /* Do the last iteration */ + "2: \n" + "movd (%0), %%mm0 \n" + "paddw %%mm7, %%mm0 \n" + "psraw $2, %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm3 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm3, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "movd %%mm0, (%5,%4,4) \n" + "inc %4 \n" + "and $1, %k2 \n" + "sub %2, %4 \n" + "3: \n" + :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i) + :"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(M64( mvc )) ); + return i; } #endif
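The rewritten predictor_clip/roundclip above now fold the duplicate check (candidate equal to the predicted MV or to zero) into the clamping pass and return the number of surviving candidates. Below is a scalar sketch of the clip variant as read off the inline assembly; the fullpel-to-subpel shift on mv_limit mirrors the psllw $2, and the compaction into dst is a simplification of the psrlq-based skip trick.

#include <stdint.h>

static inline int clamp_mv( int v, int lo, int hi )
{
    return v < lo ? lo : v > hi ? hi : v;
}

/* Hypothetical scalar equivalent of the new x264_predictor_clip_mmx2. */
static int predictor_clip_ref( int16_t (*dst)[2], const int16_t (*mvc)[2],
                               int i_mvc, const int16_t mv_limit[2][2],
                               uint32_t pmv )
{
    int n = 0;
    for( int i = 0; i < i_mvc; i++ )
    {
        uint32_t mv = (uint16_t)mvc[i][0] | ((uint32_t)(uint16_t)mvc[i][1] << 16);
        if( mv == pmv || mv == 0 )       /* already covered by pmv or the zero MV */
            continue;
        dst[n][0] = clamp_mv( mvc[i][0], mv_limit[0][0] << 2, mv_limit[1][0] << 2 );
        dst[n][1] = clamp_mv( mvc[i][1], mv_limit[0][1] << 2, mv_limit[1][1] << 2 );
        n++;
    }
    return n;
}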
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/x86inc.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/x86inc.asm
Changed
@@ -6,7 +6,7 @@ ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Anton Mitrofanov <BugMaster@narod.ru> ;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above @@ -34,8 +34,12 @@ ; as this feature might be useful for others as well. Send patches or ideas ; to x264-devel@videolan.org . -%ifndef program_name - %define program_name x264 +%ifndef private_prefix + %define private_prefix x264 +%endif + +%ifndef public_prefix + %define public_prefix private_prefix %endif %define WIN64 0 @@ -56,29 +60,12 @@ %define mangle(x) x %endif -; Name of the .rodata section. -; Kludge: Something on OS X fails to align .rodata even given an align attribute, -; so use a different read-only section. %macro SECTION_RODATA 0-1 16 - %ifidn __OUTPUT_FORMAT__,macho64 - SECTION .text align=%1 - %elifidn __OUTPUT_FORMAT__,macho - SECTION .text align=%1 - fakegot: - %elifidn __OUTPUT_FORMAT__,aout - section .text - %else - SECTION .rodata align=%1 - %endif + SECTION .rodata align=%1 %endmacro -; aout does not support align= %macro SECTION_TEXT 0-1 16 - %ifidn __OUTPUT_FORMAT__,aout - SECTION .text - %else - SECTION .text align=%1 - %endif + SECTION .text align=%1 %endmacro %if WIN64 @@ -323,14 +310,18 @@ %if stack_size < 0 %assign stack_size -stack_size %endif - %if mmsize != 8 - %assign xmm_regs_used %2 + %assign stack_size_padded stack_size + %if WIN64 + %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 + %endif + %endif %endif %if mmsize <= 16 && HAVE_ALIGNED_STACK - %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) - %if xmm_regs_used > 6 - %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16 - %endif + %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) SUB rsp, stack_size_padded %else %assign %%reg_num (regs_used - 1) @@ -340,14 +331,6 @@ ; stack in a single instruction (i.e. mov rsp, rstk or mov ; rsp, [rsp+stack_size_padded]) mov rstk, rsp - %assign stack_size_padded stack_size - %if xmm_regs_used > 6 - %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16 - %if mmsize == 32 && xmm_regs_used & 1 - ; re-align to 32 bytes - %assign stack_size_padded (stack_size_padded + 16) - %endif - %endif %if %1 < 0 ; need to store rsp on stack sub rsp, gprsize+stack_size_padded and rsp, ~(%%stack_alignment-1) @@ -359,9 +342,7 @@ %xdefine rstkm rstk %endif %endif - %if xmm_regs_used > 6 - WIN64_PUSH_XMM - %endif + WIN64_PUSH_XMM %endif %endif %endmacro @@ -422,40 +403,55 @@ %endmacro %macro WIN64_PUSH_XMM 0 - %assign %%i xmm_regs_used - %rep (xmm_regs_used-6) - %assign %%i %%i-1 - movdqa [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i - %endrep + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
+ %if xmm_regs_used > 6 + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + movaps [rstk + stack_offset + 24], xmm7 + %endif + %if xmm_regs_used > 8 + %assign %%i 8 + %rep xmm_regs_used-8 + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif %endmacro %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 6 - SUB rsp, (xmm_regs_used-6)*16+16 - WIN64_PUSH_XMM + %if xmm_regs_used > 8 + %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 + SUB rsp, stack_size_padded %endif + WIN64_PUSH_XMM %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 1 - %if xmm_regs_used > 6 + %assign %%pad_size 0 + %if xmm_regs_used > 8 %assign %%i xmm_regs_used - %rep (xmm_regs_used-6) + %rep xmm_regs_used-8 %assign %%i %%i-1 - movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)] + movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] %endrep - %if stack_size_padded == 0 - add %1, (xmm_regs_used-6)*16+16 - %endif %endif %if stack_size_padded > 0 %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) mov rsp, rstkm %else add %1, stack_size_padded + %assign %%pad_size stack_size_padded %endif %endif + %if xmm_regs_used > 7 + movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %endif %endmacro %macro WIN64_RESTORE_XMM 1 @@ -643,38 +639,48 @@ ; Applies any symbol mangling needed for C linkage, and sets up a define such that ; subsequent uses of the function name automatically refer to the mangled version. ; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
%macro cglobal 1-2+ "" ; name, [PROLOGUE args] - ; the "" is a workaround for nasm, which fails if SUFFIX is empty - ; and we call cglobal_internal with just %1 %+ SUFFIX (without %2) - cglobal_internal %1 %+ SUFFIX, %2 + cglobal_internal 1, %1 %+ SUFFIX, %2 %endmacro -%macro cglobal_internal 1-2+ - %ifndef cglobaled_%1 - %xdefine %1 mangle(program_name %+ _ %+ %1) - %xdefine %1.skip_prologue %1 %+ .skip_prologue - CAT_XDEFINE cglobaled_, %1, 1 +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix + %xdefine %%VISIBILITY hidden + %else + %xdefine %%FUNCTION_PREFIX public_prefix + %xdefine %%VISIBILITY + %endif + %ifndef cglobaled_%2 + %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 %endif - %xdefine current_function %1 + %xdefine current_function %2 %ifidn __OUTPUT_FORMAT__,elf - global %1:function hidden + global %2:function %%VISIBILITY %else - global %1 + global %2 %endif align function_align - %1: - RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer - %xdefine rstk rsp - %assign stack_offset 0 - %assign stack_size 0 - %assign stack_size_padded 0 - %assign xmm_regs_used 0 - %ifnidn %2, "" - PROLOGUE %2 + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %ifnidn %3, "" + PROLOGUE %3 %endif %endmacro %macro cextern 1 - %xdefine %1 mangle(program_name %+ _ %+ %1) + %xdefine %1 mangle(private_prefix %+ _ %+ %1) CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro @@ -686,9 +692,13 @@ extern %1 %endmacro -%macro const 2+ - %xdefine %1 mangle(program_name %+ _ %+ %1) - global %1 +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %ifidn __OUTPUT_FORMAT__,elf + global %1:data hidden + %else + global %1 + %endif %1: %2 %endmacro @@ -724,9 +734,8 @@ %assign cpuflags_misalign (1<<20) %assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant %assign cpuflags_atom (1<<22) -%assign cpuflags_bmi1 (1<<23) +%assign cpuflags_bmi1 (1<<23)|cpuflags_lzcnt %assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1 -%assign cpuflags_tbm (1<<25)|cpuflags_bmi1 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) @@ -735,6 +744,7 @@ ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
%macro INIT_CPUFLAGS 0-2 + CPU amdnop %if %0 >= 1 %xdefine cpuname %1 %assign cpuflags cpuflags_%1 @@ -756,6 +766,9 @@ %elifidn %1, sse3 %define movu lddqu %endif + %if ARCH_X86_64 == 0 && notcpuflag(sse2) + CPU basicnop + %endif %else %xdefine SUFFIX %undef cpuname @@ -763,7 +776,11 @@ %endif %endmacro -; merge mmx and sse* +; Merge mmx and sse* +; m# is a simd regsiter of the currently selected size +; xm# is the corresponding xmmreg (if selcted xmm or ymm size), or mmreg (if selected mmx) +; ym# is the corresponding ymmreg (if selcted xmm or ymm size), or mmreg (if selected mmx) +; (All 3 remain in sync through SWAP.) %macro CAT_XDEFINE 3 %xdefine %1%2 %3 @@ -840,6 +857,26 @@ INIT_XMM +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 ymm%1 + %define ymmymm%1 ymm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 +%endmacro + +%assign i 0 +%rep 16 + DECLARE_MMCAST i +%assign i i+1 +%endrep + ; I often want to use macros that permute their arguments. e.g. there's no ; efficient way to implement butterfly or transpose or dct without swapping some ; arguments. @@ -856,42 +893,42 @@ %macro PERMUTE 2-* ; takes a list of pairs to swap %rep %0/2 - %xdefine tmp%2 m%2 - %xdefine ntmp%2 nm%2 + %xdefine %%tmp%2 m%2 %rotate 2 %endrep %rep %0/2 - %xdefine m%1 tmp%2 - %xdefine nm%1 ntmp%2 - %undef tmp%2 - %undef ntmp%2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE n, m%1, %1 %rotate 2 %endrep %endmacro -%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) -%rep %0-1 -%ifdef m%1 - %xdefine tmp m%1 - %xdefine m%1 m%2 - %xdefine m%2 tmp - CAT_XDEFINE n, m%1, %1 - CAT_XDEFINE n, m%2, %2 -%else - ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. - ; Be careful using this mode in nested macros though, as in some cases there may be - ; other copies of m# that have already been dereferenced and don't get updated correctly. - %xdefine %%n1 n %+ %1 - %xdefine %%n2 n %+ %2 - %xdefine tmp m %+ %%n1 - CAT_XDEFINE m, %%n1, m %+ %%n2 - CAT_XDEFINE m, %%n2, tmp - CAT_XDEFINE n, m %+ %%n1, %%n1 - CAT_XDEFINE n, m %+ %%n2, %%n2 +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) +%ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 +%else ; SWAP m0, m1, ... 
+ SWAP_INTERNAL_NAME %1, %2 %endif - %undef tmp +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE n, m%1, %1 + CAT_XDEFINE n, m%2, %2 %rotate 1 -%endrep + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args n %+ %1 + %rep %0-1 + %xdefine %%args %%args, n %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args %endmacro ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later @@ -1094,10 +1131,10 @@ AVX_INSTR blendps, 1, 0, 0 AVX_INSTR blendvpd, 1, 0, 0 AVX_INSTR blendvps, 1, 0, 0 -AVX_INSTR cmppd, 1, 0, 0 -AVX_INSTR cmpps, 1, 0, 0 -AVX_INSTR cmpsd, 1, 0, 0 -AVX_INSTR cmpss, 1, 0, 0 +AVX_INSTR cmppd, 1, 1, 0 +AVX_INSTR cmpps, 1, 1, 0 +AVX_INSTR cmpsd, 1, 1, 0 +AVX_INSTR cmpss, 1, 1, 0 AVX_INSTR comisd AVX_INSTR comiss AVX_INSTR cvtdq2pd @@ -1399,3 +1436,14 @@ FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug +%if ARCH_X86_64 == 0 +%macro vpbroadcastq 2 +%if sizeof%1 == 16 + movddup %1, %2 +%else + vbroadcastsd %1, %2 +%endif +%endmacro +%endif
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/x86util.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/x86util.asm
Changed
@@ -30,10 +30,14 @@ %assign SIZEOF_PIXEL 1 %assign SIZEOF_DCTCOEF 2 %define pixel byte +%define vpbroadcastdct vpbroadcastw +%define vpbroadcastpix vpbroadcastb %if HIGH_BIT_DEPTH %assign SIZEOF_PIXEL 2 %assign SIZEOF_DCTCOEF 4 %define pixel word + %define vpbroadcastdct vpbroadcastd + %define vpbroadcastpix vpbroadcastw %endif %assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE @@ -52,7 +56,10 @@ %macro SBUTTERFLY 4 -%if avx_enabled && mmsize == 16 +%ifidn %1, dqqq + vperm2i128 m%4, m%2, m%3, q0301 ; punpckh + vinserti128 m%2, m%2, xm%3, 1 ; punpckl +%elif avx_enabled && mmsize >= 16 punpckh%1 m%4, m%2, m%3 punpckl%1 m%2, m%3 %else @@ -214,15 +221,20 @@ %endif %endmacro -%macro ABSD 2 +%macro ABSD 2-3 %if cpuflag(ssse3) pabsd %1, %2 %else - pxor %1, %1 - pcmpgtd %1, %2 - pxor %2, %1 - psubd %2, %1 - SWAP %1, %2 + %define %%s %2 +%if %0 == 3 + mova %3, %2 + %define %%s %3 +%endif + pxor %1, %1 + pcmpgtd %1, %%s + pxor %%s, %1 + psubd %%s, %1 + SWAP %1, %%s %endif %endmacro @@ -255,9 +267,13 @@ %endmacro %imacro SPLATW 2-3 0 - PSHUFLW %1, %2, (%3)*q1111 +%if cpuflag(avx2) && %3 == 0 + vpbroadcastw %1, %2 +%else + PSHUFLW %1, %2, (%3)*q1111 %if mmsize == 16 - punpcklqdq %1, %1 + punpcklqdq %1, %1 +%endif %endif %endmacro @@ -275,16 +291,24 @@ %endmacro %macro HADDD 2 ; sum junk -%if mmsize == 16 +%if sizeof%1 == 32 +%define %2 xmm%2 + vextracti128 %2, %1, 1 +%define %1 xmm%1 + paddd %1, %2 +%endif +%if mmsize >= 16 movhlps %2, %1 paddd %1, %2 %endif PSHUFLW %2, %1, q0032 paddd %1, %2 +%undef %1 +%undef %2 %endmacro %macro HADDW 2 ; reg, tmp -%if cpuflag(xop) && mmsize == 16 +%if cpuflag(xop) && sizeof%1 == 16 vphaddwq %1, %1 movhlps %2, %1 paddd %1, %2 @@ -294,22 +318,41 @@ %endif %endmacro -%macro HADDUW 2 -%if cpuflag(xop) && mmsize == 16 - vphadduwq %1, %1 - movhlps %2, %1 - paddd %1, %2 +%macro HADDUWD 2 +%if cpuflag(xop) && sizeof%1 == 16 + vphadduwd %1, %1 %else psrld %2, %1, 16 pslld %1, 16 psrld %1, 16 paddd %1, %2 - HADDD %1, %2 +%endif +%endmacro + +%macro HADDUW 2 +%if cpuflag(xop) && sizeof%1 == 16 + vphadduwq %1, %1 + movhlps %2, %1 + paddd %1, %2 +%else + HADDUWD %1, %2 + HADDD %1, %2 %endif %endmacro %macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp -%if cpuflag(ssse3) +; AVX2 version uses a precalculated extra input that +; can be re-used across calls +%if sizeof%1==32 + ; %3 = abcdefgh ijklmnop (lower address) + ; %2 = ABCDEFGH IJKLMNOP (higher address) +; vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH +%if %4 < 16 + palignr %1, %5, %3, %4 ; %1 = bcdefghi jklmnopA +%else + palignr %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO +%endif +%elif cpuflag(ssse3) %if %0==5 palignr %1, %2, %3, %4 %else @@ -475,7 +518,7 @@ %endif %elifidn %1, q shufps m%5, m%3, m%4, q3131 - shufps m%3, m%4, q2020 + shufps m%3, m%3, m%4, q2020 SWAP %4, %5 %endif %endmacro @@ -498,22 +541,24 @@ ; %5(%6): tmpregs %if %1!=0 ; have to reorder stuff for horizontal op %ifidn %2, sumsub - %define ORDER ord - ; sumsub needs order because a-b != b-a unless a=b + %define ORDER ord + ; sumsub needs order because a-b != b-a unless a=b %else - %define ORDER unord - ; if we just max, order doesn't matter (allows pblendw+or in sse4) + %define ORDER unord + ; if we just max, order doesn't matter (allows pblendw+or in sse4) %endif %if %1==1 - TRANS d, ORDER, %3, %4, %5, %6 + TRANS d, ORDER, %3, %4, %5, %6 %elif %1==2 - %if mmsize==8 - SBUTTERFLY dq, %3, %4, %5 - %else - TRANS q, ORDER, %3, %4, %5, %6 - %endif + %if mmsize==8 + SBUTTERFLY dq, %3, %4, %5 + %else + TRANS q, ORDER, %3, %4, %5, %6 + %endif %elif %1==4 - 
SBUTTERFLY qdq, %3, %4, %5 + SBUTTERFLY qdq, %3, %4, %5 + %elif %1==8 + SBUTTERFLY dqqq, %3, %4, %5 %endif %endif %ifidn %2, sumsub @@ -675,11 +720,18 @@ %endmacro -%macro LOAD_DIFF 5 +%macro LOAD_DIFF 5-6 1 %if HIGH_BIT_DEPTH +%if %6 ; %5 aligned? mova %1, %4 psubw %1, %5 -%elifidn %3, none +%else + movu %1, %4 + movu %2, %5 + psubw %1, %2 +%endif +%else ; !HIGH_BIT_DEPTH +%ifidn %3, none movh %1, %4 movh %2, %5 punpcklbw %1, %2 @@ -692,6 +744,7 @@ punpcklbw %2, %3 psubw %1, %2 %endif +%endif ; HIGH_BIT_DEPTH %endmacro %macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr @@ -742,17 +795,27 @@ movh [r0+3*FDEC_STRIDE], %4 %endmacro -%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment? - LOAD_DIFF m%1, m%5, m%7, [%8], [%9] - LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3] - LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3] - LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5] +%macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned? + LOAD_DIFF m%1, m%5, m%7, [%8], [%9], %11 + LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3], %11 + LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11 + LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5], %11 %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro +; 2xdst, 2xtmp, 2xsrcrow +%macro LOAD_DIFF16x2_AVX2 6 + pmovzxbw m%1, [r1+%5*FENC_STRIDE] + pmovzxbw m%2, [r1+%6*FENC_STRIDE] + pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE] + pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE] + psubw m%1, m%3 + psubw m%2, m%4 +%endmacro + %macro DIFFx2 6-7 movh %3, %5 punpcklbw %3, %4
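For readers following the x86util changes: HADDD (including the new AVX2 path that first folds the upper 128-bit lane down) is simply a horizontal reduction of packed 32-bit lanes into the low element. A trivial scalar model, assuming the lane values are already in an array:

#include <stdint.h>

/* Sum n packed 32-bit lanes (2 for MMX, 4 for XMM, 8 for YMM); the macro
 * leaves this sum in the low dword of the destination register. */
static int32_t haddd_model( const int32_t *lane, int n )
{
    uint32_t sum = 0;                 /* paddd wraps modulo 2^32, so accumulate unsigned */
    for( int i = 0; i < n; i++ )
        sum += (uint32_t)lane[i];
    return (int32_t)sum;
}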
View file
x264-snapshot-20130224-2245.tar.bz2/configure -> x264-snapshot-20130723-2245.tar.bz2/configure
Changed
@@ -25,6 +25,7 @@ --system-libx264 use system libx264 instead of internal --enable-shared build shared library --enable-static build static library + --disable-opencl disable OpenCL features --disable-gpl disable GPL-only features --disable-thread disable multithreaded encoding --enable-win32thread use win32threads (windows only) @@ -46,7 +47,7 @@ --sysroot=SYSROOT root of cross-build tree External library support: - --disable-avs disable avisynth support (windows only) + --disable-avs disable avisynth support --disable-swscale disable swscale support --disable-lavf disable libavformat support --disable-ffms disable ffmpegsource support @@ -80,6 +81,9 @@ [[ "$arg" = -falign-loops* ]] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Wshadow ] && arg= + [[ "$arg" = -mpreferred-stack-boundary* ]] && arg= + [[ "$arg" = -l* ]] && arg= + [[ "$arg" = -L* ]] && arg= if [ $compiler = ICL ]; then [ "$arg" = -Wall ] && arg=-W0 [ "$arg" = -g ] && arg=-Z7 @@ -133,7 +137,7 @@ [ -n "$1" ] && echo "#include <$1>" > conftest.c echo "int main () { $3 return 0; }" >> conftest.c if [ $compiler = ICL ]; then - cc_cmd="$CC conftest.c $CFLAGS $2 -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" + cc_cmd="$CC conftest.c $(intel_cflags $CFLAGS $2) -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" else cc_cmd="$CC conftest.c $CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest" fi @@ -273,6 +277,7 @@ bit_depth="8" chroma_format="all" compiler="GNU" +opencl="yes" CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)" LDFLAGS="$LDFLAGS" @@ -285,7 +290,7 @@ EXE="" # list of all preprocessor HAVE values we can define -CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT" +CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL" # parse options @@ -381,6 +386,9 @@ --host=*) host="$optarg" ;; + --disable-opencl) + opencl="no" + ;; --cross-prefix=*) cross_prefix="$optarg" ;; @@ -521,6 +529,13 @@ fi HAVE_GETOPT_LONG=0 ;; + *qnx*) + SYS="QNX" + define HAVE_MALLOC_H + libm="-lm" + HAVE_GETOPT_LONG=0 + CFLAGS="$CFLAGS -I\$(SRCPATH)/extras" + ;; *) die "Unknown system $host, edit the configure" ;; @@ -564,6 +579,7 @@ elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win32 -DPREFIX" LDFLAGS="$LDFLAGS -Wl,--large-address-aware" + [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-i386 $RCFLAGS" else ASFLAGS="$ASFLAGS -f elf" @@ -583,6 +599,7 @@ ASFLAGS="$ASFLAGS -f win32 -m amd64" # only the GNU toolchain is inconsistent in prefixing function names with _ [ $compiler = GNU ] && cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX" + [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-x86-64 $RCFLAGS" else ASFLAGS="$ASFLAGS -f elf -m amd64" @@ -703,6 +720,10 @@ exit 1 fi define HAVE_MMX + if cc_check '' -mpreferred-stack-boundary=5 ; then + CFLAGS="$CFLAGS -mpreferred-stack-boundary=5" + define HAVE_32B_STACK_ALIGNMENT + fi fi if [ $asm = auto -a $ARCH = ARM ] ; then @@ -770,6 +791,9 @@ thread="win32" fi ;; + QNX) + cc_check pthread.h -lc && thread="posix" && libpthread="-lc" + ;; *) cc_check pthread.h -lpthread && thread="posix" && libpthread="-lpthread" ;; @@ -917,8 +941,16 @@ avs="no" # 
cygwin can use avisynth if it can use LoadLibrary if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibrary(0);") ; then - avs="yes" + avs="avisynth" + define HAVE_AVS + define USE_AVXSYNTH 0 + elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then + # AvxSynth currently only supports Linux and OSX + avs="avxsynth" define HAVE_AVS + define USE_AVXSYNTH 1 + AVS_LIBS="-ldl" + LDFLAGSCLI="$AVS_LIBS $LDFLAGSCLI" fi fi @@ -978,6 +1010,7 @@ if [ "$bit_depth" -gt "8" ]; then define HIGH_BIT_DEPTH ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1" + opencl="no" else ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=0" fi @@ -992,6 +1025,30 @@ [ $interlaced = yes ] && define HAVE_INTERLACED && x264_interlaced=1 || x264_interlaced=0 +libdl="" +if [ "$opencl" = "yes" ]; then + opencl="no" + log_check "for perl" + output=$(perl -v) + if [ "$output" = "" ]; then + log_fail + echo 'OpenCL support requires perl to compile.' + echo 'use --disable-opencl to compile without OpenCL.' + exit 1 + fi + log_ok + # cygwin can use opencl if it can use LoadLibrary + if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibrary(0);") ; then + opencl="yes" + define HAVE_OPENCL + elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then + opencl="yes" + define HAVE_OPENCL + libdl="-ldl" + fi + LDFLAGS="$LDFLAGS $libdl" +fi + #define undefined vars as 0 for var in $CONFIG_HAVE; do grep -q "HAVE_$var 1" config.h || define HAVE_$var 0 @@ -1083,6 +1140,7 @@ PROF_GEN_LD=$PROF_GEN_LD PROF_USE_CC=$PROF_USE_CC PROF_USE_LD=$PROF_USE_LD +HAVE_OPENCL=$opencl EOF if [ $compiler = ICL ]; then @@ -1162,7 +1220,7 @@ Description: H.264 (MPEG4 AVC) encoder library Version: $(grep POINTVER < x264_config.h | sed -e 's/.* "//; s/".*//') Libs: -L$libdir -lx264 -Libs.private: $libpthread $libm +Libs.private: $libpthread $libm $libdl Cflags: -I$includedir EOF @@ -1186,6 +1244,7 @@ gpac: $gpac gpl: $gpl thread: $thread +opencl: $opencl filters: $filters debug: $debug gprof: $gprof
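The configure additions link libdl on Linux/OS X precisely so OpenCL can be opened at run time and the encoder can degrade gracefully when it is absent (see the x264_opencl_load_library failure path in the encoder.c hunk further down). A minimal sketch of that pattern, with an assumed library name and one probed symbol; this is not x264's actual loader:

#include <dlfcn.h>
#include <stddef.h>

typedef int (*clGetPlatformIDs_f)( unsigned num_entries, void *platforms, unsigned *num_platforms );

/* Returns a library handle on success, NULL otherwise; on failure the caller
 * just logs a warning and clears b_opencl. Build with -ldl. */
static void *try_load_opencl( void )
{
    void *lib = dlopen( "libOpenCL.so.1", RTLD_NOW ); /* assumed SONAME */
    if( !lib )
        return NULL;
    if( !(clGetPlatformIDs_f)dlsym( lib, "clGetPlatformIDs" ) )
    {
        dlclose( lib );
        return NULL;
    }
    return lib;
}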
View file
x264-snapshot-20130224-2245.tar.bz2/doc/regression_test.txt -> x264-snapshot-20130723-2245.tar.bz2/doc/regression_test.txt
Changed
@@ -4,7 +4,7 @@ inherently caused by compression. # Install and compile x264 : -svn co svn://svn.videolan.org/x264/trunk x264 +git clone git://git.videolan.org/x264.git x264 cd x264 ./configure make
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/analyse.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/analyse.c
Changed
@@ -467,8 +467,8 @@ if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col ) h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv ); } - h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; - h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; + h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; + h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) ) { int mb_y = h->mb.i_mb_y >> SLICE_MBAFF; @@ -516,8 +516,8 @@ h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range ); h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] ); h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 ); - h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; - h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; + h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; + h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; } } if( PARAM_INTERLACED ) @@ -527,8 +527,8 @@ h->mb.mv_max[1] = h->mb.mv_maxy_row[i]; h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i]; h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i]; - h->mb.mv_min_fpel[1] = h->mb.mv_miny_fpel_row[i]; - h->mb.mv_max_fpel[1] = h->mb.mv_maxy_fpel_row[i]; + h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i]; + h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i]; } #undef CLIP_FMV @@ -888,7 +888,7 @@ { if( !h->mb.b_lossless && predict_mode[5] >= 0 ) { - int satd[9]; + ALIGNED_ARRAY_16( int32_t, satd,[9] ); h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd ); int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; satd[i_pred_mode] -= 3 * lambda; @@ -1006,7 +1006,7 @@ { if( !h->mb.b_lossless && predict_mode[5] >= 0 ) { - int satd[9]; + ALIGNED_ARRAY_16( int32_t, satd,[9] ); h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd ); int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; satd[i_pred_mode] -= 3 * lambda; @@ -1706,7 +1706,7 @@ static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size, int chroma ) { - ALIGNED_ARRAY_16( pixel, pix1,[16*16] ); + ALIGNED_ARRAY_N( pixel, pix1,[16*16] ); pixel *pix2 = pix1+8; int i_stride = h->mb.pic.i_stride[1]; int chroma_h_shift = chroma <= CHROMA_422; @@ -1890,8 +1890,8 @@ static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel ) { - ALIGNED_ARRAY_16( pixel, pix, [4],[16*16] ); - ALIGNED_ARRAY_16( pixel, bi, [2],[16*16] ); + ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] ); + ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] ); int i_chroma_cost = 0; int chromapix = h->luma2chroma_pixel[i_pixel]; @@ -1984,8 +1984,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) { - ALIGNED_ARRAY_16( pixel, pix0,[16*16] ); - ALIGNED_ARRAY_16( pixel, pix1,[16*16] ); + ALIGNED_ARRAY_N( pixel, pix0,[16*16] ); + ALIGNED_ARRAY_N( pixel, pix1,[16*16] ); pixel *src0, *src1; intptr_t stride0 = 16, stride1 = 16; int i_ref, i_mvc; @@ -2454,7 +2454,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd ) { - ALIGNED_ARRAY_16( pixel, pix,[2],[16*8] ); + ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] ); ALIGNED_4( int16_t mvc[3][2] ); h->mb.i_partition = D_16x8; @@ -2836,12 +2836,28 @@ int plane_count = CHROMA444 && h->mb.b_chroma_me ? 
3 : 1; int i_cost8 = 0, i_cost4 = 0; - for( int p = 0; p < plane_count; p++ ) + /* Not all platforms have a merged SATD function */ + if( h->pixf.sa8d_satd[PIXEL_16x16] ) { - i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, - h->mb.pic.p_fdec[p], FDEC_STRIDE ); - i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, - h->mb.pic.p_fdec[p], FDEC_STRIDE ); + uint64_t cost = 0; + for( int p = 0; p < plane_count; p++ ) + { + cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, + h->mb.pic.p_fdec[p], FDEC_STRIDE ); + + } + i_cost8 = (uint32_t)cost; + i_cost4 = (uint32_t)(cost >> 32); + } + else + { + for( int p = 0; p < plane_count; p++ ) + { + i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, + h->mb.pic.p_fdec[p], FDEC_STRIDE ); + i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, + h->mb.pic.p_fdec[p], FDEC_STRIDE ); + } } h->mb.b_transform_8x8 = i_cost8 < i_cost4; @@ -3002,8 +3018,8 @@ h->mb.i_qp = x264_ratecontrol_mb_qp( h ); /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB, * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */ - if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ) - h->mb.i_qp = h->mb.i_last_qp; + if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 ) + h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp; if( h->param.analyse.b_mb_info ) h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
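The analyse.c change above relies on the merged sa8d_satd primitive returning both metrics in one 64-bit value: sa8d in the low 32 bits, satd in the high 32 bits, which is exactly how the caller unpacks it. A hedged sketch of that packing convention (the packing helper is illustrative, not the SIMD kernel):

#include <stdint.h>

static uint64_t pack_sa8d_satd( uint32_t sa8d, uint32_t satd )
{
    return (uint64_t)satd << 32 | sa8d;
}

/* Caller side, mirroring the hunk above:
 *   uint64_t cost  = pack_sa8d_satd( sa8d, satd );
 *   uint32_t cost8 = (uint32_t)cost;         // sa8d sum
 *   uint32_t cost4 = (uint32_t)(cost >> 32); // satd sum
 */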
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/analyse.h -> x264-snapshot-20130723-2245.tar.bz2/encoder/analyse.h
Changed
@@ -34,7 +34,7 @@ void x264_macroblock_analyse( x264_t *h ); void x264_slicetype_decide( x264_t *h ); -void x264_slicetype_analyse( x264_t *h, int keyframe ); +void x264_slicetype_analyse( x264_t *h, int intra_minigop ); int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/cabac.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/cabac.c
Changed
@@ -152,8 +152,10 @@ int i_dqp = h->mb.i_qp - h->mb.i_last_qp; int ctx; - /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */ - if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] ) + /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely + * flat background area. Don't do this if it would raise the quantizer, since that could + * cause unexpected deblocking artifacts. */ + if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] && h->mb.i_qp > h->mb.i_last_qp ) { #if !RDO_SKIP_BS h->mb.i_qp = h->mb.i_last_qp; @@ -161,9 +163,7 @@ i_dqp = 0; } - /* Since, per the above, empty-CBP I16x16 blocks never have delta quants, - * we don't have to check for them. */ - ctx = h->mb.i_last_dqp && h->mb.cbp[h->mb.i_mb_prev_xy]; + ctx = h->mb.i_last_dqp && (h->mb.type[h->mb.i_mb_prev_xy] == I_16x16 || (h->mb.cbp[h->mb.i_mb_prev_xy]&0x3f)); if( i_dqp != 0 ) { @@ -644,26 +644,17 @@ } } -static const uint16_t significant_coeff_flag_offset[2][14] = -{ - { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 }, - { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 } -}; -static const uint16_t last_coeff_flag_offset[2][14] = -{ - { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748 }, - { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757 } -}; -static const uint16_t coeff_abs_level_m1_offset[14] = -{ - 227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766 -}; -#if RDO_SKIP_BS -extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][63]; +#if !RDO_SKIP_BS +extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][64]; extern const uint8_t x264_last_coeff_flag_offset_8x8[63]; extern const uint8_t x264_coeff_flag_offset_chroma_422_dc[7]; +extern const uint16_t x264_significant_coeff_flag_offset[2][16]; +extern const uint16_t x264_last_coeff_flag_offset[2][16]; +extern const uint16_t x264_coeff_abs_level_m1_offset[16]; +extern const uint8_t x264_count_cat_m1[14]; #else -const uint8_t x264_significant_coeff_flag_offset_8x8[2][63] = +/* Padded to [64] for easier addressing */ +const uint8_t x264_significant_coeff_flag_offset_8x8[2][64] = {{ 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, @@ -683,6 +674,21 @@ 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 }; const uint8_t x264_coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */ +const uint16_t x264_significant_coeff_flag_offset[2][16] = +{ + { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718, 0, 0 }, + { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733, 0, 0 } +}; +const uint16_t x264_last_coeff_flag_offset[2][16] = +{ + { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748, 0, 0 }, + { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757, 0, 0 } +}; +const uint16_t x264_coeff_abs_level_m1_offset[16] = +{ + 227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766 +}; +const uint8_t x264_count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63}; #endif // node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). 
@@ -694,20 +700,20 @@ /* 4:2:2 chroma dc uses a slightly different state machine for some reason, also note that * 4:2:0 chroma dc doesn't use the last state so it has identical output with both arrays. */ static const uint8_t coeff_abs_levelgt1_ctx_chroma_dc[8] = { 5, 5, 5, 5, 6, 7, 8, 8 }; + static const uint8_t coeff_abs_level_transition[2][8] = { /* update node ctx after coding a level=1 */ { 1, 2, 3, 3, 4, 5, 6, 7 }, /* update node ctx after coding a level>1 */ { 4, 4, 4, 4, 5, 6, 7, 7 } }; -static const uint8_t count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63}; #if !RDO_SKIP_BS static ALWAYS_INLINE void x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int chroma422dc ) { - int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat]; + int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat]; int coeff_idx = -1, node_ctx = 0; int last = h->quantf.coeff_last[ctx_block_cat]( l ); const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; @@ -747,7 +753,7 @@ } else { - int count_m1 = count_cat_m1[ctx_block_cat]; + int count_m1 = x264_count_cat_m1[ctx_block_cat]; if( count_m1 == 63 ) { const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; @@ -787,10 +793,20 @@ x264_cabac_encode_bypass( cb, coeff_sign ); } while( --coeff_idx >= 0 ); } -static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) + +void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 ); } + +static void ALWAYS_INLINE x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_c( h, cb, ctx_block_cat, l ); +#endif +} static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { /* Template a version specifically for chroma 4:2:2 DC in order to avoid @@ -806,16 +822,16 @@ static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc ) { const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; - int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat]; + int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat]; int last = h->quantf.coeff_last[ctx_block_cat]( l ); int coeff_abs = abs(l[last]); int ctx = coeff_abs_level1_ctx[0] + ctx_level; int node_ctx; const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; - if( last != (b_8x8 ? 63 : chroma422dc ? 7 : count_cat_m1[ctx_block_cat]) ) + if( last != (b_8x8 ? 63 : chroma422dc ? 
7 : x264_count_cat_m1[ctx_block_cat]) ) { x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] : chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 ); @@ -888,17 +904,35 @@ } } -static void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 1, 0 ); } -static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 ); + x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0, 0 ); } -static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) + +static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0, 0 ); +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l ); +#endif +} +static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l ); +#endif +} + +static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 ); } #endif @@ -1051,25 +1085,23 @@ MUNGE_8x8_NNZ( BACKUP ) for( int p = 0; p < 3; p++ ) - for( int i = 0; i < 4; i++ ) - if( h->mb.i_cbp_luma & ( 1 << i ) ) - x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i*4+p*16, h->dct.luma8x8[i+p*4], b_intra ); + FOREACH_BIT( i, 0, h->mb.i_cbp_luma ) + x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i*4+p*16, h->dct.luma8x8[i+p*4], b_intra ); MUNGE_8x8_NNZ( RESTORE ) } else { - for( int i = 0; i < 4; i++ ) - if( h->mb.i_cbp_luma & ( 1 << i ) ) - x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i] ); + FOREACH_BIT( i, 0, h->mb.i_cbp_luma ) + x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i] ); } } else { for( int p = 0; p < plane_count; p++ ) - for( int i = 0; i < 16; i++ ) - if( h->mb.i_cbp_luma & ( 1 << ( i >> 2 ) ) ) - x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i+p*16, h->dct.luma4x4[i+p*16], b_intra ); + FOREACH_BIT( i8x8, 0, h->mb.i_cbp_luma ) + for( int i = 0; i < 4; i++ ) + x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i+i8x8*4+p*16, h->dct.luma4x4[i+i8x8*4+p*16], b_intra ); } if( chroma && h->mb.i_cbp_chroma ) /* Chroma DC residual present */
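The CABAC residual loops above now iterate only over the set bits of the coded block pattern via FOREACH_BIT (presumably a macro defined elsewhere in the tree) instead of testing every 8x8 index. A plain-C equivalent of that bit-iteration pattern, for reference only:

#include <stdint.h>

static void visit_set_bits( uint32_t mask, void (*visit)( int idx, void *ctx ), void *ctx )
{
    while( mask )
    {
        int idx = __builtin_ctz( mask ); /* lowest set bit; GCC/Clang builtin, mask != 0 here */
        visit( idx, ctx );
        mask &= mask - 1;                /* clear that bit and continue */
    }
}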
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/cavlc.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/cavlc.c
Changed
@@ -128,13 +128,13 @@ unsigned int i_sign; /* level and run and total */ - /* set these to 2 to allow branchless i_trailing calculation */ - runlevel.level[1] = 2; - runlevel.level[2] = 2; i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel ); x264_prefetch( &x264_run_before[runlevel.mask] ); i_total_zero = runlevel.last + 1 - i_total; + /* branchless i_trailing calculation */ + runlevel.level[i_total+0] = 2; + runlevel.level[i_total+1] = 2; i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1 | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2) | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4); @@ -213,11 +213,14 @@ bs_t *s = &h->out.bs; int i_dqp = h->mb.i_qp - h->mb.i_last_qp; - /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */ + /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely + * flat background area. Don't do this if it would raise the quantizer, since that could + * cause unexpected deblocking artifacts. */ if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) && !h->mb.cache.non_zero_count[x264_scan8[LUMA_DC]] && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] - && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] ) + && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] + && h->mb.i_qp > h->mb.i_last_qp ) { #if !RDO_SKIP_BS h->mb.i_qp = h->mb.i_last_qp; @@ -268,20 +271,33 @@ } } -static inline void x264_cavlc_macroblock_luma_residual( x264_t *h, int i8start, int i8end ) +static ALWAYS_INLINE void x264_cavlc_macroblock_luma_residual( x264_t *h, int plane_count ) { if( h->mb.b_transform_8x8 ) { /* shuffle 8x8 dct coeffs into 4x4 lists */ - for( int i8 = i8start; i8 <= i8end; i8++ ) - if( h->mb.cache.non_zero_count[x264_scan8[i8*4]] ) - h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] ); + for( int p = 0; p < plane_count; p++ ) + for( int i8 = 0; i8 < 4; i8++ ) + if( h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] ) + h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[p*16+i8*4], h->dct.luma8x8[p*4+i8], + &h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] ); } - for( int i8 = i8start; i8 <= i8end; i8++ ) - if( h->mb.i_cbp_luma & (1 << (i8&3)) ) + for( int p = 0; p < plane_count; p++ ) + FOREACH_BIT( i8, 0, h->mb.i_cbp_luma ) for( int i4 = 0; i4 < 4; i4++ ) - x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4] ); + x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] ); +} + +static ALWAYS_INLINE void x264_cavlc_partition_luma_residual( x264_t *h, int i8, int p ) +{ + if( h->mb.b_transform_8x8 && h->mb.cache.non_zero_count[x264_scan8[i8*4]] ) + h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4+p*16], h->dct.luma8x8[i8+p*4], + &h->mb.cache.non_zero_count[x264_scan8[i8*4+p*16]] ); + + if( h->mb.i_cbp_luma & (1 << i8) ) + for( int i4 = 0; i4 < 4; i4++ ) + x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] ); } static void x264_cavlc_mb_header_i( x264_t *h, int i_mb_type, int i_mb_i_offset, int chroma ) @@ -552,7 +568,7 @@ else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma ) { x264_cavlc_qp_delta( h ); - x264_cavlc_macroblock_luma_residual( h, 0, plane_count*4-1 ); + x264_cavlc_macroblock_luma_residual( h, plane_count ); } if( h->mb.i_cbp_chroma ) { @@ -612,7 +628,7 @@ for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- ) { 
for( int p = 0; p < plane_count; p++ ) - x264_cavlc_macroblock_luma_residual( h, p*4+i8, p*4+i8 ); + x264_cavlc_partition_luma_residual( h, i8, p ); if( h->mb.i_cbp_chroma ) { if( CHROMA_FORMAT == CHROMA_422 ) @@ -665,7 +681,7 @@ h->out.bs.i_bits_encoded = x264_cavlc_intra4x4_pred_size( h, 4*i8, i_mode ); bs_write_ue( &h->out.bs, cbp_to_golomb[!CHROMA444][1][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] ); for( int p = 0; p < plane_count; p++ ) - x264_cavlc_macroblock_luma_residual( h, p*4+i8, p*4+i8 ); + x264_cavlc_partition_luma_residual( h, i8, p ); return h->out.bs.i_bits_encoded; }
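The cavlc.c change moves the two sentinel levels to just past the real coefficients so the branchless trailing-ones test keeps working for any i_total. That test relies on the identity that ((x+1) | (1-x)) has its sign bit set exactly when |x| > 1, plus arithmetic right shift of negative integers, which x264 assumes throughout. A small self-check, purely illustrative:

#include <assert.h>
#include <stdint.h>

static int abs_gt1( int32_t x )
{
    return (int)((((x + 1) | (1 - x)) >> 31) & 1);
}

int main( void )
{
    for( int32_t x = -64; x <= 64; x++ )
        assert( abs_gt1( x ) == (x < -1 || x > 1) );
    return 0;
}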
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/encoder.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/encoder.c
Changed
@@ -353,34 +353,49 @@ /* If we are within a reasonable distance of the end of the memory allocated for the bitstream, */ /* reallocate, adding an arbitrary amount of space. */ -static int x264_bitstream_check_buffer( x264_t *h ) +static int x264_bitstream_check_buffer_internal( x264_t *h, int size, int b_cabac, int i_nal ) { - uint8_t *bs_bak = h->out.p_bitstream; - int max_row_size = (2500 << SLICE_MBAFF) * h->mb.i_mb_width; - if( (h->param.b_cabac && (h->cabac.p_end - h->cabac.p < max_row_size)) || - (h->out.bs.p_end - h->out.bs.p < max_row_size) ) + if( (b_cabac && (h->cabac.p_end - h->cabac.p < size)) || + (h->out.bs.p_end - h->out.bs.p < size) ) { - h->out.i_bitstream += max_row_size; - CHECKED_MALLOC( h->out.p_bitstream, h->out.i_bitstream ); - h->mc.memcpy_aligned( h->out.p_bitstream, bs_bak, (h->out.i_bitstream - max_row_size) & ~15 ); - intptr_t delta = h->out.p_bitstream - bs_bak; + int buf_size = h->out.i_bitstream + size; + uint8_t *buf = x264_malloc( buf_size ); + if( !buf ) + return -1; + int aligned_size = h->out.i_bitstream & ~15; + h->mc.memcpy_aligned( buf, h->out.p_bitstream, aligned_size ); + memcpy( buf + aligned_size, h->out.p_bitstream + aligned_size, h->out.i_bitstream - aligned_size ); + + intptr_t delta = buf - h->out.p_bitstream; h->out.bs.p_start += delta; h->out.bs.p += delta; - h->out.bs.p_end = h->out.p_bitstream + h->out.i_bitstream; + h->out.bs.p_end = buf + buf_size; h->cabac.p_start += delta; h->cabac.p += delta; - h->cabac.p_end = h->out.p_bitstream + h->out.i_bitstream; + h->cabac.p_end = buf + buf_size; - for( int i = 0; i <= h->out.i_nal; i++ ) + for( int i = 0; i <= i_nal; i++ ) h->out.nal[i].p_payload += delta; - x264_free( bs_bak ); + + x264_free( h->out.p_bitstream ); + h->out.p_bitstream = buf; + h->out.i_bitstream = buf_size; } return 0; -fail: - x264_free( bs_bak ); - return -1; +} + +static int x264_bitstream_check_buffer( x264_t *h ) +{ + int max_row_size = (2500 << SLICE_MBAFF) * h->mb.i_mb_width; + return x264_bitstream_check_buffer_internal( h, max_row_size, h->param.b_cabac, h->out.i_nal ); +} + +static int x264_bitstream_check_buffer_filler( x264_t *h, int filler ) +{ + filler += 32; // add padding for safety + return x264_bitstream_check_buffer_internal( h, filler, 0, -1 ); } #if HAVE_THREAD @@ -417,17 +432,33 @@ static int x264_validate_parameters( x264_t *h, int b_open ) { #if HAVE_MMX -#ifdef __SSE__ - if( b_open && !(x264_cpu_detect() & X264_CPU_SSE) ) + if( b_open ) { - x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n"); + int cpuflags = x264_cpu_detect(); + int fail = 0; +#ifdef __SSE__ + if( !(cpuflags & X264_CPU_SSE) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm\n"); + fail = 1; + } #else - if( b_open && !(x264_cpu_detect() & X264_CPU_MMX2) ) - { - x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n"); + if( !(cpuflags & X264_CPU_MMX2) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm\n"); + fail = 1; + } #endif - x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n"); - return -1; + if( !fail && !(cpuflags & X264_CPU_CMOV) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n"); + fail = 1; + } + if( fail ) + { + x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n"); + return -1; + 
} } #endif @@ -503,8 +534,6 @@ if( h->param.i_threads == X264_THREADS_AUTO ) h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2; - if( h->param.i_lookahead_threads == X264_THREADS_AUTO ) - h->param.i_lookahead_threads = h->param.i_threads / (h->param.b_sliced_threads?1:6); int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 ); if( h->param.i_threads > 1 ) { @@ -518,7 +547,6 @@ h->param.i_threads = X264_MIN( h->param.i_threads, max_sliced_threads ); } h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX ); - h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) ); if( h->param.i_threads == 1 ) { h->param.b_sliced_threads = 0; @@ -528,6 +556,28 @@ if( h->i_thread_frames > 1 ) h->param.nalu_process = NULL; + if( h->param.b_opencl ) + { +#if !HAVE_OPENCL + x264_log( h, X264_LOG_WARNING, "OpenCL: not compiled with OpenCL support, disabling\n" ); + h->param.b_opencl = 0; +#elif BIT_DEPTH > 8 + x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth, disabling opencl\n" ); + h->param.b_opencl = 0; +#else + if( h->param.i_width < 32 || h->param.i_height < 32 ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: frame size is too small, disabling opencl\n" ); + h->param.b_opencl = 0; + } +#endif + if( h->param.opencl_device_id && h->param.i_opencl_device ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: device id and device skip count configured; dropping skip\n" ); + h->param.i_opencl_device = 0; + } + } + h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE ); if( h->param.i_keyint_max == 1 ) { @@ -646,7 +696,7 @@ h->param.rc.i_rc_method == X264_RC_ABR ) { x264_log( h, X264_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR\n" ); - h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate; + h->param.rc.i_bitrate = h->param.rc.i_vbv_max_bitrate; } } else if( h->param.rc.i_vbv_max_bitrate ) @@ -657,6 +707,22 @@ h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 ); h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 ); + h->param.i_slice_min_mbs = X264_MAX( h->param.i_slice_min_mbs, 0 ); + if( h->param.i_slice_max_mbs ) + h->param.i_slice_min_mbs = X264_MIN( h->param.i_slice_min_mbs, h->param.i_slice_max_mbs/2 ); + else if( !h->param.i_slice_max_size ) + h->param.i_slice_min_mbs = 0; + if( PARAM_INTERLACED && h->param.i_slice_min_mbs ) + { + x264_log( h, X264_LOG_WARNING, "interlace + slice-min-mbs is not implemented\n" ); + h->param.i_slice_min_mbs = 0; + } + int mb_width = (h->param.i_width+15)/16; + if( h->param.i_slice_min_mbs > mb_width ) + { + x264_log( h, X264_LOG_WARNING, "slice-min-mbs > row mb size (%d) not implemented\n", mb_width ); + h->param.i_slice_min_mbs = mb_width; + } int max_slices = (h->param.i_height+((16<<PARAM_INTERLACED)-1))/(16<<PARAM_INTERLACED); if( h->param.b_sliced_threads ) @@ -667,6 +733,8 @@ if( h->param.i_slice_max_mbs || h->param.i_slice_max_size ) h->param.i_slice_count = 0; } + if( h->param.i_slice_count_max > 0 ) + h->param.i_slice_count_max = X264_MAX( h->param.i_slice_count, h->param.i_slice_count_max ); if( h->param.b_bluray_compat ) { @@ -895,6 +963,35 @@ h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART ); + if( h->param.i_lookahead_threads == X264_THREADS_AUTO ) + { + if( h->param.b_sliced_threads ) + h->param.i_lookahead_threads = h->param.i_threads; + 
else + { + /* If we're using much slower lookahead settings than encoding settings, it helps a lot to use + * more lookahead threads. This typically happens in the first pass of a two-pass encode, so + * try to guess at this sort of case. + * + * Tuned by a little bit of real encoding with the various presets. */ + int badapt = h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS; + int subme = X264_MIN( h->param.analyse.i_subpel_refine / 3, 3 ) + (h->param.analyse.i_subpel_refine > 1); + int bframes = X264_MIN( (h->param.i_bframe - 1) / 3, 3 ); + + /* [b-adapt 0/1 vs 2][quantized subme][quantized bframes] */ + static const uint8_t lookahead_thread_div[2][5][4] = + {{{6,6,6,6}, {3,3,3,3}, {4,4,4,4}, {6,6,6,6}, {12,12,12,12}}, + {{3,2,1,1}, {2,1,1,1}, {4,3,2,1}, {6,4,3,2}, {12, 9, 6, 4}}}; + + h->param.i_lookahead_threads = h->param.i_threads / lookahead_thread_div[badapt][subme][bframes]; + /* Since too many lookahead threads significantly degrades lookahead accuracy, limit auto + * lookahead threads to about 8 macroblock rows high each at worst. This number is chosen + * pretty much arbitrarily. */ + h->param.i_lookahead_threads = X264_MIN( h->param.i_lookahead_threads, h->param.i_height / 128 ); + } + } + h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) ); + if( PARAM_INTERLACED ) { if( h->param.analyse.i_me_method >= X264_ME_ESA ) @@ -982,7 +1079,9 @@ BOOLIFY( b_fake_interlaced ); BOOLIFY( b_open_gop ); BOOLIFY( b_bluray_compat ); + BOOLIFY( b_stitchable ); BOOLIFY( b_full_recon ); + BOOLIFY( b_opencl ); BOOLIFY( analyse.b_transform_8x8 ); BOOLIFY( analyse.b_weighted_bipred ); BOOLIFY( analyse.b_chroma_me ); @@ -1221,7 +1320,7 @@ x264_dct_init( h->param.cpu, &h->dctf ); x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced ); memcpy( &h->zigzagf, PARAM_INTERLACED ? &h->zigzagf_interlaced : &h->zigzagf_progressive, sizeof(h->zigzagf) ); - x264_mc_init( h->param.cpu, &h->mc ); + x264_mc_init( h->param.cpu, &h->mc, h->param.b_cpu_independent ); x264_quant_init( h, h->param.cpu, &h->quantf ); x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED ); x264_bitstream_init( h->param.cpu, &h->bsf ); @@ -1236,6 +1335,9 @@ p = buf + sprintf( buf, "using cpu capabilities:" ); for( int i = 0; x264_cpu_names[i].flags; i++ ) { + if( !strcmp(x264_cpu_names[i].name, "SSE") + && h->param.cpu & (X264_CPU_SSE2) ) + continue; if( !strcmp(x264_cpu_names[i].name, "SSE2") && h->param.cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) ) continue; @@ -1245,6 +1347,9 @@ if( !strcmp(x264_cpu_names[i].name, "SSE4.1") && (h->param.cpu & X264_CPU_SSE42) ) continue; + if( !strcmp(x264_cpu_names[i].name, "BMI1") + && (h->param.cpu & X264_CPU_BMI2) ) + continue; if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) ) p += sprintf( p, " %s", x264_cpu_names[i].name ); @@ -1277,7 +1382,7 @@ { x264_log( h, X264_LOG_ERROR, "CLZ test failed: x264 has been miscompiled!\n" ); #if ARCH_X86 || ARCH_X86_64 - x264_log( h, X264_LOG_ERROR, "Are you attempting to run an SSE4a-targeted build on a CPU that\n" ); + x264_log( h, X264_LOG_ERROR, "Are you attempting to run an SSE4a/LZCNT-targeted build on a CPU that\n" ); x264_log( h, X264_LOG_ERROR, "doesn't support it?\n" ); #endif goto fail; @@ -1288,7 +1393,7 @@ * ( h->param.rc.i_rc_method == X264_RC_ABR ? 
pow( 0.95, h->param.rc.i_qp_min ) : pow( 0.95, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor ))); - h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4; + h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4 + 64; /* +4 for startcode, +64 for nal_escape assembly padding */ CHECKED_MALLOC( h->nal_buffer, h->nal_buffer_size ); if( h->param.i_threads > 1 && @@ -1298,6 +1403,18 @@ x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads, (void*)x264_lookahead_thread_init, h ) ) goto fail; +#if HAVE_OPENCL + if( h->param.b_opencl ) + { + h->opencl.ocl = x264_opencl_load_library(); + if( !h->opencl.ocl ) + { + x264_log( h, X264_LOG_WARNING, "failed to load OpenCL\n" ); + h->param.b_opencl = 0; + } + } +#endif + h->thread[0] = h; for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ ) CHECKED_MALLOC( h->thread[i], sizeof(x264_t) ); @@ -1338,6 +1455,11 @@ goto fail; } +#if HAVE_OPENCL + if( h->param.b_opencl && x264_opencl_lookahead_init( h ) < 0 ) + h->param.b_opencl = 0; +#endif + if( x264_lookahead_init( h, i_slicetype_length ) ) goto fail; @@ -1452,7 +1574,9 @@ COPY( i_bframe_pyramid ); COPY( i_slice_max_size ); COPY( i_slice_max_mbs ); + COPY( i_slice_min_mbs ); COPY( i_slice_count ); + COPY( i_slice_count_max ); COPY( b_tff ); /* VBV can't be turned on if it wasn't on to begin with */ @@ -1529,9 +1653,9 @@ x264_nal_t *nal = &h->out.nal[h->out.i_nal]; uint8_t *end = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8]; nal->i_payload = end - nal->p_payload; - /* nal_escape_mmx reads past the end of the input. + /* Assembly implementation of nal_escape reads past the end of the input. * While undefined padding wouldn't actually affect the output, it makes valgrind unhappy. */ - memset( end, 0xff, 32 ); + memset( end, 0xff, 64 ); if( h->param.nalu_process ) h->param.nalu_process( h, nal, h->fenc->opaque ); h->out.i_nal++; @@ -1541,6 +1665,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start ) { + x264_t *h0 = h->thread[0]; int nal_size = 0, previous_nal_size = 0; if( h->param.nalu_process ) @@ -1557,20 +1682,26 @@ nal_size += h->out.nal[i].i_payload; /* Worst-case NAL unit escaping: reallocate the buffer if it's too small. 
*/ - int necessary_size = nal_size * 3/2 + h->out.i_nal * 4; - if( h->nal_buffer_size < necessary_size ) + int necessary_size = previous_nal_size + nal_size * 3/2 + h->out.i_nal * 4 + 4 + 64; + if( h0->nal_buffer_size < necessary_size ) { - h->nal_buffer_size = necessary_size * 2; - uint8_t *buf = x264_malloc( h->nal_buffer_size ); + necessary_size *= 2; + uint8_t *buf = x264_malloc( necessary_size ); if( !buf ) return -1; if( previous_nal_size ) - memcpy( buf, h->nal_buffer, previous_nal_size ); - x264_free( h->nal_buffer ); - h->nal_buffer = buf; + memcpy( buf, h0->nal_buffer, previous_nal_size ); + + intptr_t delta = buf - h0->nal_buffer; + for( int i = 0; i < start; i++ ) + h->out.nal[i].p_payload += delta; + + x264_free( h0->nal_buffer ); + h0->nal_buffer = buf; + h0->nal_buffer_size = necessary_size; } - uint8_t *nal_buffer = h->nal_buffer + previous_nal_size; + uint8_t *nal_buffer = h0->nal_buffer + previous_nal_size; for( int i = start; i < h->out.i_nal; i++ ) { @@ -1581,7 +1712,7 @@ x264_emms(); - return nal_buffer - (h->nal_buffer + previous_nal_size); + return nal_buffer - (h0->nal_buffer + previous_nal_size); } /**************************************************************************** @@ -2224,8 +2355,12 @@ int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1; int b_hpel = h->fdec->b_kept_as_ref; int orig_last_mb = h->sh.i_last_mb; + int thread_last_mb = h->i_threadslice_end * h->mb.i_mb_width - 1; uint8_t *last_emu_check; - x264_bs_bak_t bs_bak[2]; +#define BS_BAK_SLICE_MAX_SIZE 0 +#define BS_BAK_SLICE_MIN_MBS 1 +#define BS_BAK_ROW_VBV 2 + x264_bs_bak_t bs_bak[3]; b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv; bs_realign( &h->out.bs ); @@ -2273,13 +2408,17 @@ if( x264_bitstream_check_buffer( h ) ) return -1; if( !(i_mb_y & SLICE_MBAFF) && h->param.rc.i_vbv_buffer_size ) - x264_bitstream_backup( h, &bs_bak[1], i_skip, 1 ); + x264_bitstream_backup( h, &bs_bak[BS_BAK_ROW_VBV], i_skip, 1 ); if( !h->mb.b_reencode_mb ) x264_fdec_filter_row( h, i_mb_y, 0 ); } if( !(i_mb_y & SLICE_MBAFF) && back_up_bitstream ) - x264_bitstream_backup( h, &bs_bak[0], i_skip, 0 ); + { + x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], i_skip, 0 ); + if( slice_max_size && (thread_last_mb+1-mb_xy) == h->param.i_slice_min_mbs ) + x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MIN_MBS], i_skip, 0 ); + } if( PARAM_INTERLACED ) { @@ -2342,7 +2481,7 @@ h->mb.i_skip_intra = 0; h->mb.b_skip_mc = 0; h->mb.b_overflow = 0; - x264_bitstream_restore( h, &bs_bak[0], &i_skip, 0 ); + x264_bitstream_restore( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], &i_skip, 0 ); goto reencode; } } @@ -2367,26 +2506,50 @@ /* We'll just re-encode this last macroblock if we go over the max slice size. */ if( total_bits - starting_bits > slice_max_size && !h->mb.b_reencode_mb ) { - if( mb_xy-SLICE_MBAFF*h->mb.i_mb_stride != h->sh.i_first_mb ) + if( !x264_frame_new_slice( h, h->fdec ) ) { - x264_bitstream_restore( h, &bs_bak[0], &i_skip, 0 ); - h->mb.b_reencode_mb = 1; - if( SLICE_MBAFF ) + /* Handle the most obnoxious slice-min-mbs edge case: we need to end the slice + * because it's gone over the maximum size, but doing so would violate slice-min-mbs. + * If possible, roll back to the last checkpoint and try again. + * We could try raising QP, but that would break in the case where a slice spans multiple + * rows, which the re-encoding infrastructure can't currently handle. 
*/ + if( mb_xy <= thread_last_mb && (thread_last_mb+1-mb_xy) < h->param.i_slice_min_mbs ) { - // set to bottom of previous mbpair - if( i_mb_x ) - h->sh.i_last_mb = mb_xy-1+h->mb.i_mb_stride*(!(i_mb_y&1)); + if( thread_last_mb-h->param.i_slice_min_mbs < h->sh.i_first_mb+h->param.i_slice_min_mbs ) + { + x264_log( h, X264_LOG_WARNING, "slice-max-size violated (frame %d, cause: slice-min-mbs)\n", h->i_frame ); + slice_max_size = 0; + goto cont; + } + x264_bitstream_restore( h, &bs_bak[BS_BAK_SLICE_MIN_MBS], &i_skip, 0 ); + h->mb.b_reencode_mb = 1; + h->sh.i_last_mb = thread_last_mb-h->param.i_slice_min_mbs; + break; + } + if( mb_xy-SLICE_MBAFF*h->mb.i_mb_stride != h->sh.i_first_mb ) + { + x264_bitstream_restore( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], &i_skip, 0 ); + h->mb.b_reencode_mb = 1; + if( SLICE_MBAFF ) + { + // set to bottom of previous mbpair + if( i_mb_x ) + h->sh.i_last_mb = mb_xy-1+h->mb.i_mb_stride*(!(i_mb_y&1)); + else + h->sh.i_last_mb = (i_mb_y-2+!(i_mb_y&1))*h->mb.i_mb_stride + h->mb.i_mb_width - 1; + } else - h->sh.i_last_mb = (i_mb_y-2+!(i_mb_y&1))*h->mb.i_mb_stride + h->mb.i_mb_width - 1; + h->sh.i_last_mb = mb_xy-1; + break; } else - h->sh.i_last_mb = mb_xy-1; - break; + h->sh.i_last_mb = mb_xy; } else - h->sh.i_last_mb = mb_xy; + slice_max_size = 0; } } +cont: h->mb.b_reencode_mb = 0; #if HAVE_VISUALIZE @@ -2399,7 +2562,7 @@ if( x264_ratecontrol_mb( h, mb_size ) < 0 ) { - x264_bitstream_restore( h, &bs_bak[1], &i_skip, 1 ); + x264_bitstream_restore( h, &bs_bak[BS_BAK_ROW_VBV], &i_skip, 1 ); h->mb.b_reencode_mb = 1; i_mb_x = 0; i_mb_y = i_mb_y - SLICE_MBAFF; @@ -2498,6 +2661,9 @@ i_mb_x = 0; } } + if( h->sh.i_last_mb < h->sh.i_first_mb ) + return 0; + h->out.nal[h->out.i_nal].i_last_mb = h->sh.i_last_mb; if( h->param.b_cabac ) @@ -2596,27 +2762,35 @@ while( h->sh.i_first_mb + SLICE_MBAFF*h->mb.i_mb_stride <= last_thread_mb ) { h->sh.i_last_mb = last_thread_mb; - if( h->param.i_slice_max_mbs ) + if( !i_slice_num || !x264_frame_new_slice( h, h->fdec ) ) { - if( SLICE_MBAFF ) + if( h->param.i_slice_max_mbs ) { - // convert first to mbaff form, add slice-max-mbs, then convert back to normal form - int last_mbaff = 2*(h->sh.i_first_mb % h->mb.i_mb_width) - + h->mb.i_mb_width*(h->sh.i_first_mb / h->mb.i_mb_width) - + h->param.i_slice_max_mbs - 1; - int last_x = (last_mbaff % (2*h->mb.i_mb_width))/2; - int last_y = (last_mbaff / (2*h->mb.i_mb_width))*2 + 1; - h->sh.i_last_mb = last_x + h->mb.i_mb_stride*last_y; + if( SLICE_MBAFF ) + { + // convert first to mbaff form, add slice-max-mbs, then convert back to normal form + int last_mbaff = 2*(h->sh.i_first_mb % h->mb.i_mb_width) + + h->mb.i_mb_width*(h->sh.i_first_mb / h->mb.i_mb_width) + + h->param.i_slice_max_mbs - 1; + int last_x = (last_mbaff % (2*h->mb.i_mb_width))/2; + int last_y = (last_mbaff / (2*h->mb.i_mb_width))*2 + 1; + h->sh.i_last_mb = last_x + h->mb.i_mb_stride*last_y; + } + else + { + h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1; + if( h->sh.i_last_mb < last_thread_mb && last_thread_mb - h->sh.i_last_mb < h->param.i_slice_min_mbs ) + h->sh.i_last_mb = last_thread_mb - h->param.i_slice_min_mbs; + } + i_slice_num++; + } + else if( h->param.i_slice_count && !h->param.b_sliced_threads ) + { + int height = h->mb.i_mb_height >> PARAM_INTERLACED; + int width = h->mb.i_mb_width << PARAM_INTERLACED; + i_slice_num++; + h->sh.i_last_mb = (height * i_slice_num + h->param.i_slice_count/2) / h->param.i_slice_count * width - 1; } - else - h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1; - } - 
else if( h->param.i_slice_count && !h->param.b_sliced_threads ) - { - int height = h->mb.i_mb_height >> PARAM_INTERLACED; - int width = h->mb.i_mb_width << PARAM_INTERLACED; - i_slice_num++; - h->sh.i_last_mb = (height * i_slice_num + h->param.i_slice_count/2) / h->param.i_slice_count * width - 1; } h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb ); if( x264_stack_align( x264_slice_write, h ) ) @@ -2755,6 +2929,11 @@ int i_nal_type, i_nal_ref_idc, i_global_qp; int overhead = NALU_OVERHEAD; +#if HAVE_OPENCL + if( h->opencl.b_fatal_error ) + return -1; +#endif + if( h->i_thread_frames > 1 ) { thread_prev = h->thread[ h->i_thread_phase ]; @@ -3324,6 +3503,8 @@ else f = X264_MAX( 0, filler - overhead ); + if( x264_bitstream_check_buffer_filler( h, f ) ) + return -1; x264_nal_start( h, NAL_FILLER, NAL_PRIORITY_DISPOSABLE ); x264_filler_write( h, &h->out.bs, f ); if( x264_nal_end( h ) ) @@ -3503,6 +3684,11 @@ x264_lookahead_delete( h ); +#if HAVE_OPENCL + x264_opencl_lookahead_delete( h ); + x264_opencl_function_t *ocl = h->opencl.ocl; +#endif + if( h->param.b_sliced_threads ) x264_threadpool_wait_all( h ); if( h->param.i_threads > 1 ) @@ -3852,6 +4038,9 @@ x264_pthread_cond_destroy( &h->thread[i]->cv ); x264_free( h->thread[i] ); } +#if HAVE_OPENCL + x264_opencl_close_library( ocl ); +#endif } int x264_encoder_delayed_frames( x264_t *h )
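A note on the buffer sizing in the encoder.c hunk above: H.264 emulation prevention can expand every pair of zero bytes into three (00 00 becomes 00 00 03 when the next byte is small), so escaped NAL data can grow by at most a 3/2 factor; the extra 4 bytes cover a start code and the 64 bytes of padding keep the assembly nal_escape from reading past the end of the buffer, as the updated comment says. A minimal sketch of that sizing rule, with hypothetical names (the real code also adds previous_nal_size for NALs already written, omitted here):

#include <stdio.h>
#include <stddef.h>

/* Illustrative only: worst-case buffer size after NAL escaping.
 * 3/2  - emulation prevention worst case (00 00 -> 00 00 03)
 * *4   - a 4-byte start code per NAL, +4 for the current one
 * +64  - padding so the SIMD nal_escape may safely over-read */
static size_t nal_worst_case_size( size_t payload_bytes, int nal_count )
{
    return payload_bytes * 3 / 2 + (size_t)nal_count * 4 + 4 + 64;
}

int main( void )
{
    printf( "worst case for 1000 payload bytes in 2 NALs: %zu bytes\n",
            nal_worst_case_size( 1000, 2 ) );
    return 0;
}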
x264-snapshot-20130224-2245.tar.bz2/encoder/lookahead.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/lookahead.c
Changed
@@ -70,18 +70,19 @@ x264_stack_align( x264_slicetype_decide, h ); x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] ); + int shift_frames = h->lookahead->next.list[0]->i_bframes + 1; x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex ); while( h->lookahead->ofbuf.i_size == h->lookahead->ofbuf.i_max_size ) x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_empty, &h->lookahead->ofbuf.mutex ); x264_pthread_mutex_lock( &h->lookahead->next.mutex ); - x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 ); + x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames ); x264_pthread_mutex_unlock( &h->lookahead->next.mutex ); /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) - x264_stack_align( x264_slicetype_analyse, h, 1 ); + x264_stack_align( x264_slicetype_analyse, h, shift_frames ); x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); } @@ -236,11 +237,12 @@ x264_stack_align( x264_slicetype_decide, h ); x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] ); - x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 ); + int shift_frames = h->lookahead->next.list[0]->i_bframes + 1; + x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames ); /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) - x264_stack_align( x264_slicetype_analyse, h, 1 ); + x264_stack_align( x264_slicetype_analyse, h, shift_frames ); x264_lookahead_encoder_shift( h ); }
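The lookahead.c change caches shift_frames before x264_lookahead_shift rearranges the next list, then reuses it both for the shift and for x264_slicetype_analyse (which previously received a hard-coded 1). A toy model of why the value has to be captured first, using hypothetical stand-in types:

#include <stdio.h>

/* After the shift, list[0] is a different frame, so evaluating
 * "list[0]->i_bframes + 1" afterwards would describe the wrong frame. */
typedef struct { int i_bframes; } toy_frame_t;

static void toy_shift( toy_frame_t **list, int *size, int count )
{
    for( int i = count; i < *size; i++ )
        list[i - count] = list[i];
    *size -= count;
}

int main( void )
{
    toy_frame_t a = { 2 }, b = { 0 }, c = { 1 }, d = { 0 };
    toy_frame_t *next[4] = { &a, &b, &c, &d };
    int size = 4;

    int shift_frames = next[0]->i_bframes + 1;   /* capture first: 3 */
    toy_shift( next, &size, shift_frames );

    printf( "cached value: %d, value read after the shift: %d\n",
            shift_frames, next[0]->i_bframes + 1 );   /* 3 vs 1 */
    return 0;
}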
x264-snapshot-20130224-2245.tar.bz2/encoder/macroblock.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/macroblock.c
Changed
@@ -128,8 +128,8 @@ pixel *p_src = h->mb.pic.p_fenc[p]; pixel *p_dst = h->mb.pic.p_fdec[p]; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] ); - ALIGNED_ARRAY_16( dctcoef, dct_dc4x4,[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] ); int nz, block_cbp = 0; int decimate_score = h->mb.b_dct_decimate ? 0 : 9; @@ -157,28 +157,51 @@ return; } + M32( &h->mb.cache.non_zero_count[x264_scan8[ 0+p*16]] ) = 0; + M32( &h->mb.cache.non_zero_count[x264_scan8[ 2+p*16]] ) = 0; + M32( &h->mb.cache.non_zero_count[x264_scan8[ 8+p*16]] ) = 0; + M32( &h->mb.cache.non_zero_count[x264_scan8[10+p*16]] ) = 0; + h->dctf.sub16x16_dct( dct4x4, p_src, p_dst ); - for( int i = 0; i < 16; i++ ) + if( h->mb.b_noise_reduction ) + for( int idx = 0; idx < 16; idx++ ) + h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 ); + + for( int idx = 0; idx < 16; idx++ ) { - /* copy dc coeff */ - if( h->mb.b_noise_reduction ) - h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[0], h->nr_offset[0], 16 ); - dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0]; - dct4x4[i][0] = 0; + dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0]; + dct4x4[idx][0] = 0; + } - /* quant/scan/dequant */ - if( h->mb.b_trellis ) - nz = x264_quant_4x4_trellis( h, dct4x4[i], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, i ); - else - nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); - h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz; - if( nz ) + if( h->mb.b_trellis ) + { + for( int idx = 0; idx < 16; idx++ ) + if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) ) + { + block_cbp = 0xf; + h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp ); + if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] ); + h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1; + } + } + else + { + for( int i8x8 = 0; i8x8 < 4; i8x8++ ) { - h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+i], dct4x4[i] ); - h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[i_quant_cat], i_qp ); - if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+i] ); - block_cbp = 0xf; + nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); + if( nz ) + { + block_cbp = 0xf; + FOREACH_BIT( idx, i8x8*4, nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp ); + if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] ); + h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1; + } + } } } @@ -245,6 +268,18 @@ h->mb.i_cbp_chroma = 0; h->nr_count[2] += h->mb.b_noise_reduction * 4; + M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0; + if( chroma422 ) + { + M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0; + } + /* Early termination: check variance of chroma residual before encoding. * Don't bother trying early termination at low QPs. 
* Values are experimentally derived. */ @@ -259,17 +294,6 @@ score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] ); if( score < thresh*4 ) { - M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0; - if( chroma422 ) - { - M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0; - } h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0; h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0; @@ -326,10 +350,10 @@ { pixel *p_src = h->mb.pic.p_fenc[1+ch]; pixel *p_dst = h->mb.pic.p_fdec[1+ch]; - int i_decimate_score = 0; + int i_decimate_score = b_decimate ? 0 : 7; int nz_ac = 0; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] ); if( h->mb.b_lossless ) { @@ -361,20 +385,40 @@ dct2x2dc( dct_dc, dct4x4 ); /* calculate dct coeffs */ - for( int i = 0; i < (chroma422?8:4); i++ ) + for( int i8x8 = 0; i8x8 < (chroma422?2:1); i8x8++ ) { if( h->mb.b_trellis ) - nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ); + { + for( int i4x4 = 0; i4x4 < 4; i4x4++ ) + { + if( x264_quant_4x4_trellis( h, dct4x4[i8x8*4+i4x4], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ) ) + { + int idx = 16+ch*16+i8x8*8+i4x4; + h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] ); + h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp ); + if( i_decimate_score < 7 ) + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = 1; + nz_ac = 1; + } + } + } else - nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] ); - h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz; - if( nz ) { - nz_ac = 1; - h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] ); - h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp ); - if( b_decimate ) - i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] ); + nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4IC+b_inter][i_qp], + h->quant4_bias[CQM_4IC+b_inter][i_qp] ); + nz_ac |= nz; + + FOREACH_BIT( i4x4, 0, nz ) + { + int idx = 16+ch*16+i8x8*8+i4x4; + + h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] ); + h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp ); + if( i_decimate_score < 7 ) + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = 1; + } } } @@ -390,7 +434,7 @@ h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc; - if( (b_decimate && i_decimate_score < 7) || !nz_ac ) + if( i_decimate_score < 7 || !nz_ac ) { /* Decimate the block */ M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0; @@ -646,11 +690,8 @@ { h->mb.b_transform_8x8 = 0; - for( int p = 0; p < plane_count; p++ ) - { + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) x264_mb_encode_i16x16( h, p, i_qp ); - i_qp = h->mb.i_chroma_qp; - } } else if( h->mb.i_type == I_8x8 ) { @@ -668,14 +709,13 @@ if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->dct.luma8x8, 
h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) ); } - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0 ; i < 4; i++ ) { int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]]; x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 ); } - i_qp = h->mb.i_chroma_qp; } } else if( h->mb.i_type == I_4x4 ) @@ -694,7 +734,7 @@ if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) ); } - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0 ; i < 16; i++ ) { @@ -707,7 +747,6 @@ x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 ); } - i_qp = h->mb.i_chroma_qp; } } else /* Inter MB */ @@ -744,11 +783,12 @@ } else if( h->mb.b_transform_8x8 ) { - ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] ); + ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] ); b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { + CLEAR_16x16_NNZ( p ); h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] ); h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4; @@ -772,99 +812,92 @@ } } - if( i_decimate_mb < 6 && b_decimate ) - { - plane_cbp = 0; - CLEAR_16x16_NNZ( p ); - } - else + if( i_decimate_mb >= 6 || !b_decimate ) { - for( int idx = 0; idx < 4; idx++ ) + h->mb.i_cbp_luma |= plane_cbp; + FOREACH_BIT( idx, 0, plane_cbp ) { - int x = idx&1; - int y = idx>>1; - - if( plane_cbp&(1<<idx) ) - { - h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp ); - h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] ); - STORE_8x8_NNZ( p, idx, 1 ); - } - else - STORE_8x8_NNZ( p, idx, 0 ); + h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp ); + h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*(idx&1) + 8*(idx>>1)*FDEC_STRIDE], dct8x8[idx] ); + STORE_8x8_NNZ( p, idx, 1 ); } } - h->mb.i_cbp_luma |= plane_cbp; - i_qp = h->mb.i_chroma_qp; } } else { - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] ); - for( int p = 0; p < plane_count; p++ ) + ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] ); + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { + CLEAR_16x16_NNZ( p ); h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] ); - h->nr_count[0+!!p*2] += h->mb.b_noise_reduction * 16; + + if( h->mb.b_noise_reduction ) + { + h->nr_count[0+!!p*2] += 16; + for( int idx = 0; idx < 16; idx++ ) + h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 ); + } int plane_cbp = 0; for( int i8x8 = 0; i8x8 < 4; i8x8++ ) { - int i_decimate_8x8 = 0; - int cbp = 0; - - /* encode one 4x4 block */ - for( int i4x4 = 0; i4x4 < 4; i4x4++ ) + int i_decimate_8x8 = b_decimate ? 
0 : 6; + int nnz8x8 = 0; + if( h->mb.b_trellis ) { - int idx = i8x8 * 4 + i4x4; - - nz = x264_quant_4x4( h, dct4x4[idx], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, idx ); - h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz; - + for( int i4x4 = 0; i4x4 < 4; i4x4++ ) + { + int idx = i8x8*4+i4x4; + if( x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, p*16+idx ) ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp ); + if( i_decimate_8x8 < 6 ) + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] ); + h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1; + nnz8x8 = 1; + } + } + } + else + { + nnz8x8 = nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); if( nz ) { - h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] ); - h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp ); - if( b_decimate && i_decimate_8x8 < 6 ) - i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] ); - cbp = 1; + FOREACH_BIT( idx, i8x8*4, nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp ); + if( i_decimate_8x8 < 6 ) + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] ); + h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1; + } } } - - int x = i8x8&1; - int y = i8x8>>1; - - /* decimate this 8x8 block */ - i_decimate_mb += i_decimate_8x8; - if( b_decimate ) + if( nnz8x8 ) { + i_decimate_mb += i_decimate_8x8; if( i_decimate_8x8 < 4 ) STORE_8x8_NNZ( p, i8x8, 0 ); else plane_cbp |= 1<<i8x8; } - else if( cbp ) - { - h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] ); - plane_cbp |= 1<<i8x8; - } } - if( b_decimate ) + if( i_decimate_mb < 6 ) { - if( i_decimate_mb < 6 ) - { - plane_cbp = 0; - CLEAR_16x16_NNZ( p ); - } - else + plane_cbp = 0; + CLEAR_16x16_NNZ( p ); + } + else + { + h->mb.i_cbp_luma |= plane_cbp; + FOREACH_BIT( i8x8, 0, plane_cbp ) { - for( int i8x8 = 0; i8x8 < 4; i8x8++ ) - if( plane_cbp&(1<<i8x8) ) - h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] ); + h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] ); } } - h->mb.i_cbp_luma |= plane_cbp; - i_qp = h->mb.i_chroma_qp; } } } @@ -933,12 +966,12 @@ *****************************************************************************/ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma ) { - ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] ); ALIGNED_ARRAY_16( dctcoef, dctscan,[16] ); ALIGNED_4( int16_t mvp[2] ); int i_qp = h->mb.i_qp; - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? 
CQM_4PC : CQM_4PY; if( !b_bidir ) @@ -957,23 +990,23 @@ { int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8; int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8; - /* get luma diff */ + h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset, h->mb.pic.p_fdec[p] + fdec_offset ); - /* encode one 4x4 block */ - for( int i4x4 = 0; i4x4 < 4; i4x4++ ) - { - if( h->mb.b_noise_reduction ) + + if( h->mb.b_noise_reduction ) + for( int i4x4 = 0; i4x4 < 4; i4x4++ ) h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 ); - if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ) ) - continue; - h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); + + int nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ); + FOREACH_BIT( idx, 0, nz ) + { + h->zigzagf.scan_4x4( dctscan, dct4x4[idx] ); i_decimate_mb += h->quantf.decimate_score16( dctscan ); if( i_decimate_mb >= 6 ) return 0; } } - i_qp = h->mb.i_chroma_qp; } if( chroma == CHROMA_420 || chroma == CHROMA_422 ) @@ -1023,6 +1056,7 @@ { h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 ); dct_dc[i4x4] = dct4x4[i4x4][0]; + dct4x4[i4x4][0] = 0; } } else @@ -1043,21 +1077,26 @@ continue; if( !h->mb.b_noise_reduction ) - for( int i = 0; i <= chroma422; i++ ) + for( int i = 0; i <= chroma422; i++ ) + { h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE ); + dct4x4[i*4+0][0] = 0; + dct4x4[i*4+1][0] = 0; + dct4x4[i*4+2][0] = 0; + dct4x4[i*4+3][0] = 0; + } /* calculate dct coeffs */ - for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < (chroma422?8:4); i4x4++ ) + for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < (chroma422?2:1); i8x8++ ) { - dct4x4[i4x4][0] = 0; - if( h->mb.b_noise_reduction ) - h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 ); - if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) ) - continue; - h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); - i_decimate_mb += h->quantf.decimate_score15( dctscan ); - if( i_decimate_mb >= 7 ) - return 0; + int nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); + FOREACH_BIT( idx, i8x8*4, nz ) + { + h->zigzagf.scan_4x4( dctscan, dct4x4[idx] ); + i_decimate_mb += h->quantf.decimate_score15( dctscan ); + if( i_decimate_mb >= 7 ) + return 0; + } } } } @@ -1176,12 +1215,13 @@ { if( h->mb.b_transform_8x8 ) { - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? CQM_8PC : CQM_8PY; pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE; - ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] ); + ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] ); + h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec ); int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 ); if( nnz8x8 ) @@ -1196,50 +1236,74 @@ h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp ); h->dctf.add8x8_idct8( p_fdec, dct8x8 ); STORE_8x8_NNZ( p, i8, 1 ); + h->mb.i_cbp_luma |= 1 << i8; } else STORE_8x8_NNZ( p, i8, 0 ); } else STORE_8x8_NNZ( p, i8, 0 ); - h->mb.i_cbp_luma |= nnz8x8 << i8; - i_qp = h->mb.i_chroma_qp; } } else { - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? 
CQM_4PC : CQM_4PY; pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE; - int i_decimate_8x8 = 0, nnz8x8 = 0; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] ); + int i_decimate_8x8 = b_decimate ? 0 : 4; + ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] ); + int nnz8x8 = 0; + h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); - for( int i4 = 0; i4 < 4; i4++ ) + STORE_8x8_NNZ( p, i8, 0 ); + + if( h->mb.b_noise_reduction ) + for( int idx = 0; idx < 4; idx++ ) + h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 ); + + if( h->mb.b_trellis ) { - nz = x264_quant_4x4( h, dct4x4[i4], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i8*4+i4 ); - h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4]] = nz; + for( int i4x4 = 0; i4x4 < 4; i4x4++ ) + { + if( x264_quant_4x4_trellis( h, dct4x4[i4x4], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, i8*4+i4x4+p*16 ) ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] ); + h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp ); + if( i_decimate_8x8 < 4 ) + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] ); + h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1; + nnz8x8 = 1; + } + } + } + else + { + nnz8x8 = nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ); if( nz ) { - h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4], dct4x4[i4] ); - h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[quant_cat], i_qp ); - if( b_decimate ) - i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4] ); - nnz8x8 = 1; + FOREACH_BIT( i4x4, 0, nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] ); + h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp ); + if( i_decimate_8x8 < 4 ) + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] ); + h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1; + } } } - - if( b_decimate && i_decimate_8x8 < 4 ) - nnz8x8 = 0; - if( nnz8x8 ) - h->dctf.add8x8_idct( p_fdec, dct4x4 ); - else - STORE_8x8_NNZ( p, i8, 0 ); - - h->mb.i_cbp_luma |= nnz8x8 << i8; - i_qp = h->mb.i_chroma_qp; + { + /* decimate this 8x8 block */ + if( i_decimate_8x8 < 4 ) + STORE_8x8_NNZ( p, i8, 0 ); + else + { + h->dctf.add8x8_idct( p_fdec, dct4x4 ); + h->mb.i_cbp_luma |= 1 << i8; + } + } } } @@ -1248,7 +1312,7 @@ i_qp = h->mb.i_chroma_qp; for( int ch = 0; ch < 2; ch++ ) { - ALIGNED_ARRAY_16( dctcoef, dct4x4,[2],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] ); pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE; @@ -1297,7 +1361,7 @@ { int i_qp = h->mb.i_qp; - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? CQM_4PC : CQM_4PY; pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]]; @@ -1313,7 +1377,7 @@ } else { - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] ); h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 ); h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz; @@ -1324,7 +1388,6 @@ h->dctf.add4x4_idct( p_fdec, dct4x4 ); } } - i_qp = h->mb.i_chroma_qp; } }
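Several of the macroblock.c hunks above replace per-block quant_4x4 calls with quant_4x4x4, which quantizes four 4x4 sub-blocks in one call and reports which of them kept nonzero coefficients as a 4-bit mask; the callers then scan and dequantize only the surviving blocks. A toy sketch of that return convention (the quantizer here is a plain threshold, not x264's real quant):

#include <stdio.h>

/* Zero out small coefficients in four 4x4 blocks and return a 4-bit mask
 * with bit i set if block i still has any nonzero coefficient. */
static int toy_quant_4x4x4( int dct[4][16], int threshold )
{
    int nz_mask = 0;
    for( int b = 0; b < 4; b++ )
    {
        int nz = 0;
        for( int i = 0; i < 16; i++ )
        {
            dct[b][i] = ( dct[b][i] >= threshold || dct[b][i] <= -threshold ) ? dct[b][i] : 0;
            nz |= dct[b][i];
        }
        nz_mask |= (!!nz) << b;
    }
    return nz_mask;
}

int main( void )
{
    int dct[4][16] = { { 40 }, { 1 }, { 0 }, { -35 } };   /* remaining coefficients are zero */
    int nz = toy_quant_4x4x4( dct, 8 );
    printf( "nonzero sub-block mask: 0x%X\n", (unsigned)nz );   /* 0x9: blocks 0 and 3 survive */
    return 0;
}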
x264-snapshot-20130224-2245.tar.bz2/encoder/macroblock.h -> x264-snapshot-20130723-2245.tar.bz2/encoder/macroblock.h
Changed
@@ -104,12 +104,16 @@ M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+10]] ) = 0;\ } while(0) +/* A special for loop that iterates branchlessly over each set + * bit in a 4-bit input. */ +#define FOREACH_BIT(idx,start,mask) for( int idx = start, msk = mask, skip; msk && (skip = x264_ctz_4bit(msk), idx += skip, msk >>= skip+1, 1); idx++ ) + static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode, int b_predict ) { int nz; pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]]; pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]]; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] ); if( b_predict ) { @@ -147,7 +151,7 @@ int nz; pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE]; pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE]; - ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] ); + ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] ); ALIGNED_ARRAY_32( pixel, edge_buf,[36] ); if( b_predict )
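The FOREACH_BIT macro added above walks exactly the set bits of such a 4-bit mask, advancing by the count of trailing zeros of the remaining mask at each step. A standalone illustration, with ctz4 as a stand-in for x264's x264_ctz_4bit helper (whose implementation is not shown in this diff):

#include <stdio.h>

/* Trailing-zero count for a 4-bit value via a small table. */
static int ctz4( int mask )
{
    static const int tab[16] = { 0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
    return tab[mask & 15];
}

/* Same shape as the macro in the hunk above: skip ahead to the next set bit,
 * run the body, then consume that bit and continue. */
#define FOREACH_BIT(idx,start,mask) \
    for( int idx = start, msk = mask, skip; msk && (skip = ctz4(msk), idx += skip, msk >>= skip+1, 1); idx++ )

int main( void )
{
    int nz = 0xA;   /* e.g. a quant_4x4x4 result: sub-blocks 1 and 3 are nonzero */
    FOREACH_BIT( idx, 0, nz )
        printf( "process sub-block %d\n", idx );   /* prints 1, then 3 */
    return 0;
}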
x264-snapshot-20130224-2245.tar.bz2/encoder/me.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/me.c
Changed
@@ -61,21 +61,22 @@ (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2]) #define COST_MV( mx, my )\ +do\ {\ int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\ &p_fref_w[(my)*stride+(mx)], stride )\ + BITS_MVD(mx,my);\ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\ -} +} while(0) -#define COST_MV_HPEL( mx, my ) \ -{ \ - intptr_t stride2 = 16; \ - pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \ - int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \ - + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ - COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \ -} +#define COST_MV_HPEL( mx, my, cost )\ +do\ +{\ + intptr_t stride2 = 16;\ + pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] );\ + cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 )\ + + p_cost_mvx[ mx ] + p_cost_mvy[ my ];\ +} while(0) #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\ {\ @@ -174,6 +175,10 @@ }\ } +#define FPEL(mv) (((mv)+2)>>2) /* Convert subpel MV to fullpel with rounding... */ +#define SPEL(mv) ((mv)<<2) /* ... and the reverse. */ +#define SPELx2(mv) (SPEL(mv)&0xFFFCFFFC) /* for two packed MVs */ + void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh ) { const int bw = x264_pixel_size[m->i_pixel].w; @@ -181,97 +186,136 @@ const int i_pixel = m->i_pixel; const int stride = m->i_stride[0]; int i_me_range = h->param.analyse.i_me_range; - int bmx, bmy, bcost; - int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX; + int bmx, bmy, bcost = COST_MAX; + int bpred_cost = COST_MAX; int omx, omy, pmx, pmy; pixel *p_fenc = m->p_fenc[0]; pixel *p_fref_w = m->p_fref_w; - ALIGNED_ARRAY_16( pixel, pix,[16*16] ); - - int costs[16]; - - int mv_x_min = h->mb.mv_min_fpel[0]; - int mv_y_min = h->mb.mv_min_fpel[1]; - int mv_x_max = h->mb.mv_max_fpel[0]; - int mv_y_max = h->mb.mv_max_fpel[1]; - int mv_x_min_qpel = mv_x_min << 2; - int mv_y_min_qpel = mv_y_min << 2; - int mv_x_max_qpel = mv_x_max << 2; - int mv_y_max_qpel = mv_y_max << 2; + ALIGNED_ARRAY_N( pixel, pix,[16*16] ); + ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] ); + + ALIGNED_ARRAY_16( int, costs,[16] ); + + int mv_x_min = h->mb.mv_limit_fpel[0][0]; + int mv_y_min = h->mb.mv_limit_fpel[0][1]; + int mv_x_max = h->mb.mv_limit_fpel[1][0]; + int mv_y_max = h->mb.mv_limit_fpel[1][1]; /* Special version of pack to allow shortcuts in CHECK_MVRANGE */ #define pack16to32_mask2(mx,my) ((mx<<16)|(my&0x7FFF)) uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min ); uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000; + uint32_t pmv, bpred_mv = 0; #define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000)) const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; - uint32_t pmv; - bmx = x264_clip3( m->mvp[0], mv_x_min_qpel, mv_x_max_qpel ); - bmy = x264_clip3( m->mvp[1], mv_y_min_qpel, mv_y_max_qpel ); - pmx = ( bmx + 2 ) >> 2; - pmy = ( bmy + 2 ) >> 2; - bcost = COST_MAX; - - /* try extra predictors if provided */ + /* Try extra predictors if provided. If subme >= 3, check subpel predictors, + * otherwise round them to fullpel. 
*/ if( h->mb.i_subpel_refine >= 3 ) { - pmv = pack16to32_mask(bmx,bmy); - if( i_mvc ) - COST_MV_HPEL( bmx, bmy ); - for( int i = 0; i < i_mvc; i++ ) + /* Calculate and check the MVP first */ + int bpred_mx = x264_clip3( m->mvp[0], SPEL(mv_x_min), SPEL(mv_x_max) ); + int bpred_my = x264_clip3( m->mvp[1], SPEL(mv_y_min), SPEL(mv_y_max) ); + pmv = pack16to32_mask( bpred_mx, bpred_my ); + pmx = FPEL( bpred_mx ); + pmy = FPEL( bpred_my ); + + COST_MV_HPEL( bpred_mx, bpred_my, bpred_cost ); + int pmv_cost = bpred_cost; + + if( i_mvc > 0 ) { - if( M32( mvc[i] ) && (pmv != M32( mvc[i] )) ) + /* Clip MV candidates and eliminate those equal to zero and pmv. */ + int valid_mvcs = x264_predictor_clip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv ); + if( valid_mvcs > 0 ) { - int mx = x264_clip3( mvc[i][0], mv_x_min_qpel, mv_x_max_qpel ); - int my = x264_clip3( mvc[i][1], mv_y_min_qpel, mv_y_max_qpel ); - COST_MV_HPEL( mx, my ); + int i = 1, cost; + /* We stuff pmv here to branchlessly pick between pmv and the various + * MV candidates. [0] gets skipped in order to maintain alignment for + * x264_predictor_clip. */ + M32( mvc_temp[1] ) = pmv; + bpred_cost <<= 4; + do + { + int mx = mvc_temp[i+1][0]; + int my = mvc_temp[i+1][1]; + COST_MV_HPEL( mx, my, cost ); + COPY1_IF_LT( bpred_cost, (cost << 4) + i ); + } while( ++i <= valid_mvcs ); + bpred_mx = mvc_temp[(bpred_cost&15)+1][0]; + bpred_my = mvc_temp[(bpred_cost&15)+1][1]; + bpred_cost >>= 4; } } - bmx = ( bpred_mx + 2 ) >> 2; - bmy = ( bpred_my + 2 ) >> 2; - COST_MV( bmx, bmy ); + + /* Round the best predictor back to fullpel and get the cost, since this is where + * we'll be starting the fullpel motion search. */ + bmx = FPEL( bpred_mx ); + bmy = FPEL( bpred_my ); + bpred_mv = pack16to32_mask(bpred_mx, bpred_my); + if( bpred_mv&0x00030003 ) /* Only test if the tested predictor is actually subpel... */ + COST_MV( bmx, bmy ); + else /* Otherwise just copy the cost (we already know it) */ + bcost = bpred_cost; + + /* Test the zero vector if it hasn't been tested yet. */ + if( pmv ) + { + if( bmx|bmy ) COST_MV( 0, 0 ); + } + /* If a subpel mv candidate was better than the zero vector, the previous + * fullpel check won't have gotten it even if the pmv was zero. So handle + * that possibility here. */ + else + { + COPY3_IF_LT( bcost, pmv_cost, bmx, 0, bmy, 0 ); + } } else { - /* check the MVP */ - bmx = pmx; - bmy = pmy; + /* Calculate and check the fullpel MVP first */ + bmx = pmx = x264_clip3( FPEL(m->mvp[0]), mv_x_min, mv_x_max ); + bmy = pmy = x264_clip3( FPEL(m->mvp[1]), mv_y_min, mv_y_max ); + pmv = pack16to32_mask( bmx, bmy ); + /* Because we are rounding the predicted motion vector to fullpel, there will be * an extra MV cost in 15 out of 16 cases. However, when the predicted MV is * chosen as the best predictor, it is often the case that the subpel search will - * result in a vector at or next to the predicted motion vector. Therefore, it is - * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly - * biasing against use of the predicted motion vector. */ + * result in a vector at or next to the predicted motion vector. Therefore, we omit + * the cost of the MV from the rounded MVP to avoid unfairly biasing against use of + * the predicted motion vector. + * + * Disclaimer: this is a post-hoc rationalization for why this hack works. 
*/ bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride ); - pmv = pack16to32_mask( bmx, bmy ); + if( i_mvc > 0 ) { - ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16],[2] ); - x264_predictor_roundclip( mvc_fpel+2, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max ); - M32( mvc_fpel[1] ) = pmv; - bcost <<= 4; - for( int i = 1; i <= i_mvc; i++ ) + /* Like in subme>=3, except we also round the candidates to fullpel. */ + int valid_mvcs = x264_predictor_roundclip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv ); + if( valid_mvcs > 0 ) { - if( M32( mvc_fpel[i+1] ) && (pmv != M32( mvc_fpel[i+1] )) ) + int i = 1, cost; + M32( mvc_temp[1] ) = pmv; + bcost <<= 4; + do { - int mx = mvc_fpel[i+1][0]; - int my = mvc_fpel[i+1][1]; - int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my ); - cost = (cost << 4) + i; - COPY1_IF_LT( bcost, cost ); - } + int mx = mvc_temp[i+1][0]; + int my = mvc_temp[i+1][1]; + cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my ); + COPY1_IF_LT( bcost, (cost << 4) + i ); + } while( ++i <= valid_mvcs ); + bmx = mvc_temp[(bcost&15)+1][0]; + bmy = mvc_temp[(bcost&15)+1][1]; + bcost >>= 4; } - bmx = mvc_fpel[(bcost&15)+1][0]; - bmy = mvc_fpel[(bcost&15)+1][1]; - bcost >>= 4; } - } - if( pmv ) - COST_MV( 0, 0 ); + /* Same as above, except the condition is simpler. */ + if( pmv ) + COST_MV( 0, 0 ); + } switch( h->mb.i_me_method ) { @@ -358,19 +402,20 @@ bcost >>= 3; #endif /* square refine */ - int dir = 0; + bcost <<= 4; COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs ); - COPY2_IF_LT( bcost, costs[0], dir, 1 ); - COPY2_IF_LT( bcost, costs[1], dir, 2 ); - COPY2_IF_LT( bcost, costs[2], dir, 3 ); - COPY2_IF_LT( bcost, costs[3], dir, 4 ); + COPY1_IF_LT( bcost, (costs[0]<<4)+1 ); + COPY1_IF_LT( bcost, (costs[1]<<4)+2 ); + COPY1_IF_LT( bcost, (costs[2]<<4)+3 ); + COPY1_IF_LT( bcost, (costs[3]<<4)+4 ); COST_MV_X4_DIR( -1,-1, -1,1, 1,-1, 1,1, costs ); - COPY2_IF_LT( bcost, costs[0], dir, 5 ); - COPY2_IF_LT( bcost, costs[1], dir, 6 ); - COPY2_IF_LT( bcost, costs[2], dir, 7 ); - COPY2_IF_LT( bcost, costs[3], dir, 8 ); - bmx += square1[dir][0]; - bmy += square1[dir][1]; + COPY1_IF_LT( bcost, (costs[0]<<4)+5 ); + COPY1_IF_LT( bcost, (costs[1]<<4)+6 ); + COPY1_IF_LT( bcost, (costs[2]<<4)+7 ); + COPY1_IF_LT( bcost, (costs[3]<<4)+8 ); + bmx += square1[bcost&15][0]; + bmy += square1[bcost&15][1]; + bcost >>= 4; break; } @@ -609,7 +654,7 @@ if( h->mb.i_me_method == X264_ME_TESA ) { // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD - mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15) + 4); + mvsad_t *mvsads = (mvsad_t *)(xs + ((width+31)&~31) + 4); int nmvsad = 0, limit; int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12; int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride ) @@ -727,24 +772,22 @@ } /* -> qpel mv */ - if( bpred_cost < bcost ) + uint32_t bmv = pack16to32_mask(bmx,bmy); + uint32_t bmv_spel = SPELx2(bmv); + if( h->mb.i_subpel_refine < 3 ) { - m->mv[0] = bpred_mx; - m->mv[1] = bpred_my; - m->cost = bpred_cost; + m->cost_mv = p_cost_mvx[bmx<<2] + p_cost_mvy[bmy<<2]; + m->cost = bcost; + /* compute the real cost */ + if( bmv == pmv ) m->cost += m->cost_mv; + M32( m->mv ) = bmv_spel; } else { - m->mv[0] = bmx << 2; - m->mv[1] = bmy << 2; - m->cost = bcost; + M32(m->mv) = bpred_cost < bcost ? 
bpred_mv : bmv_spel; + m->cost = X264_MIN( bpred_cost, bcost ); } - /* compute the real cost */ - m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ]; - if( bmx == pmx && bmy == pmy && h->mb.i_subpel_refine < 3 ) - m->cost += m->cost_mv; - /* subpel refine */ if( h->mb.i_subpel_refine >= 2 ) { @@ -831,40 +874,52 @@ int chroma_v_shift = CHROMA_V_SHIFT; int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; - ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment + ALIGNED_ARRAY_N( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment + ALIGNED_ARRAY_16( int, costs,[4] ); int bmx = m->mv[0]; int bmy = m->mv[1]; int bcost = m->cost; int odir = -1, bdir; - /* try the subpel component of the predicted mv */ - if( hpel_iters && h->mb.i_subpel_refine < 3 ) - { - int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 ); - int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 ); - if( (mx-bmx)|(my-bmy) ) - COST_MV_SAD( mx, my ); - } - /* halfpel diamond search */ - for( int i = hpel_iters; i > 0; i-- ) + if( hpel_iters ) { - int omx = bmx, omy = bmy; - int costs[4]; - intptr_t stride = 64; // candidates are either all hpel or all qpel, so one stride is enough - pixel *src0, *src1, *src2, *src3; - src0 = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] ); - src2 = h->mc.get_ref( pix+32, &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] ); - src1 = src0 + stride; - src3 = src2 + 1; - h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs ); - COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx ] + p_cost_mvy[omy-2], bmy, omy-2 ); - COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx ] + p_cost_mvy[omy+2], bmy, omy+2 ); - COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-2] + p_cost_mvy[omy ], bmx, omx-2, bmy, omy ); - COPY3_IF_LT( bcost, costs[3] + p_cost_mvx[omx+2] + p_cost_mvy[omy ], bmx, omx+2, bmy, omy ); - if( (bmx == omx) & (bmy == omy) ) - break; + /* try the subpel component of the predicted mv */ + if( h->mb.i_subpel_refine < 3 ) + { + int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 ); + int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 ); + if( (mx-bmx)|(my-bmy) ) + COST_MV_SAD( mx, my ); + } + + bcost <<= 6; + for( int i = hpel_iters; i > 0; i-- ) + { + int omx = bmx, omy = bmy; + intptr_t stride = 64; // candidates are either all hpel or all qpel, so one stride is enough + pixel *src0, *src1, *src2, *src3; + src0 = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] ); + src2 = h->mc.get_ref( pix+32, &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] ); + src1 = src0 + stride; + src3 = src2 + 1; + h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs ); + costs[0] += p_cost_mvx[omx ] + p_cost_mvy[omy-2]; + costs[1] += p_cost_mvx[omx ] + p_cost_mvy[omy+2]; + costs[2] += p_cost_mvx[omx-2] + p_cost_mvy[omy ]; + costs[3] += p_cost_mvx[omx+2] + p_cost_mvy[omy ]; + COPY1_IF_LT( bcost, (costs[0]<<6)+2 ); + COPY1_IF_LT( bcost, (costs[1]<<6)+6 ); + COPY1_IF_LT( bcost, (costs[2]<<6)+16 ); + COPY1_IF_LT( bcost, (costs[3]<<6)+48 ); + if( !(bcost&63) ) + break; + bmx -= (bcost<<26)>>29; + bmy -= (bcost<<29)>>29; + bcost &= ~63; + } + bcost >>= 6; } if( !b_refine_qpel && (h->pixf.mbcmp_unaligned[0] != h->pixf.fpelcmp[0] || b_chroma_me) ) @@ -909,7 
+964,6 @@ /* Special simplified case for subme=1 */ else if( bmy > h->mb.mv_min_spel[1] && bmy < h->mb.mv_max_spel[1] && bmx > h->mb.mv_min_spel[0] && bmx < h->mb.mv_max_spel[0] ) { - int costs[4]; int omx = bmx, omy = bmy; /* We have to use mc_luma because all strides must be the same to use fpelcmp_x4 */ h->mc.mc_luma( pix , 64, m->p_fref, m->i_stride[0], omx, omy-1, bw, bh, &m->weight[0] ); @@ -917,10 +971,18 @@ h->mc.mc_luma( pix+32, 64, m->p_fref, m->i_stride[0], omx-1, omy, bw, bh, &m->weight[0] ); h->mc.mc_luma( pix+48, 64, m->p_fref, m->i_stride[0], omx+1, omy, bw, bh, &m->weight[0] ); h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], pix, pix+16, pix+32, pix+48, 64, costs ); - COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx ] + p_cost_mvy[omy-1], bmy, omy-1 ); - COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx ] + p_cost_mvy[omy+1], bmy, omy+1 ); - COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-1] + p_cost_mvy[omy ], bmx, omx-1, bmy, omy ); - COPY3_IF_LT( bcost, costs[3] + p_cost_mvx[omx+1] + p_cost_mvy[omy ], bmx, omx+1, bmy, omy ); + costs[0] += p_cost_mvx[omx ] + p_cost_mvy[omy-1]; + costs[1] += p_cost_mvx[omx ] + p_cost_mvy[omy+1]; + costs[2] += p_cost_mvx[omx-1] + p_cost_mvy[omy ]; + costs[3] += p_cost_mvx[omx+1] + p_cost_mvy[omy ]; + bcost <<= 4; + COPY1_IF_LT( bcost, (costs[0]<<4)+1 ); + COPY1_IF_LT( bcost, (costs[1]<<4)+3 ); + COPY1_IF_LT( bcost, (costs[2]<<4)+4 ); + COPY1_IF_LT( bcost, (costs[3]<<4)+12 ); + bmx -= (bcost<<28)>>30; + bmy -= (bcost<<30)>>30; + bcost >>= 4; } m->cost = bcost; @@ -971,9 +1033,9 @@ const int i_pixel = m0->i_pixel; const int bw = x264_pixel_size[i_pixel].w; const int bh = x264_pixel_size[i_pixel].h; - ALIGNED_ARRAY_16( pixel, pixy_buf,[2],[9][16*16] ); - ALIGNED_ARRAY_16( pixel, pixu_buf,[2],[9][16*16] ); - ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][16*16] ); + ALIGNED_ARRAY_N( pixel, pixy_buf,[2],[9][16*16] ); + ALIGNED_ARRAY_N( pixel, pixu_buf,[2],[9][16*16] ); + ALIGNED_ARRAY_N( pixel, pixv_buf,[2],[9][16*16] ); pixel *src[3][2][9]; int chromapix = h->luma2chroma_pixel[i_pixel]; int chroma_v_shift = CHROMA_V_SHIFT; @@ -996,7 +1058,7 @@ uint64_t bcostrd = COST_MAX64; uint16_t amvd; /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */ - ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] ); + ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] ); /* all permutations of an offset in up to 2 of the dimensions */ ALIGNED_4( static const int8_t dia4d[33][4] ) = {
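Much of the me.c rework above relies on packing a candidate index into the low bits of its cost so that a single COPY1_IF_LT comparison tracks both the best cost and which candidate produced it. A minimal sketch of the idiom using a plain comparison; it assumes costs are small enough that cost<<4 cannot overflow, which matches how x264 uses it:

#include <stdio.h>

int main( void )
{
    int costs[4] = { 180, 95, 210, 95 };
    int best = 1 << 28;                     /* "current best", already in packed form */

    for( int i = 0; i < 4; i++ )
    {
        int packed = (costs[i] << 4) + i;   /* low 4 bits carry the candidate index */
        if( packed < best )
            best = packed;                  /* one compare updates cost and index together */
    }

    /* Ties resolve to the earlier candidate, like COPY1_IF_LT's strict "<". */
    printf( "best cost %d from candidate %d\n", best >> 4, best & 15 );
    return 0;
}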
x264-snapshot-20130224-2245.tar.bz2/encoder/ratecontrol.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/ratecontrol.c
Changed
@@ -1118,7 +1118,8 @@ total_qp_aq += qp_aq; p = next; } - h->pps->i_pic_init_qp = SPEC_QP( (int)(total_qp_aq / rc->num_entries + 0.5) ); + if( !h->param.b_stitchable ) + h->pps->i_pic_init_qp = SPEC_QP( (int)(total_qp_aq / rc->num_entries + 0.5) ); x264_free( stats_buf ); @@ -1667,7 +1668,8 @@ rc->qpm = x264_clip3f( (prev_row_qp + rc->qpm)*0.5f, prev_row_qp + 1.0f, qp_max ); rc->qpa_rc = rc->qpa_rc_prev; rc->qpa_aq = rc->qpa_aq_prev; - h->fdec->i_row_bits[y] = h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; + h->fdec->i_row_bits[y] = 0; + h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; return -1; } } @@ -1683,7 +1685,8 @@ rc->qpm = qp_max; rc->qpa_rc = rc->qpa_rc_prev; rc->qpa_aq = rc->qpa_aq_prev; - h->fdec->i_row_bits[y] = h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; + h->fdec->i_row_bits[y] = 0; + h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; return -1; } } @@ -2591,14 +2594,16 @@ if( h->i_frame == 0 ) for( int i = 0; i < h->param.i_threads; i++ ) { - x264_ratecontrol_t *t = h->thread[i]->rc; - memcpy( t->row_preds, rc->row_preds, sizeof(rc->row_preds) ); + x264_t *t = h->thread[i]; + if( t != h ) + memcpy( t->rc->row_preds, rc->row_preds, sizeof(rc->row_preds) ); } for( int i = 0; i < h->param.i_threads; i++ ) { x264_t *t = h->thread[i]; - memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) ); + if( t != h ) + memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) ); t->rc->row_pred = &t->rc->row_preds[h->sh.i_type]; /* Calculate the planned slice size. */ if( rc->b_vbv && rc->frame_size_planned )
x264-snapshot-20130224-2245.tar.bz2/encoder/rdo.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/rdo.c
Changed
@@ -634,13 +634,13 @@ const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac, int b_chroma, int dc, int num_coefs, int idx ) { - ALIGNED_ARRAY_16( dctcoef, orig_coefs, [64] ); - ALIGNED_ARRAY_16( dctcoef, quant_coefs, [64] ); + ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] ); + ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] ); const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab; const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; const int b_interlaced = MB_INTERLACED; - uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; - uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; + uint8_t *cabac_state_sig = &h->cabac.state[ x264_significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; + uint8_t *cabac_state_last = &h->cabac.state[ x264_last_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; int levelgt1_ctx = b_chroma && dc ? 8 : 9; if( dc ) @@ -683,7 +683,7 @@ } int last_nnz = h->quantf.coeff_last[ctx_block_cat]( quant_coefs+b_ac )+b_ac; - uint8_t *cabac_state = &h->cabac.state[ coeff_abs_level_m1_offset[ctx_block_cat] ]; + uint8_t *cabac_state = &h->cabac.state[ x264_coeff_abs_level_m1_offset[ctx_block_cat] ]; /* shortcut for dc-only blocks. * this doesn't affect the output, but saves some unnecessary computation. */ @@ -1161,5 +1161,6 @@ h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz; nzaccum |= nz; } + STORE_8x8_NNZ( 0, idx, 0 ); return nzaccum; }
x264-snapshot-20130224-2245.tar.bz2/encoder/set.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/set.c
Changed
@@ -208,9 +208,9 @@ ( csp >= X264_CSP_BGR ? 1 : 0 ) ); sps->vui.b_color_description_present = 0; - sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 8 ? param->vui.i_colorprim : 2 ); - sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 10 ? param->vui.i_transfer : 2 ); - sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 8 ? param->vui.i_colmatrix : + sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 9 ? param->vui.i_colorprim : 2 ); + sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 15 ? param->vui.i_transfer : 2 ); + sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 10 ? param->vui.i_colmatrix : ( csp >= X264_CSP_BGR ? 0 : 2 ) ); if( sps->vui.i_colorprim != 2 || sps->vui.i_transfer != 2 || @@ -430,7 +430,7 @@ pps->b_weighted_pred = param->analyse.i_weighted_pred > 0; pps->b_weighted_bipred = param->analyse.b_weighted_bipred ? 2 : 0; - pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR ? 26 + QP_BD_OFFSET : SPEC_QP( param->rc.i_qp_constant ); + pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR || param->b_stitchable ? 26 + QP_BD_OFFSET : SPEC_QP( param->rc.i_qp_constant ); pps->i_pic_init_qs = 26 + QP_BD_OFFSET; pps->i_chroma_qp_index_offset = param->analyse.i_chroma_qp_offset;
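The set.c hunk widens the accepted VUI ranges (colorprim up to 9, transfer up to 15, colmatrix up to 10) while keeping the same fallback: anything outside the range is replaced by 2, the code for "unspecified". A small sketch of that pattern with a hypothetical helper name:

#include <stdio.h>

/* Accept a user-supplied VUI code only if it falls inside the allowed range,
 * otherwise fall back to 2 ("unspecified"). The bounds used below are the
 * ones this diff raises the limits to. */
static int vui_or_unspecified( int value, int max_code )
{
    return ( value >= 0 && value <= max_code ) ? value : 2;
}

int main( void )
{
    printf( "colorprim 9  -> %d\n", vui_or_unspecified( 9, 9 ) );    /* kept (BT.2020 primaries) */
    printf( "colmatrix 12 -> %d\n", vui_or_unspecified( 12, 10 ) );  /* out of range -> unspecified */
    return 0;
}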
x264-snapshot-20130723-2245.tar.bz2/encoder/slicetype-cl.c
Added
@@ -0,0 +1,780 @@ +/***************************************************************************** + * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead) + ***************************************************************************** + * Copyright (C) 2012-2013 x264 project + * + * Authors: Steve Borho <sborho@multicorewareinc.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macroblock.h" +#include "me.h" + +#if HAVE_OPENCL +#ifdef _WIN32 +#include <windows.h> +#endif + +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ); + +/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined + * in the OpenCL headers shipped with NVIDIA drivers. We need to be + * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */ +#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E + +#define OCLCHECK( method, ... 
)\ +do\ +{\ + if( h->opencl.b_fatal_error )\ + return -1;\ + status = ocl->method( __VA_ARGS__ );\ + if( status != CL_SUCCESS ) {\ + h->param.b_opencl = 0;\ + h->opencl.b_fatal_error = 1;\ + x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\ + return -1;\ + }\ +} while( 0 ) + +void x264_opencl_flush( x264_t *h ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + + ocl->clFinish( h->opencl.queue ); + + /* Finish copies from the GPU by copying from the page-locked buffer to + * their final destination */ + for( int i = 0; i < h->opencl.num_copies; i++ ) + memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes ); + h->opencl.num_copies = 0; + h->opencl.pl_occupancy = 0; +} + +static void *x264_opencl_alloc_locked( x264_t *h, int bytes ) +{ + if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE ) + x264_opencl_flush( h ); + assert( bytes < PAGE_LOCKED_BUF_SIZE ); + char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy; + h->opencl.pl_occupancy += bytes; + return ptr; +} + +int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda ) +{ + if( fenc->b_intra_calculated ) + return 0; + fenc->b_intra_calculated = 1; + + x264_opencl_function_t *ocl = h->opencl.ocl; + int luma_length = fenc->i_stride[0] * fenc->i_lines[0]; + +#define CREATEBUF( out, flags, size )\ + out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\ + if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; } +#define CREATEIMAGE( out, flags, pf, width, height )\ + out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\ + if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; } + + int mb_count = h->mb.i_mb_count; + cl_int status; + + if( !h->opencl.lowres_mv_costs ) + { + /* Allocate shared memory buffers */ + int width = h->mb.i_mb_width * 8 * sizeof(pixel); + int height = h->mb.i_mb_height * 8 * sizeof(pixel); + + cl_image_format pixel_format; + pixel_format.image_channel_order = CL_R; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT32; + CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height ); + + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + { + pixel_format.image_channel_order = CL_RGBA; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT8; + CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height ); + width >>= 1; + height >>= 1; + } + + CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); + CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); + CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); + CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); + CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); + CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); + CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) ); + CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) ); + CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) ); + CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) ); + CREATEBUF( h->opencl.luma_16x16_image[0], 
CL_MEM_READ_ONLY, luma_length ); + CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length ); + } + + if( !fenc->opencl.intra_cost ) + { + /* Allocate per-frame buffers */ + int width = h->mb.i_mb_width * 8 * sizeof(pixel); + int height = h->mb.i_mb_height * 8 * sizeof(pixel); + + cl_image_format pixel_format; + pixel_format.image_channel_order = CL_R; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT32; + CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height ); + + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + { + pixel_format.image_channel_order = CL_RGBA; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT8; + CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height ); + width >>= 1; + height >>= 1; + } + CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) ); + CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) ); + CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) ); + CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) ); + CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) ); + CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) ); + } +#undef CREATEBUF +#undef CREATEIMAGE + + /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */ + + char *locked = x264_opencl_alloc_locked( h, luma_length ); + memcpy( locked, fenc->plane[0], luma_length ); + OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL ); + + size_t gdim[2]; + if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor ) + { + int size = h->mb.i_mb_count * sizeof(int16_t); + locked = x264_opencl_alloc_locked( h, size ); + memcpy( locked, fenc->i_inv_qscale_factor, size ); + OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL ); + } + else + { + /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */ + cl_uint arg = 0; + int16_t value = 256; + OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor ); + OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value ); + gdim[0] = h->mb.i_mb_count; + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL ); + } + + int stride = fenc->i_stride[0]; + cl_uint arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride ); + gdim[0] = 8 * h->mb.i_mb_width; + gdim[1] = 8 * h->mb.i_mb_height; + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL ); + + for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ ) + { + /* Workaround for AMD Southern Island: + * + * Alternate kernel instances. 
No perf impact to this, so we do it for + * all GPUs. It prevents the same kernel from being enqueued + * back-to-back, avoiding a dependency calculation bug in the driver. + */ + cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2; + + arg = 0; + OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] ); + OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] ); + gdim[0] >>= 1; + gdim[1] >>= 1; + if( gdim[0] < 16 || gdim[1] < 16 ) + break; + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL ); + } + + size_t ldim[2]; + gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5; + gdim[1] = 8*h->mb.i_mb_height; + ldim[0] = 32; + ldim[1] = 8; + arg = 0; + + /* For presets slow, slower, and placebo, check all 10 intra modes that the + * C lookahead supports. For faster presets, only check the most frequent 8 + * modes + */ + int slow = h->param.analyse.i_subpel_refine > 7; + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost ); + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda ); + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width ); + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow ); + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL ); + + gdim[0] = 256; + gdim[1] = h->mb.i_mb_height; + ldim[0] = 256; + ldim[1] = 1; + arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width ); + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL ); + + if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 ) + x264_opencl_flush( h ); + + int size = h->mb.i_mb_count * sizeof(int16_t); + locked = x264_opencl_alloc_locked( h, size ); + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL ); + h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0]; + h->opencl.copies[h->opencl.num_copies].src = locked; + h->opencl.copies[h->opencl.num_copies].bytes = size; + h->opencl.num_copies++; + + size = h->mb.i_mb_height * sizeof(int); + locked = x264_opencl_alloc_locked( h, size ); + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); + h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0]; + h->opencl.copies[h->opencl.num_copies].src = locked; + h->opencl.copies[h->opencl.num_copies].bytes = size; + h->opencl.num_copies++; + + size = sizeof(int) * 4; + locked = x264_opencl_alloc_locked( h, size ); + OCLCHECK( 
clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0]; + h->opencl.copies[h->opencl.num_copies].src = locked; + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); + h->opencl.num_copies++; + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0]; + h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int); + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); + h->opencl.num_copies++; + + h->opencl.last_buf = !h->opencl.last_buf; + return 0; +} + +/* This function was tested emprically on a number of AMD and NV GPUs. Making a + * function which returns perfect launch dimensions is impossible; some + * applications will have self-tuning code to try many possible variables and + * measure the runtime. Here we simply make an educated guess based on what we + * know GPUs typically prefer. */ +static void x264_optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + size_t max_work_group = 256; /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */ + size_t preferred_multiple = 64; + cl_uint num_cus = 6; + + ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL ); + ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL ); + ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL ); + + ldims[0] = preferred_multiple; + ldims[1] = 8; + + /* make ldims[1] an even divisor of gdims[1] */ + while( gdims[1] & (ldims[1] - 1) ) + { + ldims[0] <<= 1; + ldims[1] >>= 1; + } + /* make total ldims fit under the max work-group dimensions for the device */ + while( ldims[0] * ldims[1] > max_work_group ) + { + if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) ) + ldims[1] >>= 1; + else + ldims[0] >>= 1; + } + + if( ldims[0] > gdims[0] ) + { + /* remove preferred multiples until we're close to gdims[0] */ + while( gdims[0] + preferred_multiple < ldims[0] ) + ldims[0] -= preferred_multiple; + gdims[0] = ldims[0]; + } + else + { + /* make gdims an even multiple of ldims */ + gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0]; + gdims[0] *= ldims[0]; + } + + /* make ldims smaller to spread work across compute units */ + while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus ) + { + if( ldims[0] > preferred_multiple ) + ldims[0] >>= 1; + else if( ldims[1] > 1 ) + ldims[1] >>= 1; + else + break; + } + /* for smaller GPUs, try not to abuse their texture cache */ + if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 ) + ldims[0] = 32; +} + +int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + x264_frame_t *fenc = frames[b]; + x264_frame_t *fref = frames[ref]; + + cl_mem ref_scaled_images[NUM_IMAGE_SCALES]; + cl_mem ref_luma_hpel; + cl_int status; + + if( w && w->weightfn ) + { + size_t gdims[2]; + + gdims[0] = 8 * h->mb.i_mb_width; + gdims[1] = 8 * h->mb.i_mb_height; + + /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */ + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + { + cl_uint arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, 
sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom ); + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL ); + + gdims[0] >>= 1; + gdims[1] >>= 1; + if( gdims[0] < 16 || gdims[1] < 16 ) + break; + } + + cl_uint arg = 0; + gdims[0] = 8 * h->mb.i_mb_width; + gdims[1] = 8 * h->mb.i_mb_height; + + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom ); + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL ); + + /* Use weighted reference planes for motion search */ + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + ref_scaled_images[i] = h->opencl.weighted_scaled_images[i]; + ref_luma_hpel = h->opencl.weighted_luma_hpel; + } + else + { + /* Use unweighted reference planes for motion search */ + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i]; + ref_luma_hpel = fref->opencl.luma_hpel; + } + + const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 }; + int b_first_iteration = 1; + int b_reverse_references = 1; + int A = 1; + + + int mb_per_group = 0; + int cost_local_size = 0; + int mvc_local_size = 0; + int mb_width; + + size_t gdims[2]; + size_t ldims[2]; + + /* scale 0 is 8x8 */ + for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- ) + { + mb_width = h->mb.i_mb_width >> scale; + gdims[0] = mb_width; + gdims[1] = h->mb.i_mb_height >> scale; + if( gdims[0] < 2 || gdims[1] < 2 ) + continue; + gdims[0] <<= 2; + x264_optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device ); + + mb_per_group = (ldims[0] >> 2) * ldims[1]; + cost_local_size = 4 * mb_per_group * sizeof(int16_t); + mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2; + int scaled_me_range = h->param.analyse.i_me_range >> scale; + int b_shift_index = 1; + + cl_uint arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 
arg++, mvc_local_size, NULL ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references ); + + for( int iter = 0; iter < num_iterations[scale]; iter++ ) + { + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL ); + + b_shift_index = 0; + b_first_iteration = 0; + + /* alternate top-left vs bot-right MB references at lower scales, so + * motion field smooths more quickly. */ + if( scale > 2 ) + b_reverse_references ^= 1; + else + b_reverse_references = 0; + A = !A; + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references ); + } + } + + int satd_local_size = mb_per_group * sizeof(uint32_t) * 16; + cl_uint arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL ); + + if( b_islist1 ) + { + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 ); + } + else + { + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 ); + } + + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 ); + + if( h->opencl.b_device_AMD_SI ) + { + /* workaround for AMD Southern Island driver scheduling bug (fixed in + * July 2012), perform meaningless small copy to add a data dependency */ + OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, 
h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL ); + } + + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL ); + + int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count; + + if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 ) + x264_opencl_flush( h ); + + char *locked = x264_opencl_alloc_locked( h, mvlen ); + h->opencl.copies[h->opencl.num_copies].src = locked; + h->opencl.copies[h->opencl.num_copies].bytes = mvlen; + + if( b_islist1 ) + { + int mvs_offset = mvlen * (ref - b - 1); + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL ); + h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1]; + } + else + { + int mvs_offset = mvlen * (b - ref - 1); + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL ); + h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1]; + } + + h->opencl.num_copies++; + + return 0; +} + +int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_int status; + x264_frame_t *fenc = frames[b]; + x264_frame_t *fref0 = frames[p0]; + x264_frame_t *fref1 = frames[p1]; + + int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32; + + /* Tasks for this kernel: + * 1. Select least cost mode (intra, ref0, ref1) + * list_used 0, 1, 2, or 3. if B frame, do not allow intra + * 2. if B frame, try bidir predictions. + * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */ + size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height }; + size_t ldim_bidir[2]; + size_t *ldims = NULL; + int cost_local_size = 4; + int satd_local_size = 4; + if( b < p1 ) + { + /* For B frames, use 4 threads per MB for BIDIR checks */ + ldims = ldim_bidir; + gdims[0] <<= 2; + x264_optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device ); + int mb_per_group = (ldims[0] >> 2) * ldims[1]; + cost_local_size = 4 * mb_per_group * sizeof(int16_t); + satd_local_size = 16 * mb_per_group * sizeof(uint32_t); + } + + cl_uint arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, 
arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda ); + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL ); + + /* Sum costs across rows, atomicAdd down frame */ + size_t gdim[2] = { 256, h->mb.i_mb_height }; + size_t ldim[2] = { 256, 1 }; + + arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 ); + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL ); + + if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 ) + x264_opencl_flush( h ); + + int size = h->mb.i_mb_count * sizeof(int16_t); + char *locked = x264_opencl_alloc_locked( h, size ); + h->opencl.copies[h->opencl.num_copies].src = locked; + h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b]; + h->opencl.copies[h->opencl.num_copies].bytes = size; + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); + h->opencl.num_copies++; + + size = h->mb.i_mb_height * sizeof(int); + locked = x264_opencl_alloc_locked( h, size ); + h->opencl.copies[h->opencl.num_copies].src = locked; + h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b]; + h->opencl.copies[h->opencl.num_copies].bytes = size; + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); + h->opencl.num_copies++; + + size = 4 * sizeof(int); + locked = x264_opencl_alloc_locked( h, size ); + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); + h->opencl.last_buf = !h->opencl.last_buf; + + h->opencl.copies[h->opencl.num_copies].src = locked; + 
h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b]; + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); + h->opencl.num_copies++; + h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int); + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b]; + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); + h->opencl.num_copies++; + + if( b == p1 ) // P frames only + { + h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int); + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0]; + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); + h->opencl.num_copies++; + } + return 0; +} + +void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda ) +{ + if( h->param.b_opencl ) + { +#ifdef _WIN32 + /* Temporarily boost priority of this lookahead thread and the OpenCL + * driver's thread until the end of this function. On AMD GPUs this + * greatly reduces the latency of enqueuing kernels and getting results + * on Windows. */ + HANDLE id = GetCurrentThread(); + h->opencl.lookahead_thread_pri = GetThreadPriority( id ); + SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL ); + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL ); + if( status == CL_SUCCESS ) + { + h->opencl.opencl_thread_pri = GetThreadPriority( id ); + SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL ); + } +#endif + + /* precalculate intra and I frames */ + for( int i = 0; i <= num_frames; i++ ) + x264_opencl_lowres_init( h, frames[i], lambda ); + x264_opencl_flush( h ); + + if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe ) + { + /* For trellis B-Adapt, precompute exhaustive motion searches */ + for( int b = 0; b <= num_frames; b++ ) + { + for( int j = 1; j < h->param.i_bframe; j++ ) + { + int p0 = b - j; + if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF ) + { + const x264_weight_t *w = x264_weight_none; + + if( h->param.analyse.i_weighted_pred ) + { + x264_emms(); + x264_weights_analyse( h, frames[b], frames[p0], 1 ); + w = frames[b]->weight[0]; + } + frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0; + x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w ); + } + int p1 = b + j; + if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF ) + { + frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0; + x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL ); + } + } + } + + x264_opencl_flush( h ); + } + } +} + + +void x264_opencl_slicetype_end( x264_t *h ) +{ +#ifdef _WIN32 + if( h->param.b_opencl ) + { + HANDLE id = GetCurrentThread(); + SetThreadPriority( id, h->opencl.lookahead_thread_pri ); + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL ); + if( status == CL_SUCCESS ) + SetThreadPriority( id, h->opencl.opencl_thread_pri ); + } +#endif +} + +int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b ) +{ + if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) ) + return 0; + else + { + int do_search[2]; + int dist_scale_factor = 128; + const x264_weight_t *w = x264_weight_none; + + // avoid duplicating work + frames[b]->i_cost_est[b-p0][p1-b] = 0; + + do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF; + 
do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF; + if( do_search[0] ) + { + if( h->param.analyse.i_weighted_pred && b == p1 ) + { + x264_emms(); + x264_weights_analyse( h, frames[b], frames[p0], 1 ); + w = frames[b]->weight[0]; + } + frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0; + } + if( do_search[1] ) + frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0; + if( b == p1 ) + frames[b]->i_intra_mbs[b-p0] = 0; + if( p1 != p0 ) + dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0); + + frames[b]->i_cost_est[b-p0][p1-b] = 0; + frames[b]->i_cost_est_aq[b-p0][p1-b] = 0; + + x264_opencl_lowres_init( h, frames[b], lambda ); + + if( do_search[0] ) + { + x264_opencl_lowres_init( h, frames[p0], lambda ); + x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w ); + } + if( do_search[1] ) + { + x264_opencl_lowres_init( h, frames[p1], lambda ); + x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL ); + } + x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor ); + return 1; + } +} + +#endif
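The OpenCL lookahead code above follows one consistent pattern for getting results back from the GPU: x264_opencl_alloc_locked() carves a slice out of a page-locked staging buffer (flushing first if it would overflow), the asynchronous clEnqueueReadBuffer() calls target that slice, and a (dest, src, bytes) triple is queued so that x264_opencl_flush() can clFinish() the command queue and only then memcpy the results to their final destinations. The standalone C sketch below shows just that bookkeeping; the copy_queue_* names, the buffer size and the pending-copy limit are illustrative stand-ins rather than x264 API, and the clFinish() call is reduced to a comment.

/* Sketch of the deferred-copy bookkeeping used by the lookahead above.
 * Allocate copy_queue_t statically or on the heap; the staging array stands
 * in for the page-locked buffer the real code gets from the OpenCL runtime. */
#include <assert.h>
#include <string.h>

#define STAGING_SIZE (4 * 1024 * 1024)
#define MAX_PENDING  32

typedef struct { void *dest; const void *src; size_t bytes; } pending_copy_t;

typedef struct
{
    char           staging[STAGING_SIZE];
    size_t         occupancy;
    pending_copy_t copies[MAX_PENDING];
    int            num_copies;
} copy_queue_t;

/* Analogue of x264_opencl_flush(): wait for the GPU, then replay the copies. */
static void copy_queue_flush( copy_queue_t *q )
{
    /* ocl->clFinish( queue ) would go here in the real code */
    for( int i = 0; i < q->num_copies; i++ )
        memcpy( q->copies[i].dest, q->copies[i].src, q->copies[i].bytes );
    q->num_copies = 0;
    q->occupancy  = 0;
}

/* Analogue of x264_opencl_alloc_locked(): hand out a slice of the staging
 * buffer, flushing first if the request would not fit. */
static void *copy_queue_alloc( copy_queue_t *q, size_t bytes )
{
    if( q->occupancy + bytes >= STAGING_SIZE )
        copy_queue_flush( q );
    assert( bytes < STAGING_SIZE );
    void *ptr = q->staging + q->occupancy;
    q->occupancy += bytes;
    return ptr;
}

/* Record where the result of a just-enqueued async read must finally land. */
static void copy_queue_push( copy_queue_t *q, void *dest, const void *src, size_t bytes )
{
    if( q->num_copies >= MAX_PENDING )
        copy_queue_flush( q );
    q->copies[q->num_copies++] = (pending_copy_t){ dest, src, bytes };
}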
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/slicetype.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/slicetype.c
Changed
@@ -36,6 +36,18 @@ x264_frame_t **frames, int p0, int p1, int b, int b_intra_penalty ); +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ); + +#if HAVE_OPENCL +int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda ); +int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w ); +int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor ); +int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b ); +void x264_opencl_flush( x264_t *h ); +void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda ); +void x264_opencl_slicetype_end( x264_t *h ); +#endif + static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a ) { a->i_qp = X264_LOOKAHEAD_QP; @@ -60,7 +72,7 @@ w->i_offset = offset; w->i_denom = 7; w->i_scale = weight_nonh264; - while( w->i_denom > 0 && (w->i_scale > 127 || !(w->i_scale & 1)) ) + while( w->i_denom > 0 && (w->i_scale > 127) ) { w->i_denom--; w->i_scale >>= 1; @@ -276,7 +288,7 @@ return cost; } -static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ) +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ) { int i_delta_index = fenc->i_frame - ref->i_frame - 1; /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ @@ -286,21 +298,40 @@ SET_WEIGHT( weights[1], 0, 1, 0, 0 ); SET_WEIGHT( weights[2], 0, 1, 0, 0 ); int chroma_initted = 0; + float guess_scale[3]; + float fenc_mean[3]; + float ref_mean[3]; + for( int plane = 0; plane <= 2*!b_lookahead; plane++ ) + { + float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; + float ref_var = ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; + guess_scale[plane] = sqrtf( fenc_var / ref_var ); + fenc_mean[plane] = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); + ref_mean[plane] = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); + } + + int chroma_denom = 7; + if( !b_lookahead ) + { + /* make sure both our scale factors fit */ + while( chroma_denom > 0 ) + { + float thresh = 127.f / (1<<chroma_denom); + if( guess_scale[1] < thresh && guess_scale[2] < thresh ) + break; + chroma_denom--; + } + } + /* Don't check chroma in lookahead, or if there wasn't a luma weight. 
*/ for( int plane = 0; plane <= 2 && !( plane && ( !weights[0].weightfn || b_lookahead ) ); plane++ ) { - int cur_offset, start_offset, end_offset; int minoff, minscale, mindenom; unsigned int minscore, origscore; int found; - float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; - float ref_var = ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; - float guess_scale = sqrtf( fenc_var / ref_var ); - float fenc_mean = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); - float ref_mean = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); //early termination - if( fabsf( ref_mean - fenc_mean ) < 0.5f && fabsf( 1.f - guess_scale ) < epsilon ) + if( fabsf( ref_mean[plane] - fenc_mean[plane] ) < 0.5f && fabsf( 1.f - guess_scale[plane] ) < epsilon ) { SET_WEIGHT( weights[plane], 0, 1, 0, 0 ); continue; @@ -308,8 +339,8 @@ if( plane ) { - weights[plane].i_denom = 6; - weights[plane].i_scale = x264_clip3( round( guess_scale * 64 ), 0, 255 ); + weights[plane].i_denom = chroma_denom; + weights[plane].i_scale = x264_clip3( round( guess_scale[plane] * (1<<chroma_denom) ), 0, 255 ); if( weights[plane].i_scale > 127 ) { weights[1].weightfn = weights[2].weightfn = NULL; @@ -317,7 +348,7 @@ } } else - x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[plane] ); + x264_weight_get_h264( round( guess_scale[plane] * 128 ), 0, &weights[plane] ); found = 0; mindenom = weights[plane].i_denom; @@ -357,33 +388,65 @@ if( !minscore ) continue; - // This gives a slight improvement due to rounding errors but only tests one offset in lookahead. - // Currently only searches within +/- 1 of the best offset found so far. - // TODO: Try other offsets/multipliers/combinations thereof? - cur_offset = fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f * b_lookahead; - start_offset = x264_clip3( cur_offset - !b_lookahead, -128, 127 ); - end_offset = x264_clip3( cur_offset + !b_lookahead, -128, 127 ); - for( int i_off = start_offset; i_off <= end_offset; i_off++ ) + /* Picked somewhat arbitrarily */ + static const uint8_t weight_check_distance[][2] = + { + {0,0},{0,0},{0,1},{0,1}, + {0,1},{0,1},{0,1},{1,1}, + {1,1},{2,1},{2,1},{4,2} + }; + int scale_dist = b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][0]; + int offset_dist = b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][1]; + + int start_scale = x264_clip3( minscale - scale_dist, 0, 127 ); + int end_scale = x264_clip3( minscale + scale_dist, 0, 127 ); + for( int i_scale = start_scale; i_scale <= end_scale; i_scale++ ) { - SET_WEIGHT( weights[plane], 1, minscale, mindenom, i_off ); - unsigned int s; - if( plane ) + int cur_scale = i_scale; + int cur_offset = fenc_mean[plane] - ref_mean[plane] * cur_scale / (1 << mindenom) + 0.5f * b_lookahead; + if( cur_offset < - 128 || cur_offset > 127 ) { - if( CHROMA444 ) - s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane ); - else - s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] ); + /* Rescale considering the constraints on cur_offset. We do it in this order + * because scale has a much wider range than offset (because of denom), so + * it should almost never need to be clamped. 
*/ + cur_offset = x264_clip3( cur_offset, -128, 127 ); + cur_scale = (1 << mindenom) * (fenc_mean[plane] - cur_offset) / ref_mean[plane] + 0.5f; + cur_scale = x264_clip3( cur_scale, 0, 127 ); } - else - s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] ); - COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 ); + int start_offset = x264_clip3( cur_offset - offset_dist, -128, 127 ); + int end_offset = x264_clip3( cur_offset + offset_dist, -128, 127 ); + for( int i_off = start_offset; i_off <= end_offset; i_off++ ) + { + SET_WEIGHT( weights[plane], 1, cur_scale, mindenom, i_off ); + unsigned int s; + if( plane ) + { + if( CHROMA444 ) + s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane ); + else + s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] ); + } + else + s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] ); + COPY4_IF_LT( minscore, s, minscale, cur_scale, minoff, i_off, found, 1 ); - // Don't check any more offsets if the previous one had a lower cost than the current one - if( minoff == start_offset && i_off != start_offset ) - break; + // Don't check any more offsets if the previous one had a lower cost than the current one + if( minoff == start_offset && i_off != start_offset ) + break; + } } x264_emms(); + /* Use a smaller denominator if possible */ + if( !plane ) + { + while( mindenom > 0 && !(minscale&1) ) + { + mindenom--; + minscale >>= 1; + } + } + /* FIXME: More analysis can be done here on SAD vs. SATD termination. */ /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */ if( !found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f ) @@ -398,18 +461,29 @@ fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore; } - //FIXME, what is the correct way to deal with this? - if( weights[1].weightfn && weights[2].weightfn && weights[1].i_denom != weights[2].i_denom ) + /* Optimize and unify denominator */ + if( weights[1].weightfn || weights[2].weightfn ) { - int denom = X264_MIN( weights[1].i_denom, weights[2].i_denom ); - int i; - for( i = 1; i <= 2; i++ ) + int denom = weights[1].weightfn ? weights[1].i_denom : weights[2].i_denom; + int both_weighted = weights[1].weightfn && weights[2].weightfn; + /* If only one plane is weighted, the other has an implicit scale of 1<<denom. + * With denom==7, this comes out to 128, which is invalid, so don't allow that. 
*/ + while( (!both_weighted && denom==7) || + (denom > 0 && !(weights[1].weightfn && (weights[1].i_scale&1)) + && !(weights[2].weightfn && (weights[2].i_scale&1))) ) { - weights[i].i_scale = x264_clip3( weights[i].i_scale >> ( weights[i].i_denom - denom ), 0, 255 ); - weights[i].i_denom = denom; - h->mc.weight_cache( h, &weights[i] ); + denom--; + for( int i = 1; i <= 2; i++ ) + if( weights[i].weightfn ) + { + weights[i].i_scale >>= 1; + weights[i].i_denom = denom; + } } } + for( int i = 1; i <= 2; i++ ) + if( weights[i].weightfn ) + h->mc.weight_cache( h, &weights[i] ); if( weights[0].weightfn && b_lookahead ) { @@ -472,16 +546,16 @@ goto lowres_intra_mb; // no need for h->mb.mv_min[] - h->mb.mv_min_fpel[0] = -8*h->mb.i_mb_x - 4; - h->mb.mv_max_fpel[0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4; - h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 8 ); - h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 8 ); + h->mb.mv_limit_fpel[0][0] = -8*h->mb.i_mb_x - 4; + h->mb.mv_limit_fpel[1][0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4; + h->mb.mv_min_spel[0] = 4*( h->mb.mv_limit_fpel[0][0] - 8 ); + h->mb.mv_max_spel[0] = 4*( h->mb.mv_limit_fpel[1][0] + 8 ); if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 ) { - h->mb.mv_min_fpel[1] = -8*h->mb.i_mb_y - 4; - h->mb.mv_max_fpel[1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4; - h->mb.mv_min_spel[1] = 4*( h->mb.mv_min_fpel[1] - 8 ); - h->mb.mv_max_spel[1] = 4*( h->mb.mv_max_fpel[1] + 8 ); + h->mb.mv_limit_fpel[0][1] = -8*h->mb.i_mb_y - 4; + h->mb.mv_limit_fpel[1][1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4; + h->mb.mv_min_spel[1] = 4*( h->mb.mv_limit_fpel[0][1] - 8 ); + h->mb.mv_max_spel[1] = 4*( h->mb.mv_limit_fpel[1][1] + 8 ); } #define LOAD_HPELS_LUMA(dst, src) \ @@ -633,15 +707,16 @@ if( !fenc->b_intra_calculated ) { ALIGNED_ARRAY_16( pixel, edge,[36] ); - pixel *pix = &pix1[8+FDEC_STRIDE - 1]; - pixel *src = &fenc->lowres[0][i_pel_offset - 1]; + pixel *pix = &pix1[8+FDEC_STRIDE]; + pixel *src = &fenc->lowres[0][i_pel_offset]; const int intra_penalty = 5 * a->i_lambda; int satds[3]; + int pixoff = 4 / sizeof(pixel); - memcpy( pix-FDEC_STRIDE, src-i_stride, 17 * sizeof(pixel) ); - for( int i = 0; i < 8; i++ ) - pix[i*FDEC_STRIDE] = src[i*i_stride]; - pix++; + /* Avoid store forwarding stalls by writing larger chunks */ + memcpy( pix-FDEC_STRIDE, src-i_stride, 16 * sizeof(pixel) ); + for( int i = -1; i < 8; i++ ) + M32( &pix[i*FDEC_STRIDE-pixoff] ) = M32( &src[i*i_stride-pixoff] ); h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds ); int i_icost = X264_MIN3( satds[0], satds[1], satds[2] ); @@ -793,96 +868,120 @@ output_inter[0] = h->scratch_buffer2; output_intra[0] = output_inter[0] + output_buf_size; - if( h->param.i_lookahead_threads > 1 ) +#if HAVE_OPENCL + if( h->param.b_opencl ) { - x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX]; + x264_opencl_lowres_init(h, fenc, a->i_lambda ); + if( do_search[0] ) + { + x264_opencl_lowres_init( h, frames[p0], a->i_lambda ); + x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w ); + } + if( do_search[1] ) + { + x264_opencl_lowres_init( h, frames[p1], a->i_lambda ); + x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, NULL ); + } + if( b != p0 ) + x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor ); + x264_opencl_flush( h ); - for( int i = 0; i < h->param.i_lookahead_threads; i++ ) + i_score = fenc->i_cost_est[b-p0][p1-b]; + } + else +#endif + { + if( h->param.i_lookahead_threads > 1 ) { - x264_t *t = h->lookahead_thread[i]; + 
x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX]; - /* FIXME move this somewhere else */ - t->mb.i_me_method = h->mb.i_me_method; - t->mb.i_subpel_refine = h->mb.i_subpel_refine; - t->mb.b_chroma_me = h->mb.b_chroma_me; + for( int i = 0; i < h->param.i_lookahead_threads; i++ ) + { + x264_t *t = h->lookahead_thread[i]; - s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w, - output_inter[i], output_intra[i] }; + /* FIXME move this somewhere else */ + t->mb.i_me_method = h->mb.i_me_method; + t->mb.i_subpel_refine = h->mb.i_subpel_refine; + t->mb.b_chroma_me = h->mb.b_chroma_me; - t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads); - t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads); + s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w, + output_inter[i], output_intra[i] }; - int thread_height = t->i_threadslice_end - t->i_threadslice_start; - int thread_output_size = thread_height + NUM_INTS; - memset( output_inter[i], 0, thread_output_size * sizeof(int) ); - memset( output_intra[i], 0, thread_output_size * sizeof(int) ); - output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height; + t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads); + t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads); - output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE; - output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE; + int thread_height = t->i_threadslice_end - t->i_threadslice_start; + int thread_output_size = thread_height + NUM_INTS; + memset( output_inter[i], 0, thread_output_size * sizeof(int) ); + memset( output_intra[i], 0, thread_output_size * sizeof(int) ); + output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height; - x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] ); - } - for( int i = 0; i < h->param.i_lookahead_threads; i++ ) - x264_threadpool_wait( h->lookaheadpool, &s[i] ); - } - else - { - h->i_threadslice_start = 0; - h->i_threadslice_end = h->mb.i_mb_height; - memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) ); - memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) ); - output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height; - x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w, - output_inter[0], output_intra[0] }; - x264_slicetype_slice_cost( &s ); - } + output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE; + output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE; - /* Sum up accumulators */ - if( b == p1 ) - fenc->i_intra_mbs[b-p0] = 0; - if( !fenc->b_intra_calculated ) - { - fenc->i_cost_est[0][0] = 0; - fenc->i_cost_est_aq[0][0] = 0; - } - fenc->i_cost_est[b-p0][p1-b] = 0; - fenc->i_cost_est_aq[b-p0][p1-b] = 0; + x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] ); + } + for( int i = 0; i < h->param.i_lookahead_threads; i++ ) + x264_threadpool_wait( h->lookaheadpool, &s[i] ); + } + else + { + h->i_threadslice_start = 0; + h->i_threadslice_end = h->mb.i_mb_height; + memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) ); + memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * 
sizeof(int) ); + output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height; + x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w, + output_inter[0], output_intra[0] }; + x264_slicetype_slice_cost( &s ); + } - int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b]; - int *row_satd_intra = fenc->i_row_satds[0][0]; - for( int i = 0; i < h->param.i_lookahead_threads; i++ ) - { + /* Sum up accumulators */ if( b == p1 ) - fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS]; + fenc->i_intra_mbs[b-p0] = 0; if( !fenc->b_intra_calculated ) { - fenc->i_cost_est[0][0] += output_intra[i][COST_EST]; - fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ]; + fenc->i_cost_est[0][0] = 0; + fenc->i_cost_est_aq[0][0] = 0; } + fenc->i_cost_est[b-p0][p1-b] = 0; + fenc->i_cost_est_aq[b-p0][p1-b] = 0; - fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST]; - fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ]; - - if( h->param.rc.i_vbv_buffer_size ) + int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b]; + int *row_satd_intra = fenc->i_row_satds[0][0]; + for( int i = 0; i < h->param.i_lookahead_threads; i++ ) { - int row_count = output_inter[i][NUM_ROWS]; - memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) ); + if( b == p1 ) + fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS]; if( !fenc->b_intra_calculated ) - memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) ); - row_satd_inter += row_count; - row_satd_intra += row_count; + { + fenc->i_cost_est[0][0] += output_intra[i][COST_EST]; + fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ]; + } + + fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST]; + fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ]; + + if( h->param.rc.i_vbv_buffer_size ) + { + int row_count = output_inter[i][NUM_ROWS]; + memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) ); + if( !fenc->b_intra_calculated ) + memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) ); + row_satd_inter += row_count; + row_satd_intra += row_count; + } } - } - i_score = fenc->i_cost_est[b-p0][p1-b]; - if( b != p1 ) - i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias); - else - fenc->b_intra_calculated = 1; + i_score = fenc->i_cost_est[b-p0][p1-b]; + if( b != p1 ) + i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias); + else + fenc->b_intra_calculated = 1; - fenc->i_cost_est[b-p0][p1-b] = i_score; - x264_emms(); + fenc->i_cost_est[b-p0][p1-b] = i_score; + x264_emms(); + } } if( b_intra_penalty ) @@ -1393,7 +1492,7 @@ return scenecut_internal( h, a, frames, p0, p1, real_scenecut ); } -void x264_slicetype_analyse( x264_t *h, int keyframe ) +void x264_slicetype_analyse( x264_t *h, int intra_minigop ) { x264_mb_analysis_t a; x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, }; @@ -1402,8 +1501,13 @@ int cost1p0, cost2p0, cost1b1, cost2p1; int i_max_search = X264_MIN( h->lookahead->next.i_size, X264_LOOKAHEAD_MAX ); int vbv_lookahead = h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead; + /* For determinism we should limit the search to the number of frames lookahead has for sure + * in h->lookahead->next.list buffer, except at the end of stream. + * For normal calls with (intra_minigop == 0) that is h->lookahead->i_slicetype_length + 1 frames. + * And for I-frame calls (intra_minigop != 0) we already removed intra_minigop frames from there. 
*/ if( h->param.b_deterministic ) - i_max_search = X264_MIN( i_max_search, h->lookahead->i_slicetype_length + !keyframe ); + i_max_search = X264_MIN( i_max_search, h->lookahead->i_slicetype_length + 1 - intra_minigop ); + int keyframe = !!intra_minigop; assert( h->frames.b_have_lowres ); @@ -1448,6 +1552,10 @@ return; } +#if HAVE_OPENCL + x264_opencl_slicetype_prep( h, frames, num_frames, a.i_lambda ); +#endif + if( h->param.i_bframe ) { if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS ) @@ -1481,6 +1589,18 @@ continue; } +#if HAVE_OPENCL + if( h->param.b_opencl ) + { + int b_work_done = 0; + b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+2, i+1 ); + b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+1, i+1 ); + b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+1, i+2, i+2 ); + if( b_work_done ) + x264_opencl_flush( h ); + } +#endif + cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 ); cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 ); cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 ); @@ -1563,6 +1683,10 @@ /* Restore frametypes for all frames that haven't actually been decided yet. */ for( int j = reset_start; j <= num_frames; j++ ) frames[j]->i_type = X264_TYPE_AUTO; + +#if HAVE_OPENCL + x264_opencl_slicetype_end( h ); +#endif } void x264_slicetype_decide( x264_t *h )
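The reworked x264_weights_analyse() above derives a per-plane scale guess from the SSD ratio, back-computes the offset from the plane means, and, because the H.264 offset range [-128, 127] is much tighter than the scale range, clamps the offset first and only then re-derives the scale. The short standalone C illustration below mirrors that order of operations; guess_weight() and clip3() are hypothetical helpers for illustration only, it assumes ref_mean > 0, and it ignores the refinement search over nearby scales and offsets that the patch adds.

/* Illustration of the (scale, offset) derivation and clamping order used in
 * the weighted-prediction analysis above. Compile with -lm. */
#include <math.h>

static int clip3( int v, int lo, int hi ) { return v < lo ? lo : v > hi ? hi : v; }

typedef struct { int scale, offset, denom; } weight_guess_t;

static weight_guess_t guess_weight( double fenc_ssd, double ref_ssd,
                                    double fenc_mean, double ref_mean, int denom )
{
    weight_guess_t w = { 0, 0, denom };
    /* scale ~ sqrt of the SSD ratio; the "+ !ref_ssd" term avoids a division
     * by zero, as in the patch */
    double guess_scale = sqrt( (fenc_ssd + !ref_ssd) / (ref_ssd + !ref_ssd) );
    w.scale  = clip3( (int)( guess_scale * (1 << denom) + 0.5 ), 0, 127 );
    w.offset = (int)( fenc_mean - ref_mean * w.scale / (1 << denom) + 0.5 );
    if( w.offset < -128 || w.offset > 127 )
    {
        /* Offset has far less headroom than scale, so clamp it first and
         * re-derive the scale from the clamped offset. */
        w.offset = clip3( w.offset, -128, 127 );
        w.scale  = clip3( (int)( (1 << denom) * (fenc_mean - w.offset) / ref_mean + 0.5 ), 0, 127 );
    }
    return w;
}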
View file
x264-snapshot-20130723-2245.tar.bz2/extras/avxsynth_c.h
Added
@@ -0,0 +1,727 @@ +// Avisynth C Interface Version 0.20 +// Copyright 2003 Kevin Atkinson + +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit +// http://www.gnu.org/copyleft/gpl.html . +// +// As a special exception, I give you permission to link to the +// Avisynth C interface with independent modules that communicate with +// the Avisynth C interface solely through the interfaces defined in +// avisynth_c.h, regardless of the license terms of these independent +// modules, and to copy and distribute the resulting combined work +// under terms of your choice, provided that every copy of the +// combined work is accompanied by a complete copy of the source code +// of the Avisynth C interface and Avisynth itself (with the version +// used to produce the combined work), being distributed under the +// terms of the GNU General Public License plus this exception. An +// independent module is a module which is not derived from or based +// on Avisynth C Interface, such as 3rd-party filters, import and +// export plugins, or graphical user interfaces. + +#ifndef __AVXSYNTH_C__ +#define __AVXSYNTH_C__ + +#include "windowsPorts/windows2linux.h" +#include <stdarg.h> + +#ifdef __cplusplus +# define EXTERN_C extern "C" +#else +# define EXTERN_C +#endif + +#define AVSC_USE_STDCALL 1 + +#ifndef AVSC_USE_STDCALL +# define AVSC_CC __cdecl +#else +# define AVSC_CC __stdcall +#endif + +#define AVSC_INLINE static __inline + +#ifdef AVISYNTH_C_EXPORTS +# define AVSC_EXPORT EXTERN_C +# define AVSC_API(ret, name) EXTERN_C __declspec(dllexport) ret AVSC_CC name +#else +# define AVSC_EXPORT EXTERN_C __declspec(dllexport) +# ifndef AVSC_NO_DECLSPEC +# define AVSC_API(ret, name) EXTERN_C __declspec(dllimport) ret AVSC_CC name +# else +# define AVSC_API(ret, name) typedef ret (AVSC_CC *name##_func) +# endif +#endif + +#ifdef __GNUC__ +typedef long long int INT64; +#else +typedef __int64 INT64; +#endif + + +///////////////////////////////////////////////////////////////////// +// +// Constants +// + +#ifndef __AVXSYNTH_H__ +enum { AVISYNTH_INTERFACE_VERSION = 3 }; +#endif + +enum {AVS_SAMPLE_INT8 = 1<<0, + AVS_SAMPLE_INT16 = 1<<1, + AVS_SAMPLE_INT24 = 1<<2, + AVS_SAMPLE_INT32 = 1<<3, + AVS_SAMPLE_FLOAT = 1<<4}; + +enum {AVS_PLANAR_Y=1<<0, + AVS_PLANAR_U=1<<1, + AVS_PLANAR_V=1<<2, + AVS_PLANAR_ALIGNED=1<<3, + AVS_PLANAR_Y_ALIGNED=AVS_PLANAR_Y|AVS_PLANAR_ALIGNED, + AVS_PLANAR_U_ALIGNED=AVS_PLANAR_U|AVS_PLANAR_ALIGNED, + AVS_PLANAR_V_ALIGNED=AVS_PLANAR_V|AVS_PLANAR_ALIGNED}; + + // Colorspace properties. 
+enum {AVS_CS_BGR = 1<<28, + AVS_CS_YUV = 1<<29, + AVS_CS_INTERLEAVED = 1<<30, + AVS_CS_PLANAR = 1<<31}; + + // Specific colorformats +enum { + AVS_CS_UNKNOWN = 0, + AVS_CS_BGR24 = 1<<0 | AVS_CS_BGR | AVS_CS_INTERLEAVED, + AVS_CS_BGR32 = 1<<1 | AVS_CS_BGR | AVS_CS_INTERLEAVED, + AVS_CS_YUY2 = 1<<2 | AVS_CS_YUV | AVS_CS_INTERLEAVED, + AVS_CS_YV12 = 1<<3 | AVS_CS_YUV | AVS_CS_PLANAR, // y-v-u, planar + AVS_CS_I420 = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR, // y-u-v, planar + AVS_CS_IYUV = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR // same as above +}; + +enum { + AVS_IT_BFF = 1<<0, + AVS_IT_TFF = 1<<1, + AVS_IT_FIELDBASED = 1<<2}; + +enum { + AVS_FILTER_TYPE=1, + AVS_FILTER_INPUT_COLORSPACE=2, + AVS_FILTER_OUTPUT_TYPE=9, + AVS_FILTER_NAME=4, + AVS_FILTER_AUTHOR=5, + AVS_FILTER_VERSION=6, + AVS_FILTER_ARGS=7, + AVS_FILTER_ARGS_INFO=8, + AVS_FILTER_ARGS_DESCRIPTION=10, + AVS_FILTER_DESCRIPTION=11}; + +enum { //SUBTYPES + AVS_FILTER_TYPE_AUDIO=1, + AVS_FILTER_TYPE_VIDEO=2, + AVS_FILTER_OUTPUT_TYPE_SAME=3, + AVS_FILTER_OUTPUT_TYPE_DIFFERENT=4}; + +enum { + AVS_CACHE_NOTHING=0, + AVS_CACHE_RANGE=1, + AVS_CACHE_ALL=2, + AVS_CACHE_AUDIO=3, + AVS_CACHE_AUDIO_NONE=4, + AVS_CACHE_AUDIO_AUTO=5 +}; + +#define AVS_FRAME_ALIGN 16 + +typedef struct AVS_Clip AVS_Clip; +typedef struct AVS_ScriptEnvironment AVS_ScriptEnvironment; + +///////////////////////////////////////////////////////////////////// +// +// AVS_VideoInfo +// + +// AVS_VideoInfo is layed out identicly to VideoInfo +typedef struct AVS_VideoInfo { + int width, height; // width=0 means no video + unsigned fps_numerator, fps_denominator; + int num_frames; + + int pixel_type; + + int audio_samples_per_second; // 0 means no audio + int sample_type; + INT64 num_audio_samples; + int nchannels; + + // Imagetype properties + + int image_type; +} AVS_VideoInfo; + +// useful functions of the above +AVSC_INLINE int avs_has_video(const AVS_VideoInfo * p) + { return (p->width!=0); } + +AVSC_INLINE int avs_has_audio(const AVS_VideoInfo * p) + { return (p->audio_samples_per_second!=0); } + +AVSC_INLINE int avs_is_rgb(const AVS_VideoInfo * p) + { return !!(p->pixel_type&AVS_CS_BGR); } + +AVSC_INLINE int avs_is_rgb24(const AVS_VideoInfo * p) + { return (p->pixel_type&AVS_CS_BGR24)==AVS_CS_BGR24; } // Clear out additional properties + +AVSC_INLINE int avs_is_rgb32(const AVS_VideoInfo * p) + { return (p->pixel_type & AVS_CS_BGR32) == AVS_CS_BGR32 ; } + +AVSC_INLINE int avs_is_yuv(const AVS_VideoInfo * p) + { return !!(p->pixel_type&AVS_CS_YUV ); } + +AVSC_INLINE int avs_is_yuy2(const AVS_VideoInfo * p) + { return (p->pixel_type & AVS_CS_YUY2) == AVS_CS_YUY2; } + +AVSC_INLINE int avs_is_yv12(const AVS_VideoInfo * p) + { return ((p->pixel_type & AVS_CS_YV12) == AVS_CS_YV12)||((p->pixel_type & AVS_CS_I420) == AVS_CS_I420); } + +AVSC_INLINE int avs_is_color_space(const AVS_VideoInfo * p, int c_space) + { return ((p->pixel_type & c_space) == c_space); } + +AVSC_INLINE int avs_is_property(const AVS_VideoInfo * p, int property) + { return ((p->pixel_type & property)==property ); } + +AVSC_INLINE int avs_is_planar(const AVS_VideoInfo * p) + { return !!(p->pixel_type & AVS_CS_PLANAR); } + +AVSC_INLINE int avs_is_field_based(const AVS_VideoInfo * p) + { return !!(p->image_type & AVS_IT_FIELDBASED); } + +AVSC_INLINE int avs_is_parity_known(const AVS_VideoInfo * p) + { return ((p->image_type & AVS_IT_FIELDBASED)&&(p->image_type & (AVS_IT_BFF | AVS_IT_TFF))); } + +AVSC_INLINE int avs_is_bff(const AVS_VideoInfo * p) + { return !!(p->image_type & AVS_IT_BFF); } + +AVSC_INLINE int 
avs_is_tff(const AVS_VideoInfo * p) + { return !!(p->image_type & AVS_IT_TFF); } + +AVSC_INLINE int avs_bits_per_pixel(const AVS_VideoInfo * p) +{ + switch (p->pixel_type) { + case AVS_CS_BGR24: return 24; + case AVS_CS_BGR32: return 32; + case AVS_CS_YUY2: return 16; + case AVS_CS_YV12: + case AVS_CS_I420: return 12; + default: return 0; + } +} +AVSC_INLINE int avs_bytes_from_pixels(const AVS_VideoInfo * p, int pixels) + { return pixels * (avs_bits_per_pixel(p)>>3); } // Will work on planar images, but will return only luma planes + +AVSC_INLINE int avs_row_size(const AVS_VideoInfo * p) + { return avs_bytes_from_pixels(p,p->width); } // Also only returns first plane on planar images + +AVSC_INLINE int avs_bmp_size(const AVS_VideoInfo * vi) + { if (avs_is_planar(vi)) {int p = vi->height * ((avs_row_size(vi)+3) & ~3); p+=p>>1; return p; } return vi->height * ((avs_row_size(vi)+3) & ~3); } + +AVSC_INLINE int avs_samples_per_second(const AVS_VideoInfo * p) + { return p->audio_samples_per_second; } + + +AVSC_INLINE int avs_bytes_per_channel_sample(const AVS_VideoInfo * p) +{ + switch (p->sample_type) { + case AVS_SAMPLE_INT8: return sizeof(signed char); + case AVS_SAMPLE_INT16: return sizeof(signed short); + case AVS_SAMPLE_INT24: return 3; + case AVS_SAMPLE_INT32: return sizeof(signed int); + case AVS_SAMPLE_FLOAT: return sizeof(float); + default: return 0; + } +} +AVSC_INLINE int avs_bytes_per_audio_sample(const AVS_VideoInfo * p) + { return p->nchannels*avs_bytes_per_channel_sample(p);} + +AVSC_INLINE INT64 avs_audio_samples_from_frames(const AVS_VideoInfo * p, INT64 frames) + { return ((INT64)(frames) * p->audio_samples_per_second * p->fps_denominator / p->fps_numerator); } + +AVSC_INLINE int avs_frames_from_audio_samples(const AVS_VideoInfo * p, INT64 samples) + { return (int)(samples * (INT64)p->fps_numerator / (INT64)p->fps_denominator / (INT64)p->audio_samples_per_second); } + +AVSC_INLINE INT64 avs_audio_samples_from_bytes(const AVS_VideoInfo * p, INT64 bytes) + { return bytes / avs_bytes_per_audio_sample(p); } + +AVSC_INLINE INT64 avs_bytes_from_audio_samples(const AVS_VideoInfo * p, INT64 samples) + { return samples * avs_bytes_per_audio_sample(p); } + +AVSC_INLINE int avs_audio_channels(const AVS_VideoInfo * p) + { return p->nchannels; } + +AVSC_INLINE int avs_sample_type(const AVS_VideoInfo * p) + { return p->sample_type;} + +// useful mutator +AVSC_INLINE void avs_set_property(AVS_VideoInfo * p, int property) + { p->image_type|=property; } + +AVSC_INLINE void avs_clear_property(AVS_VideoInfo * p, int property) + { p->image_type&=~property; } + +AVSC_INLINE void avs_set_field_based(AVS_VideoInfo * p, int isfieldbased) + { if (isfieldbased) p->image_type|=AVS_IT_FIELDBASED; else p->image_type&=~AVS_IT_FIELDBASED; } + +AVSC_INLINE void avs_set_fps(AVS_VideoInfo * p, unsigned numerator, unsigned denominator) +{ + unsigned x=numerator, y=denominator; + while (y) { // find gcd + unsigned t = x%y; x = y; y = t; + } + p->fps_numerator = numerator/x; + p->fps_denominator = denominator/x; +} + +AVSC_INLINE int avs_is_same_colorspace(AVS_VideoInfo * x, AVS_VideoInfo * y) +{ + return (x->pixel_type == y->pixel_type) + || (avs_is_yv12(x) && avs_is_yv12(y)); +} + +///////////////////////////////////////////////////////////////////// +// +// AVS_VideoFrame +// + +// VideoFrameBuffer holds information about a memory block which is used +// for video data. 
For efficiency, instances of this class are not deleted +// when the refcount reaches zero; instead they're stored in a linked list +// to be reused. The instances are deleted when the corresponding AVS +// file is closed. + +// AVS_VideoFrameBuffer is layed out identicly to VideoFrameBuffer +// DO NOT USE THIS STRUCTURE DIRECTLY +typedef struct AVS_VideoFrameBuffer { + unsigned char * data; + int data_size; + // sequence_number is incremented every time the buffer is changed, so + // that stale views can tell they're no longer valid. + long sequence_number; + + long refcount; +} AVS_VideoFrameBuffer; + +// VideoFrame holds a "window" into a VideoFrameBuffer. + +// AVS_VideoFrame is layed out identicly to IVideoFrame +// DO NOT USE THIS STRUCTURE DIRECTLY +typedef struct AVS_VideoFrame { + int refcount; + AVS_VideoFrameBuffer * vfb; + int offset, pitch, row_size, height, offsetU, offsetV, pitchUV; // U&V offsets are from top of picture. +} AVS_VideoFrame; + +// Access functions for AVS_VideoFrame +AVSC_INLINE int avs_get_pitch(const AVS_VideoFrame * p) { + return p->pitch;} + +AVSC_INLINE int avs_get_pitch_p(const AVS_VideoFrame * p, int plane) { + switch (plane) { + case AVS_PLANAR_U: case AVS_PLANAR_V: return p->pitchUV;} + return p->pitch;} + +AVSC_INLINE int avs_get_row_size(const AVS_VideoFrame * p) { + return p->row_size; } + +AVSC_INLINE int avs_get_row_size_p(const AVS_VideoFrame * p, int plane) { + int r; + switch (plane) { + case AVS_PLANAR_U: case AVS_PLANAR_V: + if (p->pitchUV) return p->row_size>>1; + else return 0; + case AVS_PLANAR_U_ALIGNED: case AVS_PLANAR_V_ALIGNED: + if (p->pitchUV) { + r = ((p->row_size+AVS_FRAME_ALIGN-1)&(~(AVS_FRAME_ALIGN-1)) )>>1; // Aligned rowsize + if (r < p->pitchUV) + return r; + return p->row_size>>1; + } else return 0; + case AVS_PLANAR_Y_ALIGNED: + r = (p->row_size+AVS_FRAME_ALIGN-1)&(~(AVS_FRAME_ALIGN-1)); // Aligned rowsize + if (r <= p->pitch) + return r; + return p->row_size; + } + return p->row_size; +} + +AVSC_INLINE int avs_get_height(const AVS_VideoFrame * p) { + return p->height;} + +AVSC_INLINE int avs_get_height_p(const AVS_VideoFrame * p, int plane) { + switch (plane) { + case AVS_PLANAR_U: case AVS_PLANAR_V: + if (p->pitchUV) return p->height>>1; + return 0; + } + return p->height;} + +AVSC_INLINE const unsigned char* avs_get_read_ptr(const AVS_VideoFrame * p) { + return p->vfb->data + p->offset;} + +AVSC_INLINE const unsigned char* avs_get_read_ptr_p(const AVS_VideoFrame * p, int plane) +{ + switch (plane) { + case AVS_PLANAR_U: return p->vfb->data + p->offsetU; + case AVS_PLANAR_V: return p->vfb->data + p->offsetV; + default: return p->vfb->data + p->offset;} +} + +AVSC_INLINE int avs_is_writable(const AVS_VideoFrame * p) { + return (p->refcount == 1 && p->vfb->refcount == 1);} + +AVSC_INLINE unsigned char* avs_get_write_ptr(const AVS_VideoFrame * p) +{ + if (avs_is_writable(p)) { + ++p->vfb->sequence_number; + return p->vfb->data + p->offset; + } else + return 0; +} + +AVSC_INLINE unsigned char* avs_get_write_ptr_p(const AVS_VideoFrame * p, int plane) +{ + if (plane==AVS_PLANAR_Y && avs_is_writable(p)) { + ++p->vfb->sequence_number; + return p->vfb->data + p->offset; + } else if (plane==AVS_PLANAR_Y) { + return 0; + } else { + switch (plane) { + case AVS_PLANAR_U: return p->vfb->data + p->offsetU; + case AVS_PLANAR_V: return p->vfb->data + p->offsetV; + default: return p->vfb->data + p->offset; + } + } +} + +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(void, avs_release_video_frame)(AVS_VideoFrame 
*); +// makes a shallow copy of a video frame +AVSC_API(AVS_VideoFrame *, avs_copy_video_frame)(AVS_VideoFrame *); +#if defined __cplusplus +} +#endif // __cplusplus + +#ifndef AVSC_NO_DECLSPEC +AVSC_INLINE void avs_release_frame(AVS_VideoFrame * f) + {avs_release_video_frame(f);} +AVSC_INLINE AVS_VideoFrame * avs_copy_frame(AVS_VideoFrame * f) + {return avs_copy_video_frame(f);} +#endif + +///////////////////////////////////////////////////////////////////// +// +// AVS_Value +// + +// Treat AVS_Value as a fat pointer. That is use avs_copy_value +// and avs_release_value appropiaty as you would if AVS_Value was +// a pointer. + +// To maintain source code compatibility with future versions of the +// avisynth_c API don't use the AVS_Value directly. Use the helper +// functions below. + +// AVS_Value is layed out identicly to AVSValue +typedef struct AVS_Value AVS_Value; +struct AVS_Value { + short type; // 'a'rray, 'c'lip, 'b'ool, 'i'nt, 'f'loat, 's'tring, 'v'oid, or 'l'ong + // for some function e'rror + short array_size; + union { + void * clip; // do not use directly, use avs_take_clip + char boolean; + int integer; + INT64 integer64; // match addition of __int64 to avxplugin.h + float floating_pt; + const char * string; + const AVS_Value * array; + } d; +}; + +// AVS_Value should be initilized with avs_void. +// Should also set to avs_void after the value is released +// with avs_copy_value. Consider it the equalvent of setting +// a pointer to NULL +static const AVS_Value avs_void = {'v'}; + +AVSC_API(void, avs_copy_value)(AVS_Value * dest, AVS_Value src); +AVSC_API(void, avs_release_value)(AVS_Value); + +AVSC_INLINE int avs_defined(AVS_Value v) { return v.type != 'v'; } +AVSC_INLINE int avs_is_clip(AVS_Value v) { return v.type == 'c'; } +AVSC_INLINE int avs_is_bool(AVS_Value v) { return v.type == 'b'; } +AVSC_INLINE int avs_is_int(AVS_Value v) { return v.type == 'i'; } +AVSC_INLINE int avs_is_float(AVS_Value v) { return v.type == 'f' || v.type == 'i'; } +AVSC_INLINE int avs_is_string(AVS_Value v) { return v.type == 's'; } +AVSC_INLINE int avs_is_array(AVS_Value v) { return v.type == 'a'; } +AVSC_INLINE int avs_is_error(AVS_Value v) { return v.type == 'e'; } + +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(AVS_Clip *, avs_take_clip)(AVS_Value, AVS_ScriptEnvironment *); +AVSC_API(void, avs_set_to_clip)(AVS_Value *, AVS_Clip *); +#if defined __cplusplus +} +#endif // __cplusplus + +AVSC_INLINE int avs_as_bool(AVS_Value v) + { return v.d.boolean; } +AVSC_INLINE int avs_as_int(AVS_Value v) + { return v.d.integer; } +AVSC_INLINE const char * avs_as_string(AVS_Value v) + { return avs_is_error(v) || avs_is_string(v) ? v.d.string : 0; } +AVSC_INLINE double avs_as_float(AVS_Value v) + { return avs_is_int(v) ? v.d.integer : v.d.floating_pt; } +AVSC_INLINE const char * avs_as_error(AVS_Value v) + { return avs_is_error(v) ? v.d.string : 0; } +AVSC_INLINE const AVS_Value * avs_as_array(AVS_Value v) + { return v.d.array; } +AVSC_INLINE int avs_array_size(AVS_Value v) + { return avs_is_array(v) ? v.array_size : 1; } +AVSC_INLINE AVS_Value avs_array_elt(AVS_Value v, int index) + { return avs_is_array(v) ? v.d.array[index] : v; } + +// only use these functions on am AVS_Value that does not already have +// an active value. Remember, treat AVS_Value as a fat pointer. +AVSC_INLINE AVS_Value avs_new_value_bool(int v0) + { AVS_Value v; v.type = 'b'; v.d.boolean = v0 == 0 ? 
0 : 1; return v; } +AVSC_INLINE AVS_Value avs_new_value_int(int v0) + { AVS_Value v; v.type = 'i'; v.d.integer = v0; return v; } +AVSC_INLINE AVS_Value avs_new_value_string(const char * v0) + { AVS_Value v; v.type = 's'; v.d.string = v0; return v; } +AVSC_INLINE AVS_Value avs_new_value_float(float v0) + { AVS_Value v; v.type = 'f'; v.d.floating_pt = v0; return v;} +AVSC_INLINE AVS_Value avs_new_value_error(const char * v0) + { AVS_Value v; v.type = 'e'; v.d.string = v0; return v; } +#ifndef AVSC_NO_DECLSPEC +AVSC_INLINE AVS_Value avs_new_value_clip(AVS_Clip * v0) + { AVS_Value v; avs_set_to_clip(&v, v0); return v; } +#endif +AVSC_INLINE AVS_Value avs_new_value_array(AVS_Value * v0, int size) + { AVS_Value v; v.type = 'a'; v.d.array = v0; v.array_size = size; return v; } + +///////////////////////////////////////////////////////////////////// +// +// AVS_Clip +// +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(void, avs_release_clip)(AVS_Clip *); +AVSC_API(AVS_Clip *, avs_copy_clip)(AVS_Clip *); + +AVSC_API(const char *, avs_clip_get_error)(AVS_Clip *); // return 0 if no error + +AVSC_API(const AVS_VideoInfo *, avs_get_video_info)(AVS_Clip *); + +AVSC_API(int, avs_get_version)(AVS_Clip *); + +AVSC_API(AVS_VideoFrame *, avs_get_frame)(AVS_Clip *, int n); +// The returned video frame must be released with avs_release_video_frame + +AVSC_API(int, avs_get_parity)(AVS_Clip *, int n); +// return field parity if field_based, else parity of first field in frame + +AVSC_API(int, avs_get_audio)(AVS_Clip *, void * buf, + INT64 start, INT64 count); +// start and count are in samples + +AVSC_API(int, avs_set_cache_hints)(AVS_Clip *, + int cachehints, size_t frame_range); +#if defined __cplusplus +} +#endif // __cplusplus + +// This is the callback type used by avs_add_function +typedef AVS_Value (AVSC_CC * AVS_ApplyFunc) + (AVS_ScriptEnvironment *, AVS_Value args, void * user_data); + +typedef struct AVS_FilterInfo AVS_FilterInfo; +struct AVS_FilterInfo +{ + // these members should not be modified outside of the AVS_ApplyFunc callback + AVS_Clip * child; + AVS_VideoInfo vi; + AVS_ScriptEnvironment * env; + AVS_VideoFrame * (AVSC_CC * get_frame)(AVS_FilterInfo *, int n); + int (AVSC_CC * get_parity)(AVS_FilterInfo *, int n); + int (AVSC_CC * get_audio)(AVS_FilterInfo *, void * buf, + INT64 start, INT64 count); + int (AVSC_CC * set_cache_hints)(AVS_FilterInfo *, int cachehints, + int frame_range); + void (AVSC_CC * free_filter)(AVS_FilterInfo *); + + // Should be set when ever there is an error to report. + // It is cleared before any of the above methods are called + const char * error; + // this is to store whatever and may be modified at will + void * user_data; +}; + +// Create a new filter +// fi is set to point to the AVS_FilterInfo so that you can +// modify it once it is initilized. +// store_child should generally be set to true. If it is not +// set than ALL methods (the function pointers) must be defined +// If it is set than you do not need to worry about freeing the child +// clip. +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(AVS_Clip *, avs_new_c_filter)(AVS_ScriptEnvironment * e, + AVS_FilterInfo * * fi, + AVS_Value child, int store_child); +#if defined __cplusplus +} +#endif // __cplusplus + + +///////////////////////////////////////////////////////////////////// +// +// AVS_ScriptEnvironment +// + +// For GetCPUFlags. These are backwards-compatible with those in VirtualDub. 
+enum { + /* slowest CPU to support extension */ + AVS_CPU_FORCE = 0x01, // N/A + AVS_CPU_FPU = 0x02, // 386/486DX + AVS_CPU_MMX = 0x04, // P55C, K6, PII + AVS_CPU_INTEGER_SSE = 0x08, // PIII, Athlon + AVS_CPU_SSE = 0x10, // PIII, Athlon XP/MP + AVS_CPU_SSE2 = 0x20, // PIV, Hammer + AVS_CPU_3DNOW = 0x40, // K6-2 + AVS_CPU_3DNOW_EXT = 0x80, // Athlon + AVS_CPU_X86_64 = 0xA0, // Hammer (note: equiv. to 3DNow + SSE2, + // which only Hammer will have anyway) +}; + +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(const char *, avs_get_error)(AVS_ScriptEnvironment *); // return 0 if no error + +AVSC_API(long, avs_get_cpu_flags)(AVS_ScriptEnvironment *); +AVSC_API(int, avs_check_version)(AVS_ScriptEnvironment *, int version); + +AVSC_API(char *, avs_save_string)(AVS_ScriptEnvironment *, const char* s, int length); +AVSC_API(char *, avs_sprintf)(AVS_ScriptEnvironment *, const char * fmt, ...); + +AVSC_API(char *, avs_vsprintf)(AVS_ScriptEnvironment *, const char * fmt, va_list val); + // note: val is really a va_list; I hope everyone typedefs va_list to a pointer + +AVSC_API(int, avs_add_function)(AVS_ScriptEnvironment *, + const char * name, const char * params, + AVS_ApplyFunc apply, void * user_data); + +AVSC_API(int, avs_function_exists)(AVS_ScriptEnvironment *, const char * name); + +AVSC_API(AVS_Value, avs_invoke)(AVS_ScriptEnvironment *, const char * name, + AVS_Value args, const char** arg_names); +// The returned value must be be released with avs_release_value + +AVSC_API(AVS_Value, avs_get_var)(AVS_ScriptEnvironment *, const char* name); +// The returned value must be be released with avs_release_value + +AVSC_API(int, avs_set_var)(AVS_ScriptEnvironment *, const char* name, AVS_Value val); + +AVSC_API(int, avs_set_global_var)(AVS_ScriptEnvironment *, const char* name, const AVS_Value val); + +//void avs_push_context(AVS_ScriptEnvironment *, int level=0); +//void avs_pop_context(AVS_ScriptEnvironment *); + +AVSC_API(AVS_VideoFrame *, avs_new_video_frame_a)(AVS_ScriptEnvironment *, + const AVS_VideoInfo * vi, int align); +// align should be at least 16 +#if defined __cplusplus +} +#endif // __cplusplus + +#ifndef AVSC_NO_DECLSPEC +AVSC_INLINE +AVS_VideoFrame * avs_new_video_frame(AVS_ScriptEnvironment * env, + const AVS_VideoInfo * vi) + {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);} + +AVSC_INLINE +AVS_VideoFrame * avs_new_frame(AVS_ScriptEnvironment * env, + const AVS_VideoInfo * vi) + {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);} +#endif + +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(int, avs_make_writable)(AVS_ScriptEnvironment *, AVS_VideoFrame * * pvf); + +AVSC_API(void, avs_bit_blt)(AVS_ScriptEnvironment *, unsigned char* dstp, int dst_pitch, const unsigned char* srcp, int src_pitch, int row_size, int height); + +typedef void (AVSC_CC *AVS_ShutdownFunc)(void* user_data, AVS_ScriptEnvironment * env); +AVSC_API(void, avs_at_exit)(AVS_ScriptEnvironment *, AVS_ShutdownFunc function, void * user_data); + +AVSC_API(AVS_VideoFrame *, avs_subframe)(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height); +// The returned video frame must be be released + +AVSC_API(int, avs_set_memory_max)(AVS_ScriptEnvironment *, int mem); + +AVSC_API(int, avs_set_working_dir)(AVS_ScriptEnvironment *, const char * newdir); + +// avisynth.dll exports this; it's a way to use it as a library, without +// writing an AVS script or without going through AVIFile. 
+AVSC_API(AVS_ScriptEnvironment *, avs_create_script_environment)(int version); +#if defined __cplusplus +} +#endif // __cplusplus + +// this symbol is the entry point for the plugin and must +// be defined +AVSC_EXPORT +const char * AVSC_CC avisynth_c_plugin_init(AVS_ScriptEnvironment* env); + + +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(void, avs_delete_script_environment)(AVS_ScriptEnvironment *); + + +AVSC_API(AVS_VideoFrame *, avs_subframe_planar)(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height, int rel_offsetU, int rel_offsetV, int new_pitchUV); +// The returned video frame must be be released +#if defined __cplusplus +} +#endif // __cplusplus + +#endif //__AVXSYNTH_C__
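The comments in the header above spell out the ownership rules of this C API: AVS_Value is treated as a "fat pointer" that must be released with avs_release_value, and every clip or frame the library returns has a matching release call. The following minimal host-program sketch (not part of the snapshot) illustrates that discipline; it only calls functions declared above, the AVS_INTERFACE_25 constant is assumed to come from the earlier, unshown part of the header, and the script path is a placeholder.

/* Illustrative sketch only -- shows the release discipline described in the
   comments above.  Header name, AVS_INTERFACE_25 and "test.avs" are assumptions. */
#include <stdio.h>
#include "avxsynth_c.h"                 /* assumed name of the header shown above */

int main(void)
{
    AVS_ScriptEnvironment *env = avs_create_script_environment(AVS_INTERFACE_25);
    if (!env)
        return 1;

    AVS_Value arg = avs_new_value_string("test.avs");      /* hypothetical script */
    AVS_Value res = avs_invoke(env, "Import", arg, NULL);  /* returns a clip or an error */

    if (avs_is_error(res)) {
        fprintf(stderr, "Import failed: %s\n", avs_as_error(res));
    } else if (avs_is_clip(res)) {
        AVS_Clip *clip = avs_take_clip(res, env);           /* takes a reference */
        const AVS_VideoInfo *vi = avs_get_video_info(clip);

        AVS_VideoFrame *frame = avs_get_frame(clip, 0);     /* must be released */
        printf("%dx%d, pitch %d\n", vi->width, vi->height, avs_get_pitch(frame));

        avs_release_video_frame(frame);
        avs_release_clip(clip);
    }

    avs_release_value(res);             /* treat AVS_Value like a pointer: always release */
    avs_delete_script_environment(env);
    return 0;
}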
View file
x264-snapshot-20130723-2245.tar.bz2/extras/cl.h
Added
@@ -0,0 +1,1209 @@ +/******************************************************************************* + * Copyright (c) 2008 - 2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#include "cl_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +typedef cl_bitfield cl_command_queue_properties; +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +typedef cl_bitfield cl_mem_migration_flags; +typedef cl_uint cl_image_info; +typedef cl_uint cl_buffer_create_type; +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +typedef cl_uint cl_program_binary_type; +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +typedef cl_uint cl_kernel_work_group_info; +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; + + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; + cl_mem buffer; +} cl_image_desc; + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define 
CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#define CL_INVALID_PROPERTY -64 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 + +/* OpenCL Version */ +#define CL_VERSION_1_0 1 +#define CL_VERSION_1_1 1 +#define CL_VERSION_1_2 1 + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#define CL_DEVICE_TYPE_CUSTOM (1 << 4) +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F +#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A +#define CL_DEVICE_NAME 0x102B +#define 
CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D +#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */ +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D +#define CL_DEVICE_LINKER_AVAILABLE 0x103E +#define CL_DEVICE_BUILT_IN_KERNELS 0x103F +#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 +#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 +#define CL_DEVICE_PARENT_DEVICE 0x1042 +#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 +#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 +#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 +#define CL_DEVICE_PARTITION_TYPE 0x1046 +#define CL_DEVICE_REFERENCE_COUNT 0x1047 +#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 +#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B + +/* cl_device_fp_config - bitfield */ +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#define CL_FP_SOFT_FLOAT (1 << 6) +#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) + +/* cl_device_mem_cache_type */ +#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 + +/* cl_device_local_mem_type */ +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 + +/* cl_device_exec_capabilities - bitfield */ +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#define CL_CONTEXT_NUM_DEVICES 0x1083 + +/* cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 +#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 + +/* cl_device_partition_property */ +#define CL_DEVICE_PARTITION_EQUALLY 0x1086 +#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 +#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 + +/* cl_device_affinity_domain */ +#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) +#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) +#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) +#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) +#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) +#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) + +/* cl_command_queue_info */ +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 + +/* cl_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE (1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define 
CL_MEM_COPY_HOST_PTR (1 << 5) +// reserved (1 << 6) +#define CL_MEM_HOST_WRITE_ONLY (1 << 7) +#define CL_MEM_HOST_READ_ONLY (1 << 8) +#define CL_MEM_HOST_NO_ACCESS (1 << 9) + +/* cl_mem_migration_flags - bitfield */ +#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) +#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) + +/* cl_channel_order */ +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 +#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC +#define CL_DEPTH 0x10BD +#define CL_DEPTH_STENCIL 0x10BE + +/* cl_channel_type */ +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE +#define CL_UNORM_INT24 0x10DF + +/* cl_mem_object_type */ +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 +#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 +#define CL_MEM_OBJECT_IMAGE1D 0x10F4 +#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 +#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 + +/* cl_image_info */ +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 +#define CL_IMAGE_DEPTH 0x1116 +#define CL_IMAGE_ARRAY_SIZE 0x1117 +#define CL_IMAGE_BUFFER 0x1118 +#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 +#define CL_IMAGE_NUM_SAMPLES 0x111A + +/* cl_addressing_mode */ +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 + +/* cl_filter_mode */ +#define CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +/* cl_sampler_info */ +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 + +/* cl_map_flags - bitfield */ +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) +#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 +#define CL_PROGRAM_NUM_KERNELS 0x1167 +#define CL_PROGRAM_KERNEL_NAMES 0x1168 + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 +#define CL_PROGRAM_BINARY_TYPE 0x1184 + +/* cl_program_binary_type */ +#define 
CL_PROGRAM_BINARY_TYPE_NONE 0x0 +#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 +#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 +#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 + +/* cl_build_status */ +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 +#define CL_KERNEL_ATTRIBUTES 0x1195 + +/* cl_kernel_arg_info */ +#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 +#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 +#define CL_KERNEL_ARG_TYPE_NAME 0x1198 +#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 +#define CL_KERNEL_ARG_NAME 0x119A + +/* cl_kernel_arg_address_qualifier */ +#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B +#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C +#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D +#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E + +/* cl_kernel_arg_access_qualifier */ +#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 +#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 +#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 +#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 + +/* cl_kernel_arg_type_qualifer */ +#define CL_KERNEL_ARG_TYPE_NONE 0 +#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) +#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) +#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 +#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 + +/* cl_event_info */ +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#define CL_EVENT_CONTEXT 0x11D4 + +/* cl_command_type */ +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define CL_COMMAND_WRITE_IMAGE 0x11F7 +#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 +#define CL_COMMAND_BARRIER 0x1205 +#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 +#define CL_COMMAND_FILL_BUFFER 0x1207 +#define CL_COMMAND_FILL_IMAGE 0x1208 + +/* command execution status */ +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +/* cl_buffer_create_type */ +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +/* cl_profiling_info */ +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 + +/********************************************************************************************************/ + 
+/* Platform API */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id /* platform */, + cl_platform_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Device APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id /* platform */, + cl_device_type /* device_type */, + cl_uint /* num_entries */, + cl_device_id * /* devices */, + cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id /* device */, + cl_device_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevices(cl_device_id /* in_device */, + const cl_device_partition_property * /* properties */, + cl_uint /* num_devices */, + cl_device_id * /* out_devices */, + cl_uint * /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +/* Context APIs */ +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * /* properties */, + cl_uint /* num_devices */, + const cl_device_id * /* devices */, + void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * /* properties */, + cl_device_type /* device_type */, + void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context /* context */, + cl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Command Queue APIs */ +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue_properties /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue /* command_queue */, + cl_command_queue_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Memory Object APIs */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context /* context */, + 
cl_mem_flags /* flags */, + size_t /* size */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateSubBuffer(cl_mem /* buffer */, + cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + const cl_image_desc * /* image_desc */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context /* context */, + cl_mem_flags /* flags */, + cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, + cl_image_format * /* image_formats */, + cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem /* memobj */, + cl_mem_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem /* image */, + cl_image_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorCallback( cl_mem /* memobj */, + void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1; + +/* Sampler APIs */ +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSampler(cl_context /* context */, + cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, + cl_filter_mode /* filter_mode */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler /* sampler */, + cl_sampler_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context /* context */, + cl_uint /* count */, + const char ** /* strings */, + const size_t * /* lengths */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const size_t * /* lengths */, + const unsigned char ** /* binaries */, + cl_int * /* binary_status */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBuiltInKernels(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* kernel_names */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern 
CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCompileProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_headers */, + const cl_program * /* input_headers */, + const char ** /* header_include_names */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_program CL_API_CALL +clLinkProgram(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_programs */, + const cl_program * /* input_programs */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */, + cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2; + + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program /* program */, + cl_program_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program /* program */, + cl_device_id /* device */, + cl_program_build_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program /* program */, + const char * /* kernel_name */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program /* program */, + cl_uint /* num_kernels */, + cl_kernel * /* kernels */, + cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel /* kernel */, + cl_uint /* arg_index */, + size_t /* arg_size */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel /* kernel */, + cl_kernel_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelArgInfo(cl_kernel /* kernel */, + cl_uint /* arg_indx */, + cl_kernel_arg_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel /* 
kernel */, + cl_device_id /* device */, + cl_kernel_work_group_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Event Object APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint /* num_events */, + const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event /* event */, + cl_event_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateUserEvent(cl_context /* context */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetUserEventStatus(cl_event /* event */, + cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetEventCallback( cl_event /* event */, + cl_int /* command_exec_callback_type */, + void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +/* Profiling APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event /* event */, + cl_profiling_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + size_t /* offset */, + size_t /* size */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + size_t /* offset */, + size_t /* size */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* 
buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + size_t /* src_offset */, + size_t /* dst_offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferRect(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin */, + const size_t * /* dst_origin */, + const size_t * /* region */, + size_t /* src_row_pitch */, + size_t /* src_slice_pitch */, + size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_read */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* row_pitch */, + size_t /* slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_write */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + const void * /* fill_color */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_image */, + const size_t * /* src_origin[3] */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin[3] */, + const size_t * /* region[3] */, + size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern 
CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_image */, + size_t /* src_offset */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t * /* image_row_pitch */, + size_t * /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue /* command_queue */, + cl_mem /* memobj */, + void * /* mapped_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueTask(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue /* command_queue */, + void (CL_CALLBACK * /*user_func*/)(void *), + void * /* args */, + size_t /* cb_args */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_list */, + const void ** /* args_mem_loc */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + + +/* Extension function access + * 
+ * Returns the extension function address for the given function name, + * or NULL if a valid function can not be found. The client must + * check to make sure the address is not NULL, before using or + * calling the returned function address. + */ +extern CL_API_ENTRY void * CL_API_CALL +clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */, + const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2; + + +// Deprecated OpenCL 1.1 APIs +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage2D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_row_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage3D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_depth */, + size_t /* image_row_pitch */, + size_t /* image_slice_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue /* command_queue */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue /* command_queue */, + cl_uint /* num_events */, + const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL +clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_H */
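For orientation, the sketch below strings the entry points declared in this header into the usual host-side sequence: pick a platform and device, create a context and queue, build a program, run a kernel, and release every object. It is illustrative only; error handling is abbreviated, and the kernel source and buffer size are made up rather than taken from how x264 itself drives OpenCL.

/* Illustrative only -- a bare-bones host sequence using the declarations above. */
#include <stdio.h>
#include "cl.h"                          /* the copy bundled under extras/ */

static const char *src =
    "__kernel void fill(__global int *buf) { buf[get_global_id(0)] = 42; }";

int main(void)
{
    cl_platform_id platform;
    cl_device_id device;
    cl_int err;

    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);

    cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &err);

    cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
    if (clBuildProgram(prog, 1, &device, NULL, NULL, NULL) != CL_SUCCESS) {
        char log[4096];
        clGetProgramBuildInfo(prog, device, CL_PROGRAM_BUILD_LOG,
                              sizeof(log), log, NULL);
        fprintf(stderr, "build failed:\n%s\n", log);
        return 1;
    }

    cl_kernel kernel = clCreateKernel(prog, "fill", &err);
    cl_mem buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, 256 * sizeof(int), NULL, &err);
    clSetKernelArg(kernel, 0, sizeof(buf), &buf);

    size_t global = 256;
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);

    int out[256];
    clEnqueueReadBuffer(queue, buf, CL_TRUE, 0, sizeof(out), out, 0, NULL, NULL);
    printf("out[0] = %d\n", out[0]);

    /* every created object has a matching release call declared above */
    clReleaseMemObject(buf);
    clReleaseKernel(kernel);
    clReleaseProgram(prog);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
    return 0;
}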
View file
x264-snapshot-20130723-2245.tar.bz2/extras/cl_platform.h
Added
@@ -0,0 +1,1268 @@ +/********************************************************************************** + * Copyright (c) 2008-2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include <AvailabilityMacros.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #ifndef UNAVAILABLE_ATTRIBUTE + #define UNAVAILABLE_ATTRIBUTE + #endif + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #else + #define CL_API_SUFFIX__VERSION_1_0 UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE + #endif + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + #else + #define CL_API_SUFFIX__VERSION_1_1 UNAVAILABLE_ATTRIBUTE + #define GCL_API_SUFFIX__VERSION_1_1 UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATE CL_EXT_SUFFIX__VERSION_1_0 + #endif + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define 
CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + #else + #define CL_API_SUFFIX__VERSION_1_2 UNAVAILABLE_ATTRIBUTE + #define GCL_API_SUFFIX__VERSION_1_2 UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX__VERSION_1_1 + #endif +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_API_SUFFIX__VERSION_1_2 + #define CL_EXT_SUFFIX__VERSION_1_2 + + #ifdef __GNUC__ + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif + #elif _WIN32 + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) + #endif + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif +#endif + +#if (defined (_WIN32) && defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define 
CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561 +#define CL_M_SQRT2 1.414213562373095145475 +#define CL_M_SQRT1_2 0.707106781186547572737 + +#define CL_M_E_F 2.71828174591064f +#define CL_M_LOG2E_F 1.44269502162933f +#define CL_M_LOG10E_F 0.43429449200630f +#define CL_M_LN2_F 0.69314718246460f +#define CL_M_LN10_F 2.30258512496948f +#define CL_M_PI_F 3.14159274101257f +#define CL_M_PI_2_F 1.57079637050629f +#define CL_M_PI_4_F 0.78539818525314f +#define CL_M_1_PI_F 0.31830987334251f +#define CL_M_2_PI_F 0.63661974668503f +#define CL_M_2_SQRTPI_F 1.12837922573090f +#define CL_M_SQRT2_F 1.41421353816986f +#define CL_M_SQRT1_2_F 0.70710676908493f + +#define CL_NAN (CL_INFINITY - CL_INFINITY) +#define CL_HUGE_VALF ((cl_float) 1e50) +#define CL_HUGE_VAL ((cl_double) 1e500) +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#else + +#include <stdint.h> + +/* scalar types */ +typedef int8_t cl_char; +typedef uint8_t cl_uchar; +typedef int16_t cl_short __attribute__((aligned(2))); +typedef uint16_t cl_ushort __attribute__((aligned(2))); +typedef int32_t cl_int __attribute__((aligned(4))); +typedef uint32_t cl_uint __attribute__((aligned(4))); +typedef int64_t cl_long __attribute__((aligned(8))); +typedef uint64_t cl_ulong __attribute__((aligned(8))); + +typedef uint16_t cl_half __attribute__((aligned(2))); +typedef float cl_float __attribute__((aligned(4))); +typedef double cl_double __attribute__((aligned(8))); + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 0x1.fffffep127f +#define CL_FLT_MIN 0x1.0p-126f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define 
CL_DBL_MAX 0x1.fffffffffffffp1023 +#define CL_DBL_MIN 0x1.0p-1022 +#define CL_DBL_EPSILON 0x1.0p-52 + +#define CL_M_E 2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561 +#define CL_M_SQRT2 1.414213562373095145475 +#define CL_M_SQRT1_2 0.707106781186547572737 + +#define CL_M_E_F 2.71828174591064f +#define CL_M_LOG2E_F 1.44269502162933f +#define CL_M_LOG10E_F 0.43429449200630f +#define CL_M_LN2_F 0.69314718246460f +#define CL_M_LN10_F 2.30258512496948f +#define CL_M_PI_F 3.14159274101257f +#define CL_M_PI_2_F 1.57079637050629f +#define CL_M_PI_4_F 0.78539818525314f +#define CL_M_1_PI_F 0.31830987334251f +#define CL_M_2_PI_F 0.63661974668503f +#define CL_M_2_SQRTPI_F 1.12837922573090f +#define CL_M_SQRT2_F 1.41421353816986f +#define CL_M_SQRT1_2_F 0.70710676908493f + +#if defined( __GNUC__ ) + #define CL_HUGE_VALF __builtin_huge_valf() + #define CL_HUGE_VAL __builtin_huge_val() + #define CL_NAN __builtin_nanf( "" ) +#else + #define CL_HUGE_VALF ((cl_float) 1e50) + #define CL_HUGE_VAL ((cl_double) 1e500) + float nanf( const char * ); + #define CL_NAN nanf( "" ) +#endif +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#endif + +#include <stddef.h> + +/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */ +typedef unsigned int cl_GLuint; +typedef int cl_GLint; +typedef unsigned int cl_GLenum; + +/* + * Vector types + * + * Note: OpenCL requires that all types be naturally aligned. + * This means that vector types must be naturally aligned. + * For example, a vector of four floats must be aligned to + * a 16 byte boundary (calculated as 4 * the natural 4-byte + * alignment of the float). The alignment qualifiers here + * will only function properly if your compiler supports them + * and if you don't actively work to defeat them. For example, + * in order for a cl_float4 to be 16 byte aligned in a struct, + * the start of the struct must itself be 16-byte aligned. + * + * Maintaining proper alignment is the user's responsibility. + */ + +/* Define basic vector types */ +#if defined( __VEC__ ) + #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. 
*/ + typedef vector unsigned char __cl_uchar16; + typedef vector signed char __cl_char16; + typedef vector unsigned short __cl_ushort8; + typedef vector signed short __cl_short8; + typedef vector unsigned int __cl_uint4; + typedef vector signed int __cl_int4; + typedef vector float __cl_float4; + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_UINT4__ 1 + #define __CL_INT4__ 1 + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <xmmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef float __cl_float4 __attribute__((vector_size(16))); + #else + typedef __m128 __cl_float4; + #endif + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE2__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <emmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); + typedef cl_char __cl_char16 __attribute__((vector_size(16))); + typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); + typedef cl_short __cl_short8 __attribute__((vector_size(16))); + typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); + typedef cl_int __cl_int4 __attribute__((vector_size(16))); + typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); + typedef cl_long __cl_long2 __attribute__((vector_size(16))); + typedef cl_double __cl_double2 __attribute__((vector_size(16))); + #else + typedef __m128i __cl_uchar16; + typedef __m128i __cl_char16; + typedef __m128i __cl_ushort8; + typedef __m128i __cl_short8; + typedef __m128i __cl_uint4; + typedef __m128i __cl_int4; + typedef __m128i __cl_ulong2; + typedef __m128i __cl_long2; + typedef __m128d __cl_double2; + #endif + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_INT4__ 1 + #define __CL_UINT4__ 1 + #define __CL_ULONG2__ 1 + #define __CL_LONG2__ 1 + #define __CL_DOUBLE2__ 1 +#endif + +#if defined( __MMX__ ) + #include <mmintrin.h> + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); + typedef cl_char __cl_char8 __attribute__((vector_size(8))); + typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); + typedef cl_short __cl_short4 __attribute__((vector_size(8))); + typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); + typedef cl_int __cl_int2 __attribute__((vector_size(8))); + typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); + typedef cl_long __cl_long1 __attribute__((vector_size(8))); + typedef cl_float __cl_float2 __attribute__((vector_size(8))); + #else + typedef __m64 __cl_uchar8; + typedef __m64 __cl_char8; + typedef __m64 __cl_ushort4; + typedef __m64 __cl_short4; + typedef __m64 __cl_uint2; + typedef __m64 __cl_int2; + typedef __m64 __cl_ulong1; + typedef __m64 __cl_long1; + typedef __m64 __cl_float2; + #endif + #define __CL_UCHAR8__ 1 + #define __CL_CHAR8__ 1 + #define __CL_USHORT4__ 1 + #define __CL_SHORT4__ 1 + #define __CL_INT2__ 1 + #define __CL_UINT2__ 1 + #define __CL_ULONG1__ 1 + #define __CL_LONG1__ 1 + #define __CL_FLOAT2__ 1 +#endif + +#if defined( __AVX__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <immintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_float __cl_float8 __attribute__((vector_size(32))); + typedef cl_double __cl_double4 __attribute__((vector_size(32))); + #else + typedef __m256 __cl_float8; + typedef __m256d __cl_double4; + #endif + 
#define __CL_FLOAT8__ 1 + #define __CL_DOUBLE4__ 1 +#endif + +/* Define alignment keys */ +#if defined( __GNUC__ ) + #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) +#elif defined( _WIN32) && (_MSC_VER) + /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ + /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ + /* #include <crtdefs.h> */ + /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ + #define CL_ALIGNED(_x) +#else + #warning Need to implement some method to align data here + #define CL_ALIGNED(_x) +#endif + +/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + /* .xyzw and .s0123...{f|F} are supported */ + #define CL_HAS_NAMED_VECTOR_FIELDS 1 + /* .hi and .lo are supported */ + #define CL_HAS_HI_LO_VECTOR_FIELDS 1 +#endif + +/* Define cl_vector types */ + +/* ---- cl_charn ---- */ +typedef union +{ + cl_char CL_ALIGNED(2) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y; }; + __extension__ struct{ cl_char s0, s1; }; + __extension__ struct{ cl_char lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2; +#endif +}cl_char2; + +typedef union +{ + cl_char CL_ALIGNED(4) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y, z, w; }; + __extension__ struct{ cl_char s0, s1, s2, s3; }; + __extension__ struct{ cl_char2 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[2]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4; +#endif +}cl_char4; + +/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y, z, w; }; + __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y; }; + __extension__ struct{ cl_uchar s0, s1; }; + __extension__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y, z, w; }; + __extension__ struct{ cl_uchar s0, s1, s2, s3; }; + __extension__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y, z, w; }; + __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y; }; + __extension__ struct{ cl_short s0, s1; }; + __extension__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y, z, w; }; + __extension__ struct{ cl_short s0, s1, s2, s3; }; + __extension__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y, z, w; }; + __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y; }; + __extension__ struct{ cl_ushort s0, s1; }; + __extension__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y, z, w; }; + __extension__ struct{ cl_ushort s0, s1, s2, s3; }; + __extension__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y, z, w; }; + __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y; }; + __extension__ struct{ cl_int s0, s1; }; + __extension__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y, z, w; }; + __extension__ struct{ cl_int s0, s1, s2, s3; }; + __extension__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y, z, w; }; + __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y; }; + __extension__ struct{ cl_uint s0, s1; }; + __extension__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y, z, w; }; + __extension__ struct{ cl_uint s0, s1, s2, s3; }; + __extension__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y, z, w; }; + __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y; }; + __extension__ struct{ cl_long s0, s1; }; + __extension__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y, z, w; }; + __extension__ struct{ cl_long s0, s1, s2, s3; }; + __extension__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y, z, w; }; + __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y; }; + __extension__ struct{ cl_ulong s0, s1; }; + __extension__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y, z, w; }; + __extension__ struct{ cl_ulong s0, s1, s2, s3; }; + __extension__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y, z, w; }; + __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y; }; + __extension__ struct{ cl_float s0, s1; }; + __extension__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y, z, w; }; + __extension__ struct{ cl_float s0, s1, s2, s3; }; + __extension__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y, z, w; }; + __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y; }; + __extension__ struct{ cl_double s0, s1; }; + __extension__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y, z, w; }; + __extension__ struct{ cl_double s0, s1, s2, s3; }; + __extension__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y, z, w; }; + __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. + * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" + * Each line thereafter of OpenCL C source must end with: \n\ + * The last line ends in "; + * + * Example: + * + * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ + * kernel void foo( int a, float * b ) \n\ + * { \n\ + * // my comment \n\ + * *b[ get_global_id(0)] = a; \n\ + * } \n\ + * "; + * + * This should correctly set up the line, (column) and file information for your source + * string so you can do source level debugging. + */ +#define __CL_STRINGIFY( _x ) # _x +#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) +#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" + +#ifdef __cplusplus +} +#endif + +#endif /* __CL_PLATFORM_H */
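The vector unions declared above expose the same storage three ways: the always-available .s[] array, the named fields (.x/.y/.z/.w, .s0…, .lo/.hi) that exist only when CL_HAS_NAMED_VECTOR_FIELDS is defined (GCC without strict-ANSI mode), and the optional native SIMD views (.v2/.v4/…). A minimal sketch of that layout, assuming the bundled extras/cl_platform.h is on the include path; the program itself is illustrative and not part of the snapshot:

#include <stdio.h>
#include "cl_platform.h"

int main( void )
{
    cl_float4 v;
    /* .s[] is portable and always present */
    for( int i = 0; i < 4; i++ )
        v.s[i] = (cl_float)i;
#ifdef CL_HAS_NAMED_VECTOR_FIELDS
    /* .x/.y/.z/.w and .s0..s3 alias the same 16 bytes */
    printf( "x=%g w=%g s2=%g\n", v.x, v.w, v.s2 );
    /* .lo/.hi are cl_float2 views of s[0..1] and s[2..3] */
    printf( "hi = (%g, %g)\n", v.hi.s0, v.hi.s1 );
#else
    printf( "s0=%g s3=%g\n", v.s[0], v.s[3] );
#endif
    return 0;
}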
View file
x264-snapshot-20130723-2245.tar.bz2/extras/windowsPorts
Added
+(directory)
View file
x264-snapshot-20130723-2245.tar.bz2/extras/windowsPorts/basicDataTypeConversions.h
Added
@@ -0,0 +1,85 @@ +#ifndef __DATA_TYPE_CONVERSIONS_H__ +#define __DATA_TYPE_CONVERSIONS_H__ + +#include <stdint.h> +#include <wchar.h> + +#ifdef __cplusplus +namespace avxsynth { +#endif // __cplusplus + +typedef int64_t __int64; +typedef int32_t __int32; +#ifdef __cplusplus +typedef bool BOOL; +#else +typedef uint32_t BOOL; +#endif // __cplusplus +typedef void* HMODULE; +typedef void* LPVOID; +typedef void* PVOID; +typedef PVOID HANDLE; +typedef HANDLE HWND; +typedef HANDLE HINSTANCE; +typedef void* HDC; +typedef void* HBITMAP; +typedef void* HICON; +typedef void* HFONT; +typedef void* HGDIOBJ; +typedef void* HBRUSH; +typedef void* HMMIO; +typedef void* HACMSTREAM; +typedef void* HACMDRIVER; +typedef void* HIC; +typedef void* HACMOBJ; +typedef HACMSTREAM* LPHACMSTREAM; +typedef void* HACMDRIVERID; +typedef void* LPHACMDRIVER; +typedef unsigned char BYTE; +typedef BYTE* LPBYTE; +typedef char TCHAR; +typedef TCHAR* LPTSTR; +typedef const TCHAR* LPCTSTR; +typedef char* LPSTR; +typedef LPSTR LPOLESTR; +typedef const char* LPCSTR; +typedef LPCSTR LPCOLESTR; +typedef wchar_t WCHAR; +typedef unsigned short WORD; +typedef unsigned int UINT; +typedef UINT MMRESULT; +typedef uint32_t DWORD; +typedef DWORD COLORREF; +typedef DWORD FOURCC; +typedef DWORD HRESULT; +typedef DWORD* LPDWORD; +typedef DWORD* DWORD_PTR; +typedef int32_t LONG; +typedef int32_t* LONG_PTR; +typedef LONG_PTR LRESULT; +typedef uint32_t ULONG; +typedef uint32_t* ULONG_PTR; +//typedef __int64_t intptr_t; +typedef uint64_t _fsize_t; + + +// +// Structures +// + +typedef struct _GUID { + DWORD Data1; + WORD Data2; + WORD Data3; + BYTE Data4[8]; +} GUID; + +typedef GUID REFIID; +typedef GUID CLSID; +typedef CLSID* LPCLSID; +typedef GUID IID; + +#ifdef __cplusplus +}; // namespace avxsynth +#endif // __cplusplus +#endif // __DATA_TYPE_CONVERSIONS_H__
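The header above pins the usual Win32 aliases to fixed-width types, so code written against DWORD, LONG or GUID keeps 32-bit semantics on LP64 Linux, where a native long would be 64 bits wide (compiled as C++ everything additionally lands in the avxsynth namespace). A small, purely illustrative sanity check, assuming the header is reachable at the path shown:

#include <stdio.h>
#include "extras/windowsPorts/basicDataTypeConversions.h"

int main( void )
{
    DWORD d = 0xFFFFFFFFu;  /* uint32_t, not the platform's unsigned long */
    LONG  l = -1;           /* int32_t even on LP64, matching Win32 LONG */
    GUID  g = { 0x12345678, 0x9abc, 0xdef0, { 0, 1, 2, 3, 4, 5, 6, 7 } };
    printf( "sizeof(DWORD)=%zu sizeof(LONG)=%zu sizeof(GUID)=%zu d=%u l=%d Data1=%#x\n",
            sizeof d, sizeof l, sizeof g, (unsigned)d, (int)l, (unsigned)g.Data1 );
    return 0;
}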
View file
x264-snapshot-20130723-2245.tar.bz2/extras/windowsPorts/windows2linux.h
Added
@@ -0,0 +1,77 @@ +#ifndef __WINDOWS2LINUX_H__ +#define __WINDOWS2LINUX_H__ + +/* + * LINUX SPECIFIC DEFINITIONS +*/ +// +// Data types conversions +// +#include <stdlib.h> +#include <string.h> +#include "basicDataTypeConversions.h" + +#ifdef __cplusplus +namespace avxsynth { +#endif // __cplusplus +// +// purposefully define the following MSFT definitions +// to mean nothing (as they do not mean anything on Linux) +// +#define __stdcall +#define __cdecl +#define noreturn +#define __declspec(x) +#define STDAPI extern "C" HRESULT +#define STDMETHODIMP HRESULT __stdcall +#define STDMETHODIMP_(x) x __stdcall + +#define STDMETHOD(x) virtual HRESULT x +#define STDMETHOD_(a, x) virtual a x + +#ifndef TRUE +#define TRUE true +#endif + +#ifndef FALSE +#define FALSE false +#endif + +#define S_OK (0x00000000) +#define S_FALSE (0x00000001) +#define E_NOINTERFACE (0X80004002) +#define E_POINTER (0x80004003) +#define E_FAIL (0x80004005) +#define E_OUTOFMEMORY (0x8007000E) + +#define INVALID_HANDLE_VALUE ((HANDLE)((LONG_PTR)-1)) +#define FAILED(hr) ((hr) & 0x80000000) +#define SUCCEEDED(hr) (!FAILED(hr)) + + +// +// Functions +// +#define MAKEDWORD(a,b,c,d) ((a << 24) | (b << 16) | (c << 8) | (d)) +#define MAKEWORD(a,b) ((a << 8) | (b)) + +#define lstrlen strlen +#define lstrcpy strcpy +#define lstrcmpi strcasecmp +#define _stricmp strcasecmp +#define InterlockedIncrement(x) __sync_fetch_and_add((x), 1) +#define InterlockedDecrement(x) __sync_fetch_and_sub((x), 1) +// Windows uses (new, old) ordering but GCC has (old, new) +#define InterlockedCompareExchange(x,y,z) __sync_val_compare_and_swap(x,z,y) + +#define UInt32x32To64(a, b) ( (uint64_t) ( ((uint64_t)((uint32_t)(a))) * ((uint32_t)(b)) ) ) +#define Int64ShrlMod32(a, b) ( (uint64_t) ( (uint64_t)(a) >> (b) ) ) +#define Int32x32To64(a, b) ((__int64)(((__int64)((long)(a))) * ((long)(b)))) + +#define MulDiv(nNumber, nNumerator, nDenominator) (int32_t) (((int64_t) (nNumber) * (int64_t) (nNumerator) + (int64_t) ((nDenominator)/2)) / (int64_t) (nDenominator)) + +#ifdef __cplusplus +}; // namespace avxsynth +#endif // __cplusplus + +#endif // __WINDOWS2LINUX_H__
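Two details in the shims above deserve a note: InterlockedCompareExchange swaps its last two arguments because Win32 passes (destination, new, old) while __sync_val_compare_and_swap expects (pointer, old, new); and __sync_fetch_and_add/__sync_fetch_and_sub return the value before the update, whereas the Win32 Interlocked functions return the result, so only callers that ignore the return value behave identically. MulDiv, meanwhile, adds half the denominator before the 64-bit division to get round-to-nearest instead of truncation. A self-contained illustration of that rounding (the macro is restated verbatim so the snippet builds without the header; the sample values are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* same definition as in windows2linux.h above */
#define MulDiv(nNumber, nNumerator, nDenominator) (int32_t) (((int64_t) (nNumber) * (int64_t) (nNumerator) + (int64_t) ((nDenominator)/2)) / (int64_t) (nDenominator))

int main( void )
{
    printf( "%d\n", MulDiv( 1, 90000, 1001 ) ); /* (90000+500)/1001 = 90; plain truncating division gives 89 */
    printf( "%d\n", MulDiv( 7, 10, 4 ) );       /* 72/4 = 18, i.e. 17.5 rounded up */
    printf( "%d\n", MulDiv( -7, 10, 4 ) );      /* (-70+2)/4 = -17: rounding is asymmetric for negative products */
    return 0;
}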
View file
x264-snapshot-20130224-2245.tar.bz2/input/avs.c -> x264-snapshot-20130723-2245.tar.bz2/input/avs.c
Changed
@@ -24,12 +24,30 @@ *****************************************************************************/ #include "input.h" +#if USE_AVXSYNTH +#include <dlfcn.h> +#if SYS_MACOSX +#define avs_open dlopen( "libavxsynth.dylib", RTLD_NOW ) +#else +#define avs_open dlopen( "libavxsynth.so", RTLD_NOW ) +#endif +#define avs_close dlclose +#define avs_address dlsym +#else #include <windows.h> +#define avs_open LoadLibrary( "avisynth" ) +#define avs_close FreeLibrary +#define avs_address GetProcAddress +#endif #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "avs", __VA_ARGS__ ) #define AVSC_NO_DECLSPEC #undef EXTERN_C +#if USE_AVXSYNTH +#include "extras/avxsynth_c.h" +#else #include "extras/avisynth_c.h" +#endif #define AVSC_DECLARE_FUNC(name) name##_func name /* AVS uses a versioned interface to control backwards compatibility */ @@ -40,12 +58,20 @@ #include <libavutil/pixfmt.h> #endif +/* AvxSynth doesn't have yv24, yv16, yv411, or y8, so disable them. */ +#if USE_AVXSYNTH +#define avs_is_yv24( vi ) 0 +#define avs_is_yv16( vi ) 0 +#define avs_is_yv411( vi ) 0 +#define avs_is_y8( vi ) 0 +#endif + /* maximum size of the sequence of filters to try on non script files */ #define AVS_MAX_SEQUENCE 5 #define LOAD_AVS_FUNC(name, continue_on_fail)\ {\ - h->func.name = (void*)GetProcAddress( h->library, #name );\ + h->func.name = (void*)avs_address( h->library, #name );\ if( !continue_on_fail && !h->func.name )\ goto fail;\ } @@ -76,7 +102,7 @@ /* load the library and functions we require from it */ static int x264_avs_load_library( avs_hnd_t *h ) { - h->library = LoadLibrary( "avisynth" ); + h->library = avs_open; if( !h->library ) return -1; LOAD_AVS_FUNC( avs_clip_get_error, 0 ); @@ -93,7 +119,7 @@ LOAD_AVS_FUNC( avs_take_clip, 0 ); return 0; fail: - FreeLibrary( h->library ); + avs_close( h->library ); return -1; } @@ -101,6 +127,9 @@ static void avs_build_filter_sequence( char *filename_ext, const char *filter[AVS_MAX_SEQUENCE+1] ) { int i = 0; +#if USE_AVXSYNTH + const char *all_purpose[] = { "FFVideoSource", 0 }; +#else const char *all_purpose[] = { "FFmpegSource2", "DSS2", "DirectShowSource", 0 }; if( !strcasecmp( filename_ext, "avi" ) ) filter[i++] = "AVISource"; @@ -108,6 +137,7 @@ filter[i++] = "MPEG2Source"; if( !strcasecmp( filename_ext, "dga" ) ) filter[i++] = "AVCSource"; +#endif for( int j = 0; all_purpose[j] && i < AVS_MAX_SEQUENCE; j++ ) filter[i++] = all_purpose[j]; } @@ -123,6 +153,13 @@ static float get_avs_version( avs_hnd_t *h ) { +/* AvxSynth has its version defined starting at 4.0, even though it's based on + AviSynth 2.5.8. This is troublesome for get_avs_version and working around + the new colorspaces in 2.6. So if AvxSynth is detected, explicitly define + the version as 2.58. 
*/ +#if USE_AVXSYNTH + return 2.58f; +#else FAIL_IF_ERROR( !h->func.avs_function_exists( h->env, "VersionNumber" ), "VersionNumber does not exist\n" ) AVS_Value ver = h->func.avs_invoke( h->env, "VersionNumber", avs_new_value_array( NULL, 0 ), NULL ); FAIL_IF_ERROR( avs_is_error( ver ), "unable to determine avisynth version: %s\n", avs_as_error( ver ) ) @@ -130,6 +167,7 @@ float ret = avs_as_float( ver ); h->func.avs_release_value( ver ); return ret; +#endif } static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt ) @@ -219,11 +257,11 @@ } #if !HAVE_SWSCALE /* if swscale is not available, convert the CSP if necessary */ + FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444), + "avisynth >= 2.6 is required for i422/i444 output\n" ) if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I422 && !avs_is_yv16( vi )) || (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) || (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) ) { - FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444), - "avisynth >= 2.6 is required for i422/i444 output\n" ) const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" : opt->output_csp == X264_CSP_I422 ? "YV16" : @@ -270,6 +308,7 @@ opt->input_range = opt->output_range; } #endif + h->func.avs_release_value( res ); info->width = vi->width; @@ -357,7 +396,7 @@ h->func.avs_release_clip( h->clip ); if( h->func.avs_delete_script_environment ) h->func.avs_delete_script_environment( h->env ); - FreeLibrary( h->library ); + avs_close( h->library ); free( h ); return 0; }
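The hunks above let the AviSynth input module load either avisynth.dll through LoadLibrary/GetProcAddress or AvxSynth's libavxsynth.so/.dylib through dlopen/dlsym, hidden behind the avs_open/avs_address/avs_close macros. A stripped-down sketch of that pattern, with hypothetical lib_* names standing in for the macros and only minimal error handling:

#if defined(_WIN32)
#include <windows.h>
#define lib_open( name )    LoadLibrary( name )
#define lib_symbol( h, s )  (void*)GetProcAddress( h, s )
#define lib_close( h )      FreeLibrary( h )
typedef HMODULE lib_handle_t;
#else
#include <dlfcn.h>
#define lib_open( name )    dlopen( name, RTLD_NOW )
#define lib_symbol( h, s )  dlsym( h, s )
#define lib_close( h )      dlclose( h )
typedef void *lib_handle_t;
#endif

#include <stdio.h>

int main( void )
{
    /* library and symbol names follow the diff above; adjust for the local install */
    lib_handle_t h = lib_open( "libavxsynth.so" );
    if( !h )
    {
        fprintf( stderr, "could not load the library\n" );
        return 1;
    }
    void *sym = lib_symbol( h, "avs_create_script_environment" );
    printf( "avs_create_script_environment: %s\n", sym ? "found" : "missing" );
    lib_close( h );
    return 0;
}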
View file
x264-snapshot-20130224-2245.tar.bz2/input/lavf.c -> x264-snapshot-20130723-2245.tar.bz2/input/lavf.c
Changed
@@ -183,8 +183,8 @@
     h->stream_id = i;
     h->next_frame = 0;
     AVCodecContext *c = h->lavf->streams[i]->codec;
-    info->fps_num = h->lavf->streams[i]->r_frame_rate.num;
-    info->fps_den = h->lavf->streams[i]->r_frame_rate.den;
+    info->fps_num = h->lavf->streams[i]->avg_frame_rate.num;
+    info->fps_den = h->lavf->streams[i]->avg_frame_rate.den;
     info->timebase_num = h->lavf->streams[i]->time_base.num;
     info->timebase_den = h->lavf->streams[i]->time_base.den;
     /* lavf is thread unsafe as calling av_read_frame invalidates previously read AVPackets */
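The hunk above switches the reported frame rate from r_frame_rate, which libavformat documents as a guessed base rate, to avg_frame_rate, the container-level average; the latter is usually the safer number for variable-frame-rate input. A hedged sketch for inspecting both fields with a libavformat release of that era (av_register_all() was still required then); error handling is minimal and the file name comes from the command line:

#include <stdio.h>
#include <libavformat/avformat.h>

int main( int argc, char **argv )
{
    if( argc < 2 )
        return 1;
    av_register_all();
    AVFormatContext *fmt = NULL;
    if( avformat_open_input( &fmt, argv[1], NULL, NULL ) < 0 )
        return 1;
    avformat_find_stream_info( fmt, NULL );
    for( unsigned i = 0; i < fmt->nb_streams; i++ )
    {
        AVRational avg = fmt->streams[i]->avg_frame_rate;
        AVRational r   = fmt->streams[i]->r_frame_rate;
        printf( "stream %u: avg_frame_rate %d/%d, r_frame_rate %d/%d\n",
                i, avg.num, avg.den, r.num, r.den );
    }
    avformat_close_input( &fmt );
    return 0;
}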
View file
x264-snapshot-20130224-2245.tar.bz2/input/y4m.c -> x264-snapshot-20130723-2245.tar.bz2/input/y4m.c
Changed
@@ -46,7 +46,6 @@
 static int parse_csp_and_depth( char *csp_name, int *bit_depth )
 {
     int csp = X264_CSP_MAX;
-    *bit_depth = 8;
 
     /* Set colorspace from known variants */
     if( !strncmp( "420", csp_name, 3 ) )
@@ -57,8 +56,8 @@
         csp = X264_CSP_I444;
 
     /* Set high bit depth from known extensions */
-    if( !strncmp( "p", csp_name + 3, 1 ) )
-        *bit_depth = strtol( csp_name + 4, NULL, 10 );
+    if( sscanf( csp_name, "%*d%*[pP]%d", bit_depth ) != 1 )
+        *bit_depth = 8;
 
     return csp;
 }
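The new sscanf format above, "%*d%*[pP]%d", skips the colorspace digits and an upper- or lower-case depth separator, stores the depth if one follows, and otherwise lets the function fall back to 8 bits; unlike the old strncmp/strtol pair it is not tied to a three-character colorspace name. A self-contained check of that behaviour (the helper name depth_of exists only for this example):

#include <stdio.h>

static int depth_of( const char *csp_name )
{
    int bit_depth;
    /* same parse as in the hunk above */
    if( sscanf( csp_name, "%*d%*[pP]%d", &bit_depth ) != 1 )
        bit_depth = 8;
    return bit_depth;
}

int main( void )
{
    /* expected output: 8 10 16 */
    printf( "%d %d %d\n", depth_of( "420" ), depth_of( "422p10" ), depth_of( "444P16" ) );
    return 0;
}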
View file
x264-snapshot-20130224-2245.tar.bz2/tools/checkasm-a.asm -> x264-snapshot-20130723-2245.tar.bz2/tools/checkasm-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2008-2013 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Henrik Gramner <hengar-6@student.ltu.se>
+;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -88,8 +88,7 @@
 ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
 ;-----------------------------------------------------------------------------
 INIT_XMM
-cglobal checkasm_call, 2,15,16
-    SUB rsp, max_args*8+16
+cglobal checkasm_call, 2,15,16,max_args*8+8
     mov r6, r0
     mov [rsp+max_args*8], r1
 
@@ -158,7 +157,6 @@
     mov dword [r1], 0
     mov rax, r9
 .ok:
-    ADD rsp, max_args*8+16
     RET
 
 %else
@@ -207,8 +205,12 @@
 ; int x264_stack_pagealign( int (*func)(), int align )
 ;-----------------------------------------------------------------------------
 cglobal stack_pagealign, 2,2
+    movsxdifnidn r1, r1d
     push rbp
     mov rbp, rsp
+%if WIN64
+    sub rsp, 32 ; shadow space
+%endif
     and rsp, ~0xfff
     sub rsp, r1
     call r0
View file
x264-snapshot-20130224-2245.tar.bz2/tools/checkasm.c -> x264-snapshot-20130723-2245.tar.bz2/tools/checkasm.c
Changed
@@ -61,7 +61,7 @@ { void *pointer; // just for detecting duplicates uint32_t cpu; - uint32_t cycles; + uint64_t cycles; uint32_t den; } bench_t; @@ -137,12 +137,12 @@ static void print_bench(void) { - uint16_t nops[10000] = {0}; + uint16_t nops[10000]; int nfuncs, nop_time=0; for( int i = 0; i < 10000; i++ ) { - int t = read_time(); + uint32_t t = read_time(); nops[i] = read_time() - t; } qsort( nops, 10000, sizeof(uint16_t), cmp_nop ); @@ -164,6 +164,7 @@ if( k < j ) continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, +#if HAVE_MMX b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" : b->cpu&X264_CPU_AVX2 ? "avx2" : b->cpu&X264_CPU_FMA3 ? "fma3" : @@ -176,21 +177,30 @@ /* print sse2slow only if there's also a sse2fast version of the same func */ b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" : b->cpu&X264_CPU_SSE2 ? "sse2" : + b->cpu&X264_CPU_SSE ? "sse" : b->cpu&X264_CPU_MMX ? "mmx" : +#elif ARCH_PPC b->cpu&X264_CPU_ALTIVEC ? "altivec" : +#elif ARCH_ARM b->cpu&X264_CPU_NEON ? "neon" : - b->cpu&X264_CPU_ARMV6 ? "armv6" : "c", + b->cpu&X264_CPU_ARMV6 ? "armv6" : +#endif + "c", +#if HAVE_MMX b->cpu&X264_CPU_CACHELINE_32 ? "_c32" : + b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" : b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : - b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" : + b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" : b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : b->cpu&X264_CPU_LZCNT ? "_lzcnt" : b->cpu&X264_CPU_BMI2 ? "_bmi2" : - b->cpu&X264_CPU_TBM ? "_tbm" : b->cpu&X264_CPU_BMI1 ? "_bmi1" : - b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" : - b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "", + b->cpu&X264_CPU_SLOW_ATOM ? "_atom" : +#elif ARCH_ARM + b->cpu&X264_CPU_FAST_NEON_MRC ? 
"_fast_mrc" : +#endif + "", ((int64_t)10*b->cycles/b->den - nop_time)/4 ); } } @@ -231,7 +241,7 @@ #define call_bench(func,cpu,...)\ if( do_bench && !strncmp(func_name, bench_pattern, bench_pattern_len) )\ {\ - uint32_t tsum = 0;\ + uint64_t tsum = 0;\ int tcount = 0;\ call_a1(func, __VA_ARGS__);\ for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\ @@ -242,7 +252,7 @@ func(__VA_ARGS__);\ func(__VA_ARGS__);\ t = read_time() - t;\ - if( t*tcount <= tsum*4 && ti > 0 )\ + if( (uint64_t)t*tcount <= tsum*4 && ti > 0 )\ {\ tsum += t;\ tcount++;\ @@ -299,7 +309,7 @@ #define TEST_PIXEL( name, align ) \ ok = 1, used_asm = 0; \ - for( int i = 0; i < 8; i++ ) \ + for( int i = 0; i < ARRAY_ELEMS(pixel_c.name); i++ ) \ { \ int res_c, res_asm; \ if( pixel_asm.name[i] != pixel_ref.name[i] ) \ @@ -337,11 +347,49 @@ TEST_PIXEL( satd, 0 ); TEST_PIXEL( sa8d, 1 ); + ok = 1, used_asm = 0; + if( pixel_asm.sa8d_satd[PIXEL_16x16] != pixel_ref.sa8d_satd[PIXEL_16x16] ) + { + set_func_name( "sa8d_satd_%s", pixel_names[PIXEL_16x16] ); + used_asm = 1; + for( int j = 0; j < 64; j++ ) + { + uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 ); + uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 ); + uint64_t res_a = call_a( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 ); + uint32_t cost8_a = res_a; + uint32_t cost4_a = res_a >> 32; + if( cost8_a != cost8_c || cost4_a != cost4_c ) + { + ok = 0; + fprintf( stderr, "sa8d_satd [%d]: (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16, + cost8_c, cost4_c, cost8_a, cost4_a ); + break; + } + } + for( int j = 0; j < 0x1000 && ok; j += 256 ) \ + { + uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); + uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); + uint64_t res_a = pixel_asm.sa8d_satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); + uint32_t cost8_a = res_a; + uint32_t cost4_a = res_a >> 32; + if( cost8_a != cost8_c || cost4_a != cost4_c ) + { + ok = 0; + fprintf( stderr, "sa8d_satd [%d]: overflow (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16, + cost8_c, cost4_c, cost8_a, cost4_a ); + } + } + } + report( "pixel sa8d_satd :" ); + #define TEST_PIXEL_X( N ) \ ok = 1; used_asm = 0; \ for( int i = 0; i < 7; i++ ) \ { \ - int res_c[4]={0}, res_asm[4]={0}; \ + ALIGNED_16( int res_c[4] ) = {0}; \ + ALIGNED_16( int res_asm[4] ) = {0}; \ if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \ { \ set_func_name( "sad_x%d_%s", N, pixel_names[i] ); \ @@ -494,7 +542,8 @@ #define TEST_INTRA_X3( name, i8x8, ... ) \ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ { \ - int res_c[3], res_asm[3]; \ + ALIGNED_16( int res_c[3] ); \ + ALIGNED_16( int res_asm[3] ); \ set_func_name( #name ); \ used_asm = 1; \ call_c( pixel_c.name, pbuf1+48, i8x8 ? 
edge : pbuf3+48, res_c ); \ @@ -696,8 +745,8 @@ { ALIGNED_16( uint16_t sums[72] ); ALIGNED_16( int dc[4] ); - ALIGNED_16( int16_t mvs_a[32] ); - ALIGNED_16( int16_t mvs_c[32] ); + ALIGNED_16( int16_t mvs_a[48] ); + ALIGNED_16( int16_t mvs_c[48] ); int mvn_a, mvn_c; int thresh = rand() & 0x3fff; set_func_name( "esa_ads" ); @@ -732,10 +781,10 @@ x264_dct_function_t dct_asm; x264_quant_function_t qf; int ret = 0, ok, used_asm, interlace = 0; - ALIGNED_16( dctcoef dct1[16][16] ); - ALIGNED_16( dctcoef dct2[16][16] ); - ALIGNED_16( dctcoef dct4[16][16] ); - ALIGNED_16( dctcoef dct8[4][64] ); + ALIGNED_ARRAY_N( dctcoef, dct1, [16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct2, [16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4, [16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct8, [4],[64] ); ALIGNED_16( dctcoef dctdc[2][8] ); x264_t h_buf; x264_t *h = &h_buf; @@ -1030,7 +1079,7 @@ call_a( zigzag_asm[interlace].name, t2, dct, buf4 ); \ if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( buf3, buf4, 10 ) ) \ { \ - ok = 0; \ + ok = 0; printf("%d: %d %d %d %d\n%d %d %d %d\n\n",memcmp( t1, t2, size*sizeof(dctcoef) ),buf3[0], buf3[1], buf3[8], buf3[9], buf4[0], buf4[1], buf4[8], buf4[9]);break;\ } \ } \ } @@ -1040,13 +1089,13 @@ x264_zigzag_init( cpu_new, &zigzag_asm[0], &zigzag_asm[1] ); ok = 1; used_asm = 0; - TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0], 64 ); + TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct8[0], 64 ); report( "zigzag_interleave :" ); for( interlace = 0; interlace <= 1; interlace++ ) { ok = 1; used_asm = 0; - TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, dct1[0], 8 ); + TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, dct8[0], 8 ); TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 4 ); TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 ); TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 ); @@ -1073,9 +1122,9 @@ int ret = 0, ok, used_asm; - x264_mc_init( 0, &mc_c ); - x264_mc_init( cpu_ref, &mc_ref ); - x264_mc_init( cpu_new, &mc_a ); + x264_mc_init( 0, &mc_c, 0 ); + x264_mc_init( cpu_ref, &mc_ref, 0 ); + x264_mc_init( cpu_new, &mc_a, 0 ); x264_pixel_init( 0, &pixf ); #define MC_TEST_LUMA( w, h ) \ @@ -1227,8 +1276,12 @@ fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \ break; \ } \ - call_c2( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \ - call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \ + /* omit unlikely high scales for benchmarking */ \ + if( (s << (8-d)) < 512 ) \ + { \ + call_c2( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \ + call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \ + } \ } \ } @@ -1437,23 +1490,24 @@ pixel *dsta[4] = { pbuf4, pbuf4+1024, pbuf4+2048, pbuf4+3072 }; set_func_name( "lowres_init" ); ok = 1; used_asm = 1; - for( int w = 40; w <= 48; w += 8 ) + for( int w = 96; w <= 96+24; w += 8 ) { - intptr_t stride = (w+8)&~15; - call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], (intptr_t)w*2, stride, w, 16 ); - call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], (intptr_t)w*2, stride, w, 16 ); - for( int i = 0; i < 16; i++ ) + intptr_t stride = (w*2+31)&~31; + intptr_t stride_lowres = (w+31)&~31; + call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], stride, stride_lowres, w, 8 ); + call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], stride, stride_lowres, w, 8 
); + for( int i = 0; i < 8; i++ ) { for( int j = 0; j < 4; j++ ) - if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w * sizeof(pixel) ) ) + if( memcmp( dstc[j]+i*stride_lowres, dsta[j]+i*stride_lowres, w * sizeof(pixel) ) ) { ok = 0; fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i ); for( int k = 0; k < w; k++ ) - printf( "%d ", dstc[j][k+i*stride] ); + printf( "%d ", dstc[j][k+i*stride_lowres] ); printf( "\n" ); for( int k = 0; k < w; k++ ) - printf( "%d ", dsta[j][k+i*stride] ); + printf( "%d ", dsta[j][k+i*stride_lowres] ); printf( "\n" ); break; } @@ -1465,7 +1519,7 @@ #define INTEGRAL_INIT( name, size, ... )\ if( mc_a.name != mc_ref.name )\ {\ - intptr_t stride = 80;\ + intptr_t stride = 96;\ set_func_name( #name );\ used_asm = 1;\ memcpy( buf3, buf1, size*2*stride );\ @@ -1637,8 +1691,8 @@ ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] ); ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] ); ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] ); - ALIGNED_ARRAY_16( uint8_t, bs, [2],[2][8][4] ); - memset( bs, 99, sizeof(bs) ); + ALIGNED_ARRAY_N( uint8_t, bs, [2],[2][8][4] ); + memset( bs, 99, sizeof(uint8_t)*2*4*8*2 ); for( int j = 0; j < X264_SCAN8_SIZE; j++ ) nnz[j] = ((rand()&7) == 7) * rand() & 0xf; for( int j = 0; j < 2; j++ ) @@ -1651,7 +1705,7 @@ set_func_name( "deblock_strength" ); call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) ); call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) ); - if( memcmp( bs[0], bs[1], sizeof(bs[0]) ) ) + if( memcmp( bs[0], bs[1], sizeof(uint8_t)*2*4*8 ) ) { ok = 0; fprintf( stderr, "deblock_strength: [FAILED]\n" ); @@ -1681,11 +1735,11 @@ x264_quant_function_t qf_c; x264_quant_function_t qf_ref; x264_quant_function_t qf_a; - ALIGNED_16( dctcoef dct1[64] ); - ALIGNED_16( dctcoef dct2[64] ); - ALIGNED_16( dctcoef dct3[8][16] ); - ALIGNED_16( dctcoef dct4[8][16] ); - ALIGNED_16( uint8_t cqm_buf[64] ); + ALIGNED_ARRAY_N( dctcoef, dct1,[64] ); + ALIGNED_ARRAY_N( dctcoef, dct2,[64] ); + ALIGNED_ARRAY_N( dctcoef, dct3,[8],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4,[8],[16] ); + ALIGNED_ARRAY_N( uint8_t, cqm_buf,[64] ); int ret = 0, ok, used_asm; int oks[3] = {1,1,1}, used_asms[3] = {0,0,0}; x264_t h_buf; @@ -1731,23 +1785,23 @@ x264_quant_init( h, cpu_ref, &qf_ref ); x264_quant_init( h, cpu_new, &qf_a ); -#define INIT_QUANT8(j) \ +#define INIT_QUANT8(j,max) \ { \ static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \ - for( int i = 0; i < 64; i++ ) \ + for( int i = 0; i < max; i++ ) \ { \ - unsigned int scale = (255*scale1d[i>>3]*scale1d[i&7])/16; \ - dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \ + unsigned int scale = (255*scale1d[(i>>3)&7]*scale1d[i&7])/16; \ + dct1[i] = dct2[i] = (j>>(i>>6))&1 ? (rand()%(2*scale+1))-scale : 0; \ } \ } -#define INIT_QUANT4(j) \ +#define INIT_QUANT4(j,max) \ { \ static const int scale1d[4] = {4,6,4,6}; \ - for( int i = 0; i < 16; i++ ) \ + for( int i = 0; i < max; i++ ) \ { \ - unsigned int scale = 255*scale1d[i>>2]*scale1d[i&3]; \ - dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \ + unsigned int scale = 255*scale1d[(i>>2)&3]*scale1d[i&3]; \ + dct1[i] = dct2[i] = (j>>(i>>4))&1 ? 
(rand()%(2*scale+1))-scale : 0; \ } \ } @@ -1777,34 +1831,36 @@ } \ } -#define TEST_QUANT( qname, block, w ) \ +#define TEST_QUANT( qname, block, type, w, maxj ) \ if( qf_a.qname != qf_ref.qname ) \ { \ set_func_name( #qname ); \ used_asms[0] = 1; \ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ - for( int j = 0; j < 2; j++ ) \ + for( int j = 0; j < maxj; j++ ) \ { \ - INIT_QUANT##w(j) \ - int result_c = call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - int result_a = call_a1( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + INIT_QUANT##type(j, w*w) \ + int result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \ + int result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \ if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) || result_c != result_a ) \ { \ oks[0] = 0; \ fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ break; \ } \ - call_c2( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - call_a2( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + call_c2( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \ + call_a2( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \ } \ } \ } - TEST_QUANT( quant_8x8, CQM_8IY, 8 ); - TEST_QUANT( quant_8x8, CQM_8PY, 8 ); - TEST_QUANT( quant_4x4, CQM_4IY, 4 ); - TEST_QUANT( quant_4x4, CQM_4PY, 4 ); + TEST_QUANT( quant_8x8, CQM_8IY, 8, 8, 2 ); + TEST_QUANT( quant_8x8, CQM_8PY, 8, 8, 2 ); + TEST_QUANT( quant_4x4, CQM_4IY, 4, 4, 2 ); + TEST_QUANT( quant_4x4, CQM_4PY, 4, 4, 2 ); + TEST_QUANT( quant_4x4x4, CQM_4IY, 4, 8, 16 ); + TEST_QUANT( quant_4x4x4, CQM_4PY, 4, 8, 16 ); TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] ); TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] ); @@ -1815,7 +1871,7 @@ used_asms[1] = 1; \ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ - INIT_QUANT##w(1) \ + INIT_QUANT##w(1, w*w) \ qf_c.qname( dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ @@ -2092,7 +2148,7 @@ int ret = 0, ok = 1, used_asm = 0; ALIGNED_ARRAY_32( pixel, edge,[36] ); ALIGNED_ARRAY_32( pixel, edge2,[36] ); - ALIGNED_16( pixel fdec[FDEC_STRIDE*20] ); + ALIGNED_ARRAY_32( pixel, fdec,[FDEC_STRIDE*20] ); struct { x264_predict_t predict_16x16[4+3]; @@ -2263,13 +2319,99 @@ #define run_cabac_terminal_asm run_cabac_terminal_c #endif +extern const uint8_t x264_count_cat_m1[14]; +void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); +void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); +void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); + static int check_cabac( int cpu_ref, int cpu_new ) { - int ret = 0, ok, used_asm = 1; + int ret = 0, ok = 1, used_asm = 0; x264_t h; h.sps->i_chroma_format_idc = 3; + + x264_bitstream_function_t bs_ref; + x264_bitstream_function_t bs_a; + x264_bitstream_init( cpu_ref, &bs_ref ); + x264_bitstream_init( cpu_new, &bs_a ); + x264_quant_init( &h, cpu_new, &h.quantf ); + h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4; + +#define 
CABAC_RESIDUAL(name, start, end, rd)\ +{\ + if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\ + {\ + used_asm = 1;\ + set_func_name( #name );\ + for( int i = 0; i < 2; i++ )\ + {\ + for( intptr_t ctx_block_cat = start; ctx_block_cat <= end; ctx_block_cat++ )\ + {\ + for( int j = 0; j < 256; j++ )\ + {\ + ALIGNED_ARRAY_N( dctcoef, dct, [2],[64] );\ + uint8_t bitstream[2][1<<16];\ + static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\ + int ac = ctx_ac[ctx_block_cat];\ + int nz = 0;\ + while( !nz )\ + {\ + for( int k = 0; k <= x264_count_cat_m1[ctx_block_cat]; k++ )\ + {\ + /* Very rough distribution that covers possible inputs */\ + int rnd = rand();\ + int coef = !(rnd&3);\ + coef += !(rnd& 15) * (rand()&0x0006);\ + coef += !(rnd& 63) * (rand()&0x0008);\ + coef += !(rnd& 255) * (rand()&0x00F0);\ + coef += !(rnd&1023) * (rand()&0x7F00);\ + nz |= dct[0][ac+k] = dct[1][ac+k] = coef * ((rand()&1) ? 1 : -1);\ + }\ + }\ + h.mb.b_interlaced = i;\ + x264_cabac_t cb[2];\ + x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\ + x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\ + x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\ + x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\ + cb[0].f8_bits_encoded = 0;\ + cb[1].f8_bits_encoded = 0;\ + if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\ + call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\ + call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\ + ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\ + if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\ + if( !ok )\ + {\ + fprintf( stderr, #name " : [FAILED] ctx_block_cat %d", (int)ctx_block_cat );\ + if( rd && cb[0].f8_bits_encoded != cb[1].f8_bits_encoded )\ + fprintf( stderr, " (%d != %d)", cb[0].f8_bits_encoded, cb[1].f8_bits_encoded );\ + fprintf( stderr, "\n");\ + goto name##fail;\ + }\ + if( (j&15) == 0 )\ + {\ + call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\ + call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\ + }\ + }\ + }\ + }\ + }\ +}\ +name##fail: + + CABAC_RESIDUAL( cabac_block_residual, 0, DCT_LUMA_8x8, 0 ) + report( "cabac residual:" ); + + ok = 1; used_asm = 0; + CABAC_RESIDUAL( cabac_block_residual_rd, 0, DCT_LUMA_8x8-1, 1 ) + CABAC_RESIDUAL( cabac_block_residual_8x8_rd, DCT_LUMA_8x8, DCT_LUMA_8x8, 1 ) + report( "cabac residual rd:" ); + if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm ) - return 0; + return ret; + ok = 1; used_asm = 0; x264_cabac_init( &h ); set_func_name( "cabac_encode_decision" ); @@ -2394,18 +2536,18 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; } + if( x264_cpu_detect() & X264_CPU_SSE ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" ); if( x264_cpu_detect() & X264_CPU_SSE2 ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" ); cpu1 &= ~X264_CPU_CACHELINE_64; - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" ); - cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST; + ret |= 
add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" ); + cpu1 &= ~X264_CPU_SLOW_SHUFFLE; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSE2 SlowAtom" ); - cpu1 &= ~X264_CPU_SLOW_ATOM; } if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN ) { @@ -2427,15 +2569,17 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" ); cpu1 &= ~X264_CPU_CACHELINE_64; - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" ); - cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST; + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" ); + cpu1 &= ~X264_CPU_SLOW_SHUFFLE; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" ); + cpu1 &= ~X264_CPU_CACHELINE_64; cpu1 &= ~X264_CPU_SLOW_ATOM; } if( x264_cpu_detect() & X264_CPU_SSE4 ) - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4 | X264_CPU_SHUFFLE_IS_FAST, "SSE4" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" ); if( x264_cpu_detect() & X264_CPU_AVX ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" ); if( x264_cpu_detect() & X264_CPU_XOP ) @@ -2448,20 +2592,22 @@ if( x264_cpu_detect() & X264_CPU_BMI1 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); - if( x264_cpu_detect() & X264_CPU_TBM ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_TBM, "TBM" ); - cpu1 &= ~X264_CPU_TBM; - } - if( x264_cpu_detect() & X264_CPU_BMI2 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" ); - cpu1 &= ~X264_CPU_BMI2; - } cpu1 &= ~X264_CPU_BMI1; } if( x264_cpu_detect() & X264_CPU_AVX2 ) + { ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" ); + if( x264_cpu_detect() & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } + } + if( x264_cpu_detect() & X264_CPU_BMI2 ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" ); + cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2); + } if( x264_cpu_detect() & X264_CPU_FMA3 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); @@ -2508,8 +2654,8 @@ fprintf( stderr, "x264: using random seed %u\n", seed ); srand( seed ); - buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 16*BENCH_ALIGNS ); - pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 16*BENCH_ALIGNS ); + buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 32*BENCH_ALIGNS ); + pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 32*BENCH_ALIGNS ); if( !buf1 || !pbuf1 ) { fprintf( stderr, "malloc failed, unable to initiate tests!\n" ); @@ -2530,19 +2676,19 @@ } memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) ); - /* 16-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */ + /* 32-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */ if( do_bench ) for( int i = 0; i < BENCH_ALIGNS && !ret; i++ ) { INIT_POINTER_OFFSETS; - ret |= x264_stack_pagealign( check_all_flags, i*16 ); - buf1 += 16; - pbuf1 += 16; + ret |= x264_stack_pagealign( check_all_flags, i*32 ); + buf1 += 32; + pbuf1 += 32; quiet = 1; fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS ); } else - ret = check_all_flags(); + ret = x264_stack_pagealign( check_all_flags, 0 ); if( ret ) {
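The testsuite hunks above widen the guaranteed buffer alignment from 16 to 32 bytes and step the benchmark pointers in 32-byte increments, so AVX2 code paths are also timed at offsets that differ modulo 64 bytes and the cacheline. A small stand-alone sketch of that stepping idea in plain C; kernel(), the buffer sizes and the BENCH_ALIGNS value here are placeholders, not x264 code:

    #include <stdint.h>
    #include <stdlib.h>

    /* Placeholder for the routine being benchmarked; not an x264 function. */
    static void kernel( uint8_t *p, size_t n )
    {
        for( size_t i = 0; i < n; i++ )
            p[i] ^= 0x55;
    }

    int main( void )
    {
        enum { BENCH_ALIGNS = 16, BUF = 4096 };
        /* 32-byte aligned base, padded so every offset below stays in bounds
         * (aligned_alloc needs a size that is a multiple of the alignment). */
        uint8_t *base = aligned_alloc( 32, BUF + 32 * BENCH_ALIGNS );
        if( !base )
            return 1;
        /* Stepping by 32 keeps the pointer suitable for 256-bit loads while
         * still varying its position modulo 64 bytes, mirroring the i*32
         * offsets applied to buf1/pbuf1 in the hunk above. */
        for( int i = 0; i < BENCH_ALIGNS; i++ )
            kernel( base + 32 * i, BUF );
        free( base );
        return 0;
    }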
View file
x264-snapshot-20130723-2245.tar.bz2/tools/cltostr.pl
Added
@@ -0,0 +1,65 @@ +# Perl script used for compiling OpenCL src into x264 binary +# +# Copyright (C) 2013 x264 project +# Authors: Steve Borho <sborho@multicorewareinc.com> + +use Digest::MD5 qw(md5_hex); + +# xxd takes a VAR, which will be the variable name +# and BYTES, a string of bytes to beencoded. +sub xxd +{ + my %args = @_; + my $var = $args{VAR}; + my $bytes = $args{BYTES}; + my @hexbytes; + my @bytes = split //, $$bytes; + foreach $b (@bytes) + { + push @hexbytes, sprintf("0x%02X", ord($b)); + } + + # Format 'em nice and pretty-like. + print 'static const char ' . $var . '[] = {' . "\n"; + my $count = 0; + foreach my $h (@hexbytes) + { + print "$h, "; + $count++; + if ($count == 16) + { + print "\n"; + $count = 0; + } + } + print "\n0x00 };\n\n"; + + return; +} + +if (@ARGV < 1) +{ + printf "%s: VARNAME ", $0 . "\n"; + exit(-1); +} + + +my @lines; +while(<STDIN>) +{ + s/^\s+//; # trim leading whitespace + if (/^\/\//) + { + next; # skip the line if it starts with '//' + } + push @lines, $_; +} + +my $lines = join '', @lines; +xxd(VAR => @ARGV[0], BYTES => \$lines); + +my $hash = md5_hex($lines); +@hash = ( $hash =~ m/../g ); + + +xxd(VAR => @ARGV[0] . "_hash", BYTES => \$hash);
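The added cltostr.pl reads OpenCL kernel source on stdin, emits it as a NUL-terminated C byte array under the variable name given on the command line, and appends a second array holding the ASCII MD5 digest of that source, so the kernels can be embedded in the binary and, presumably, a stale compiled-kernel cache (see --opencl-clbin below) can be detected. A rough illustration of the emitted shape and of the fact that it compiles as ordinary C; the identifiers, the placeholder bytes and the oclobj.h file name are assumptions, not taken from the tree:

    #include <stdio.h>

    /* Illustrative shape of a header generated with something like:
     *   perl tools/cltostr.pl x264_opencl_source < kernels.cl > oclobj.h
     * The bytes below spell "kernel src"; real output holds the whole .cl file. */
    static const char x264_opencl_source[] = {
        0x6B, 0x65, 0x72, 0x6E, 0x65, 0x6C, 0x20, 0x73, 0x72, 0x63,
        0x00 };

    /* ASCII hex digits of the MD5 digest of the source (placeholder digits here). */
    static const char x264_opencl_source_hash[] = {
        0x64, 0x34, 0x31, 0x64, 0x38, 0x63, 0x64, 0x39,
        0x00 };

    int main( void )
    {
        /* The trailing 0x00 appended by the script lets the array be used as a
         * plain C string, e.g. handed to clCreateProgramWithSource() later on. */
        printf( "%zu bytes of embedded kernel source\n",
                sizeof(x264_opencl_source) - 1 );
        printf( "source hash: %s\n", x264_opencl_source_hash );
        return 0;
    }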
View file
x264-snapshot-20130224-2245.tar.bz2/x264.c -> x264-snapshot-20130723-2245.tar.bz2/x264.c
Changed
@@ -225,7 +225,7 @@ va_end( arg ); } -static void print_version_info() +static void print_version_info( void ) { #ifdef X264_POINTVER printf( "x264 "X264_POINTVER"\n" ); @@ -596,8 +596,11 @@ H2( " --slices <integer> Number of slices per frame; forces rectangular\n" " slices and is overridden by other slicing options\n" ); else H1( " --slices <integer> Number of slices per frame\n" ); + H2( " --slices-max <integer> Absolute maximum slices per frame; overrides\n" + " slice-max-size/slice-max-mbs when necessary\n" ); H2( " --slice-max-size <integer> Limit the size of each slice in bytes\n"); - H2( " --slice-max-mbs <integer> Limit the size of each slice in macroblocks\n"); + H2( " --slice-max-mbs <integer> Limit the size of each slice in macroblocks (max)\n"); + H2( " --slice-min-mbs <integer> Limit the size of each slice in macroblocks (min)\n"); H0( " --tff Enable interlaced mode (top field first)\n" ); H0( " --bff Enable interlaced mode (bottom field first)\n" ); H2( " --constrained-intra Enable constrained intra prediction.\n" ); @@ -743,16 +746,18 @@ H2( " --range <string> Specify color range [\"%s\"]\n" " - %s\n", range_names[0], stringify_names( buf, range_names ) ); H2( " --colorprim <string> Specify color primaries [\"%s\"]\n" - " - undef, bt709, bt470m, bt470bg\n" - " smpte170m, smpte240m, film\n", + " - undef, bt709, bt470m, bt470bg, smpte170m,\n" + " smpte240m, film, bt2020\n", strtable_lookup( x264_colorprim_names, defaults->vui.i_colorprim ) ); H2( " --transfer <string> Specify transfer characteristics [\"%s\"]\n" - " - undef, bt709, bt470m, bt470bg, linear,\n" - " log100, log316, smpte170m, smpte240m\n", + " - undef, bt709, bt470m, bt470bg, smpte170m,\n" + " smpte240m, linear, log100, log316,\n" + " iec61966-2-4, bt1361e, iec61966-2-1,\n" + " bt2020-10, bt2020-12\n", strtable_lookup( x264_transfer_names, defaults->vui.i_transfer ) ); H2( " --colormatrix <string> Specify color matrix setting [\"%s\"]\n" - " - undef, bt709, fcc, bt470bg\n" - " smpte170m, smpte240m, GBR, YCgCo\n", + " - undef, bt709, fcc, bt470bg, smpte170m,\n" + " smpte240m, GBR, YCgCo, bt2020nc, bt2020c\n", strtable_lookup( x264_colmatrix_names, defaults->vui.i_colmatrix ) ); H2( " --chromaloc <integer> Specify chroma sample location (0 to 5) [%d]\n", defaults->vui.i_chroma_loc ); @@ -787,6 +792,8 @@ H0( " --frames <integer> Maximum number of frames to encode\n" ); H0( " --level <string> Specify level (as defined by Annex A)\n" ); H1( " --bluray-compat Enable compatibility hacks for Blu-ray support\n" ); + H1( " --stitchable Don't optimize headers based on video content\n" + " Ensures ability to recombine a segmented encode\n" ); H1( "\n" ); H1( " -v, --verbose Print stats for each frame\n" ); H1( " --no-progress Don't show the progress indicator while encoding\n" ); @@ -806,6 +813,9 @@ " as opposed to letting them select different algorithms\n" ); H2( " --asm <integer> Override CPU detection\n" ); H2( " --no-asm Disable all CPU optimizations\n" ); + H2( " --opencl Enable use of OpenCL\n" ); + H2( " --opencl-clbin <string> Specify path of compiled OpenCL kernel cache\n" ); + H2( " --opencl-device <integer> Specify OpenCL device ordinal\n" ); H2( " --visualize Show MB types overlayed on the encoded video\n" ); H2( " --dump-yuv <string> Save reconstructed frames\n" ); H2( " --sps-id <integer> Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id ); @@ -910,6 +920,9 @@ { "ref", required_argument, NULL, 'r' }, { "asm", required_argument, NULL, 0 }, { "no-asm", no_argument, NULL, 0 }, + { "opencl", no_argument, 
NULL, 1 }, + { "opencl-clbin",required_argument, NULL, 0 }, + { "opencl-device",required_argument, NULL, 0 }, { "sar", required_argument, NULL, 0 }, { "fps", required_argument, NULL, OPT_FPS }, { "frames", required_argument, NULL, OPT_FRAMES }, @@ -971,7 +984,9 @@ { "no-sliced-threads", no_argument, NULL, 0 }, { "slice-max-size", required_argument, NULL, 0 }, { "slice-max-mbs", required_argument, NULL, 0 }, + { "slice-min-mbs", required_argument, NULL, 0 }, { "slices", required_argument, NULL, 0 }, + { "slices-max", required_argument, NULL, 0 }, { "thread-input", no_argument, NULL, OPT_THREAD_INPUT }, { "sync-lookahead", required_argument, NULL, 0 }, { "non-deterministic", no_argument, NULL, 0 }, @@ -1025,6 +1040,7 @@ { "dts-compress", no_argument, NULL, OPT_DTS_COMPRESSION }, { "output-csp", required_argument, NULL, OPT_OUTPUT_CSP }, { "input-range", required_argument, NULL, OPT_INPUT_RANGE }, + { "stitchable", no_argument, NULL, 0 }, {0, 0, 0, 0} };
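Taken together, the x264.c hunk above wires up the new OpenCL, slicing, stitching and BT.2020 color options on the command line. An illustrative invocation, assuming the options behave as their help text describes (the file names are placeholders):

    x264 --opencl --stitchable --slices-max 8 --slice-max-mbs 250 --slice-min-mbs 25 --colorprim bt2020 -o segment.264 segment.y4m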
View file
x264-snapshot-20130224-2245.tar.bz2/x264.h -> x264-snapshot-20130723-2245.tar.bz2/x264.h
Changed
@@ -28,7 +28,7 @@ #ifndef X264_X264_H #define X264_X264_H -#if !defined(_STDINT_H) && !defined(_STDINT_H_) && \ +#if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) &&\ !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) # ifdef _MSC_VER # pragma message("You must include stdint.h or inttypes.h before x264.h") @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 129 +#define X264_BUILD 135 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -109,43 +109,53 @@ /**************************************************************************** * Encoder parameters ****************************************************************************/ -/* CPU flags - */ -#define X264_CPU_CACHELINE_32 0x0000001 /* avoid memory loads that span the border between two cachelines */ -#define X264_CPU_CACHELINE_64 0x0000002 /* 32/64 is the size of a cacheline in bytes */ -#define X264_CPU_ALTIVEC 0x0000004 -#define X264_CPU_MMX 0x0000008 -#define X264_CPU_MMX2 0x0000010 /* MMX2 aka MMXEXT aka ISSE */ +/* CPU flags */ + +/* x86 */ +#define X264_CPU_CMOV 0x0000001 +#define X264_CPU_MMX 0x0000002 +#define X264_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */ #define X264_CPU_MMXEXT X264_CPU_MMX2 -#define X264_CPU_SSE 0x0000020 -#define X264_CPU_SSE2 0x0000040 -#define X264_CPU_SSE2_IS_SLOW 0x0000080 /* avoid most SSE2 functions on Athlon64 */ -#define X264_CPU_SSE2_IS_FAST 0x0000100 /* a few functions are only faster on Core2 and Phenom */ -#define X264_CPU_SSE3 0x0000200 -#define X264_CPU_SSSE3 0x0000400 -#define X264_CPU_SHUFFLE_IS_FAST 0x0000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */ -#define X264_CPU_STACK_MOD4 0x0001000 /* if stack is only mod4 and not mod16 */ -#define X264_CPU_SSE4 0x0002000 /* SSE4.1 */ -#define X264_CPU_SSE42 0x0004000 /* SSE4.2 */ -#define X264_CPU_SSE_MISALIGN 0x0008000 /* Phenom support for misaligned SSE instruction arguments */ -#define X264_CPU_LZCNT 0x0010000 /* Phenom support for "leading zero count" instruction. */ -#define X264_CPU_ARMV6 0x0020000 -#define X264_CPU_NEON 0x0040000 /* ARM NEON */ -#define X264_CPU_FAST_NEON_MRC 0x0080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ -#define X264_CPU_SLOW_CTZ 0x0100000 /* BSR/BSF x86 instructions are really slow on some CPUs */ -#define X264_CPU_SLOW_ATOM 0x0200000 /* The Atom just sucks */ -#define X264_CPU_AVX 0x0400000 /* AVX support: requires OS support even if YMM registers - * aren't used. */ -#define X264_CPU_XOP 0x0800000 /* AMD XOP */ -#define X264_CPU_FMA4 0x1000000 /* AMD FMA4 */ -#define X264_CPU_AVX2 0x2000000 /* AVX2 */ -#define X264_CPU_FMA3 0x4000000 /* Intel FMA3 */ -#define X264_CPU_BMI1 0x8000000 /* BMI1 */ -#define X264_CPU_BMI2 0x10000000 /* BMI2 */ -#define X264_CPU_TBM 0x20000000 /* AMD TBM */ - -/* Analyse flags - */ +#define X264_CPU_SSE 0x0000008 +#define X264_CPU_SSE2 0x0000010 +#define X264_CPU_SSE3 0x0000020 +#define X264_CPU_SSSE3 0x0000040 +#define X264_CPU_SSE4 0x0000080 /* SSE4.1 */ +#define X264_CPU_SSE42 0x0000100 /* SSE4.2 */ +#define X264_CPU_SSE_MISALIGN 0x0000200 /* Phenom support for misaligned SSE instruction arguments */ +#define X264_CPU_LZCNT 0x0000400 /* Phenom support for "leading zero count" instruction. */ +#define X264_CPU_AVX 0x0000800 /* AVX support: requires OS support even if YMM registers aren't used. 
*/ +#define X264_CPU_XOP 0x0001000 /* AMD XOP */ +#define X264_CPU_FMA4 0x0002000 /* AMD FMA4 */ +#define X264_CPU_AVX2 0x0004000 /* AVX2 */ +#define X264_CPU_FMA3 0x0008000 /* Intel FMA3 */ +#define X264_CPU_BMI1 0x0010000 /* BMI1 */ +#define X264_CPU_BMI2 0x0020000 /* BMI2 */ +/* x86 modifiers */ +#define X264_CPU_CACHELINE_32 0x0040000 /* avoid memory loads that span the border between two cachelines */ +#define X264_CPU_CACHELINE_64 0x0080000 /* 32/64 is the size of a cacheline in bytes */ +#define X264_CPU_SSE2_IS_SLOW 0x0100000 /* avoid most SSE2 functions on Athlon64 */ +#define X264_CPU_SSE2_IS_FAST 0x0200000 /* a few functions are only faster on Core2 and Phenom */ +#define X264_CPU_SLOW_SHUFFLE 0x0400000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */ +#define X264_CPU_STACK_MOD4 0x0800000 /* if stack is only mod4 and not mod16 */ +#define X264_CPU_SLOW_CTZ 0x1000000 /* BSR/BSF x86 instructions are really slow on some CPUs */ +#define X264_CPU_SLOW_ATOM 0x2000000 /* The Atom is terrible: slow SSE unaligned loads, slow + * SIMD multiplies, slow SIMD variable shifts, slow pshufb, + * cacheline split penalties -- gather everything here that + * isn't shared by other CPUs to avoid making half a dozen + * new SLOW flags. */ +#define X264_CPU_SLOW_PSHUFB 0x4000000 /* such as on the Intel Atom */ +#define X264_CPU_SLOW_PALIGNR 0x8000000 /* such as on the AMD Bobcat */ + +/* PowerPC */ +#define X264_CPU_ALTIVEC 0x0000001 + +/* ARM */ +#define X264_CPU_ARMV6 0x0000001 +#define X264_CPU_NEON 0x0000002 /* ARM NEON */ +#define X264_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ + +/* Analyse flags */ #define X264_ANALYSE_I4x4 0x0001 /* Analyse i4x4 */ #define X264_ANALYSE_I8x8 0x0002 /* Analyse i8x8 (requires 8x8 transform) */ #define X264_ANALYSE_PSUB16x16 0x0010 /* Analyse p16x8, p8x16 and p8x8 */ @@ -188,9 +198,10 @@ static const char * const x264_overscan_names[] = { "undef", "show", "crop", 0 }; static const char * const x264_vidformat_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 }; static const char * const x264_fullrange_names[] = { "off", "on", 0 }; -static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", 0 }; -static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316", 0 }; -static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", 0 }; +static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", 0 }; +static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316", + "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", 0 }; +static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", "bt2020nc", "bt2020c", 0 }; static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 }; /* Colorspace type */ @@ -464,10 +475,23 @@ int b_fake_interlaced; + /* Don't optimize header parameters based on video content, e.g. ensure that splitting an input video, compressing + * each part, and stitching them back together will result in identical SPS/PPS. 
This is necessary for stitching + * with container formats that don't allow multiple SPS/PPS. */ + int b_stitchable; + + int b_opencl; /* use OpenCL when available */ + int i_opencl_device; /* specify count of GPU devices to skip, for CLI users */ + void *opencl_device_id; /* pass explicit cl_device_id as void*, for API users */ + char *psz_clbin_file; /* compiled OpenCL kernel cache file */ + /* Slicing parameters */ int i_slice_max_size; /* Max size per slice in bytes; includes estimated NAL overhead. */ int i_slice_max_mbs; /* Max number of MBs per slice; overrides i_slice_count. */ + int i_slice_min_mbs; /* Min number of MBs per slice */ int i_slice_count; /* Number of slices per frame: forces rectangular slices. */ + int i_slice_count_max; /* Absolute cap on slices per frame; stops applying slice-max-size + * and slice-max-mbs if this is reached. */ /* Optional callback for freeing this x264_param_t when it is done being used. * Only used when the x264_param_t sits in memory for an indefinite period of time, @@ -481,7 +505,7 @@ * is done encoding. * * This callback MUST do the following in order to work correctly: - * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 16. + * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 64. * 2) Call x264_nal_encode( h, dst, nal ), where dst is the output buffer. * After these steps, the content of nal is valid and can be used in the same way as if * the NAL unit were output by x264_encoder_encode. @@ -834,7 +858,13 @@ * due to delay, this may not be the next frame passed to encoder_encode. * if the change should apply to some particular frame, use x264_picture_t->param instead. * returns 0 on success, negative on parameter validation error. - * not all parameters can be changed; see the actual function for a detailed breakdown. */ + * not all parameters can be changed; see the actual function for a detailed breakdown. + * + * since not all parameters can be changed, moving from preset to preset may not always + * fully copy all relevant parameters, but should still work usably in practice. however, + * more so than for other presets, many of the speed shortcuts used in ultrafast cannot be + * switched out of; using reconfig to switch between ultrafast and other presets is not + * recommended without a more fine-grained breakdown of parameters to take this into account. */ int x264_encoder_reconfig( x264_t *, x264_param_t * ); /* x264_encoder_parameters: * copies the current internal set of parameters to the pointer provided
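On the library side the same features surface as new x264_param_t fields (b_opencl, i_opencl_device, psz_clbin_file, b_stitchable, i_slice_min_mbs, i_slice_count_max) together with the renumbered, per-architecture CPU flags, which is why X264_BUILD moves from 129 to 135 in the hunk above. A minimal sketch of driving the new fields through the public API; the preset, profile and numeric values are examples only, and error handling is trimmed:

    #include <stdint.h>
    #include <x264.h>

    /* Minimal sketch: enable OpenCL and the new slicing caps through the
     * fields added in this snapshot.  Frame input and output handling are
     * omitted; only the parameter setup is of interest here. */
    int open_encoder_example( int width, int height, x264_t **out )
    {
        x264_param_t param;
        if( x264_param_default_preset( &param, "medium", NULL ) < 0 )
            return -1;

        param.i_width  = width;
        param.i_height = height;

        param.b_opencl          = 1;   /* use OpenCL when available */
        param.i_opencl_device   = 0;   /* number of GPU devices to skip */
        param.b_stitchable      = 1;   /* keep SPS/PPS segment-independent */
        param.i_slice_max_mbs   = 250;
        param.i_slice_min_mbs   = 25;
        param.i_slice_count_max = 8;   /* absolute cap on slices per frame */

        if( x264_param_apply_profile( &param, "high" ) < 0 )
            return -1;

        *out = x264_encoder_open( &param );
        return *out ? 0 : -1;
    }

As the guard shown in the x264.h hunk indicates, stdint.h (or inttypes.h) still has to be included before x264.h.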