libx264
We truncated the diff of some files because they were too big.
Changes of Revision 4
libx264.changes
Changed
@@ -1,4 +1,9 @@
 -------------------------------------------------------------------
+Wed Jul 24 14:11:22 UTC 2013 - i@margueirte.su
+
+- update version 20130723.
+
+-------------------------------------------------------------------
 Thu Mar 7 08:36:00 UTC+0800 2013 - marguerite@opensuse.org
 
 - fallback to 8-bit depth again.
libx264.spec
Changed
@@ -1,5 +1,6 @@
 # vim: set ts=4 sw=4 et:
 # Copyright (c) 2012 Pascal Bleser <pascal.bleser@opensuse.org>
+# COpyright (c) 2013 Marguerite Su <marguerite@opensuse.org>
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -10,20 +11,19 @@
 # license that conforms to the Open Source Definition (Version 1.9)
 # published by the Open Source Initiative.
 
-# Please submit bugfixes or comments via http://bugs.opensuse.org/
+# Please submit bugfixes or comments via http://bugs.links2linux.org/
 
 Name: libx264
-%define libname %{name}
-%define soname 129
-%define svn 20130224
+%define soname 135
+%define svn 20130723
 Version: 0.%{soname}svn%{svn}
 Release: 1
 License: GPL-2.0+
 Summary: A free h264/avc encoder - encoder binary
 Url: http://developers.videolan.org/x264.html
 Group: Productivity/Multimedia/Video/Editors and Convertors
-Source0: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2
-Patch0: x264-use-shared-library.patch
+Source: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2
+Patch: x264-use-shared-library.patch
 BuildRequires: nasm
 BuildRequires: pkg-config
 BuildRequires: yasm >= 1.2.0
@@ -59,11 +59,11 @@
 moment so please use mencoder or another tool that supports x264
 library for all other file types.
 
-%package -n %{libname}-%{soname}
+%package %{soname}
 Summary: A free h264/avc encoder - encoder binary
 Group: Productivity/Multimedia/Video/Editors and Convertors
 
-%description -n %{libname}-%{soname}
+%description %{soname}
 x264 is a free library for encoding next-generation H264/AVC video
 streams. The code is written from scratch by Laurent Aimar, Loren
 Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans
@@ -73,15 +73,14 @@
 development with libx264. This library is needed to build
 mplayer/mencoder with H264 encoding support.
 
-%package -n %{libname}-devel
+%package devel
 Summary: Libraries and include file for the %{name} encoder
 Group: Development/Libraries/C and C++
-Requires: %{buildrequires}
-Requires: %{libname}-%{soname} = %{version}
-Provides: %{name}-devel = %{version}
-Obsoletes: %{name}-devel < %{version}
+Requires: %{name}-%{soname} = %{version}
+Provides: x264-devel = %{version}
+Obsoletes: x264-devel < %{version}
 
-%description -n %{libname}-devel
+%description devel
 x264 is a free library for encoding next-generation H264/AVC video
 streams. The code is written from scratch by Laurent Aimar, Loren
 Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans
@@ -92,8 +91,8 @@
 mplayer/mencoder with H264 encoding support.
 
 %prep
-%setup -q -n "x264-snapshot-%{svn}-2245"
-%patch0 -p0
+%setup -q -n x264-snapshot-%{svn}-2245
+%patch -p1
 
 FAKE_BUILDDATE=$(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y')
 sed -i "s/__DATE__/\"$FAKE_BUILDDATE\"/" x264.c
 
@@ -104,29 +103,26 @@
 %install
 %makeinstall
 
-rm -f "%{buildroot}%{_libdir}/%{libname}.so"
-rm -f "%{buildroot}%{_libdir}/%{libname}.a"
-ln -s %{libname}.so.%{soname} "%{buildroot}%{_libdir}/%{libname}.so"
+rm -f %{buildroot}%{_libdir}/%{name}.so
+rm -f %{buildroot}%{_libdir}/%{name}.a
+ln -s %{name}.so.%{soname} %{buildroot}%{_libdir}/%{name}.so
 
-rm "%{buildroot}%{_bindir}"/*
+rm %{buildroot}%{_bindir}/*
 
-echo "%{libname}-%{soname}" > %{_sourcedir}/baselibs.conf
+echo "%{name}-%{soname}" > %{_sourcedir}/baselibs.conf
 
-%clean
-%{?buildroot:%__rm -rf "%{buildroot}"}
+%post -n %{name}-%{soname} -p /sbin/ldconfig
+%postun -n %{name}-%{soname} -p /sbin/ldconfig
 
-%post -n %{libname}-%{soname} -p /sbin/ldconfig
-%postun -n %{libname}-%{soname} -p /sbin/ldconfig
-
-%files -n %{libname}-%{soname}
+%files %{soname}
 %defattr(0644,root,root)
-%{_libdir}/%{libname}.so.%{soname}
+%{_libdir}/%{name}.so.%{soname}
 
-%files -n %{libname}-devel
+%files devel
 %defattr(0644,root,root)
 %{_includedir}/x264.h
 %{_includedir}/x264_config.h
 %{_libdir}/pkgconfig/x264.pc
-%{_libdir}/%{libname}.so
+%{_libdir}/%{name}.so
 
 %changelog
x264-use-shared-library.patch
Changed
@@ -1,21 +1,23 @@
---- Makefile.orig	2011-12-26 22:45:03.000000000 +0100
-+++ Makefile	2011-12-27 20:03:46.070404383 +0100
-@@ -152,6 +152,7 @@
+Index: x264-snapshot-20130723-2245/Makefile
+===================================================================
+--- x264-snapshot-20130723-2245.orig/Makefile
++++ x264-snapshot-20130723-2245/Makefile
+@@ -171,6 +171,7 @@ $(LIBX264): $(GENERATED) .depend $(OBJS)
 
- $(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
+ $(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
  	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 +	ln -s $(SONAME) libx264.so
 
  ifneq ($(EXE),)
  .PHONY: x264 checkasm
-@@ -159,8 +160,8 @@
+@@ -178,8 +179,8 @@ x264: x264$(EXE)
  checkasm: checkasm$(EXE)
  endif
 
--x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
+-x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
 -	$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
-+x264$(EXE): .depend $(OBJCLI) $(SONAME)
++x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(SONAME)
 +	$(LD)$@ $(OBJCLI) -L. -lx264 $(LDFLAGSCLI) $(LDFLAGS)
 
- checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+ checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
  	$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
x264-snapshot-20130224-2245.tar.bz2/.gitignore -> x264-snapshot-20130723-2245.tar.bz2/.gitignore
Changed
@@ -43,3 +43,5 @@
 .digress_x264
 dataDec.txt
 log.dec
+common/oclobj.h
+x264_lookahead.clbin
x264-snapshot-20130224-2245.tar.bz2/Makefile -> x264-snapshot-20130723-2245.tar.bz2/Makefile
Changed
@@ -8,6 +8,8 @@
 vpath %.asm $(SRCPATH)
 vpath %.rc $(SRCPATH)
 
+GENERATED =
+
 all: default
 default:
 
@@ -145,6 +147,13 @@
 endif
 endif
 
+ifeq ($(HAVE_OPENCL),yes)
+common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
+	cat $^ | perl $(SRCPATH)/tools/cltostr.pl x264_opencl_source > $@
+GENERATED += common/oclobj.h
+SRCS += common/opencl.c encoder/slicetype-cl.c
+endif
+
 OBJS += $(SRCS:%.c=%.o)
 OBJCLI += $(SRCCLI:%.c=%.o)
 OBJSO += $(SRCSO:%.c=%.o)
 
@@ -155,12 +164,12 @@
 lib-static: $(LIBX264)
 lib-shared: $(SONAME)
 
-$(LIBX264): .depend $(OBJS) $(OBJASM)
+$(LIBX264): $(GENERATED) .depend $(OBJS) $(OBJASM)
 	rm -f $(LIBX264)
 	$(AR)$@ $(OBJS) $(OBJASM)
 	$(if $(RANLIB), $(RANLIB) $@)
 
-$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
+$(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
 	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 
 ifneq ($(EXE),)
@@ -169,10 +178,10 @@
 checkasm: checkasm$(EXE)
 endif
 
-x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
+x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
 	$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
 
-checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
 	$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
 
 $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
 
@@ -231,7 +240,7 @@
 clean:
 	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
-	rm -f checkasm checkasm.exe $(OBJCHK)
+	rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
 	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
 
 distclean: clean
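Note on the HAVE_OPENCL rule above: tools/cltostr.pl serializes the concatenated .cl kernels into common/oclobj.h as an embedded source string named by its argument (x264_opencl_source), which common/opencl.c later compiles at runtime. Below is a minimal C sketch of that consumption path; it assumes only that the generated header exposes such a string, and the helper name compile_embedded plus the omitted context/device setup are hypothetical, not x264's actual code:

#include <CL/cl.h>

extern const char *x264_opencl_source;  /* assumed shape of what cltostr.pl emits */

/* Build the embedded kernel source for one device; returns NULL on failure.
 * The real encoder additionally consults a cached binary (see common/opencl.c below). */
static cl_program compile_embedded( cl_context ctx, cl_device_id dev )
{
    cl_int err;
    cl_program prog = clCreateProgramWithSource( ctx, 1, &x264_opencl_source, NULL, &err );
    if( err != CL_SUCCESS )
        return NULL;
    if( clBuildProgram( prog, 1, &dev, NULL, NULL, NULL ) != CL_SUCCESS )
    {
        clReleaseProgram( prog );  /* real code would also print the build log */
        return NULL;
    }
    return prog;
}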
x264-snapshot-20130224-2245.tar.bz2/common/arm/mc-a.S -> x264-snapshot-20130723-2245.tar.bz2/common/arm/mc-a.S
Changed
@@ -5,6 +5,7 @@
  *
  * Authors: David Conrad <lessen42@gmail.com>
  *          Mans Rullgard <mans@mansr.com>
+ *          Stefan Groenroos <stefan.gronroos@gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -813,54 +814,57 @@
 // void x264_mc_chroma_neon( uint8_t *dst, intptr_t i_dst_stride,
 //                           uint8_t *src, intptr_t i_src_stride,
 //                           int dx, int dy, int i_width, int i_height );
+
 function x264_mc_chroma_neon
-    push        {r4-r6, lr}
-    ldrd        r4, [sp, #16]
-    ldr         r6, [sp, #24]
+    push        {r4-r8, lr}
+    vpush       {d8-d11}
+    ldrd        r4, [sp, #56]
+    ldrd        r6, [sp, #64]
 
-    asr         lr, r5, #3
-    mul         lr, r3, lr
-    add         r2, r2, r4, asr #3
-    cmp         r6, #4
-    add         r2, r2, lr
+    asr         lr, r6, #3
+    mul         lr, r4, lr
+    add         r3, r3, r5, asr #2
+    cmp         r7, #4
 
-    and         r4, r4, #7
     and         r5, r5, #7
+    and         r6, r6, #7
+
+    add         r3, r3, lr
+    bic         r3, r3, #0x1
 
-    pld         [r2]
-    pld         [r2, r3]
+    pld         [r3]
+    pld         [r3, r4]
 
     bgt         mc_chroma_w8
     beq         mc_chroma_w4
 
-// calculate cA cB cC cD
-.macro CHROMA_MC_START r0 r1
-    muls        lr, r4, r5
-    rsb         r6, lr, r5, lsl #3
-    rsb         ip, lr, r4, lsl #3
-    sub         r4, lr, r4, lsl #3
-    sub         r4, r4, r5, lsl #3
-    add         r4, r4, #64
+.macro CHROMA_MC_START r00, r01, r10, r11
+    muls        lr, r5, r6
+    rsb         r7, lr, r6, lsl #3
+    rsb         ip, lr, r5, lsl #3
+    sub         r5, lr, r5, lsl #3
+    sub         r5, r5, r6, lsl #3
+    add         r5, r5, #64
 
     beq         2f
 
+    vld2.8      {\r00-\r01}, [r3], r4
-    add         r5, r2, r3
+    vdup.8      d0, r5
+    vdup.8      d1, ip
-    vdup.8      d0, r4
-    lsl         r3, r3, #1
-    vdup.8      d1, ip
-    vld1.64     {\r0}, [r2], r3
-    vdup.8      d2, r6
-    vld1.64     {\r1}, [r5], r3
-    vdup.8      d3, lr
-    ldr         r4, [sp, #28]
-
-    vext.8      d5, d4, d5, #1
-    vext.8      d7, d6, d7, #1
+    vdup.8      d2, r7
+    vld2.8      {\r10-\r11}, [r3], r4
+    vdup.8      d3, lr
+    ldr         r5, [sp, #72]
 .endm
 
 .macro CHROMA_MC width, align
 mc_chroma_w\width:
-    CHROMA_MC_START d4, d6
+    CHROMA_MC_START d4, d5, d8, d9
+    vext.8      d6, d4, d6, #1
+    vext.8      d7, d5, d7, #1
+    vext.8      d10, d8, d10, #1
+    vext.8      d11, d9, d11, #1
 
 // since the element size varies, there's a different index for the 2nd store
 .if \width == 4
 .set st2, 1
@@ -868,187 +872,292 @@
 .set st2, 2
 .endif
 
-    vtrn.32     d4, d5
-    vtrn.32     d6, d7
+    vtrn.32     d4, d6
+    vtrn.32     d5, d7
+    vtrn.32     d8, d10
+    vtrn.32     d9, d11
 
-    vtrn.32     d0, d1
-    vtrn.32     d2, d3
+    vtrn.32     d0, d1
+    vtrn.32     d2, d3
 
 1: // height loop, interpolate xy
-    pld         [r5]
+
     vmull.u8    q8, d4, d0
-    vmlal.u8    q8, d6, d2
-    vld1.64     {d4}, [r2], r3
-    vext.8      d5, d4, d5, #1
-    vtrn.32     d4, d5
-    vmull.u8    q9, d6, d0
-    vmlal.u8    q9, d4, d2
-    vld1.64     {d6}, [r5], r3
+    vmlal.u8    q8, d8, d2
+    vmull.u8    q9, d5, d0
+    vmlal.u8    q9, d9, d2
+
+    vld2.8      {d4-d5}, [r3], r4
+
+    vext.8      d6, d4, d6, #1
+    vext.8      d7, d5, d7, #1
+
     vadd.i16    d16, d16, d17
     vadd.i16    d17, d18, d19
+
+    vtrn.32     d4, d6
+    vtrn.32     d5, d7
+
+    vmull.u8    q10, d8, d0
+    vmlal.u8    q10, d4, d2
+    vmull.u8    q11, d9, d0
+    vmlal.u8    q11, d5, d2
+
+    vld2.8      {d8-d9}, [r3], r4
+
     vrshrn.u16  d16, q8, #6
-    subs        r4, r4, #2
-    pld         [r2]
-    vext.8      d7, d6, d7, #1
-    vtrn.32     d6, d7
-    vst1.\align {d16[0]},   [r0,:\align], r1
-    vst1.\align {d16[st2]}, [r0,:\align], r1
+
+    vext.8      d10, d8, d10, #1
+    vext.8      d11, d9, d11, #1
+
+    vadd.i16    d18, d20, d21
+    vadd.i16    d19, d22, d23
+
+    vtrn.32     d8, d10
+    vtrn.32     d9, d11
+
+    vrshrn.u16  d18, q9, #6
+
+    subs        r5, r5, #2
+
+    pld         [r3]
+    pld         [r3, r4]
+
+    vst1.\align {d16[0]},   [r0,:\align], r2
+    vst1.\align {d16[st2]}, [r1,:\align], r2
+    vst1.\align {d18[0]},   [r0,:\align], r2
+    vst1.\align {d18[st2]}, [r1,:\align], r2
     bgt         1b
 
-    pop         {r4-r6, pc}
+    vpop        {d8-d11}
+    pop         {r4-r8, pc}
 
 2: // dx or dy are 0
-    tst         r6, r6
-    add         ip, ip, r6
-    vdup.8      d0, r4
+    vdup.8      d0, r5
+    ldr         r5, [sp, #72]
     vdup.8      d1, ip
-    vtrn.32     d0, d1
-    ldr         r4, [sp, #28]
 
     beq         4f
 
-    vext.32     d1, d0, d1, #1
-    add         r5, r2, r3
-    lsl         r3, r3, #1
-    vld1.32     {d4[0]}, [r2], r3
-    vld1.32     {d4[1]}, [r5], r3
x264-snapshot-20130224-2245.tar.bz2/common/arm/mc-c.c -> x264-snapshot-20130723-2245.tar.bz2/common/arm/mc-c.c
Changed
@@ -238,7 +238,7 @@
         pf->offsetsub = x264_mc_offsetsub_wtab_neon;
         pf->weight_cache = x264_weight_cache_neon;
 
-//      pf->mc_chroma = x264_mc_chroma_neon;
+        pf->mc_chroma = x264_mc_chroma_neon;
 
         pf->mc_luma = mc_luma_neon;
         pf->get_ref = get_ref_neon;
         pf->hpel_filter = hpel_filter_neon;
x264-snapshot-20130224-2245.tar.bz2/common/arm/quant-a.S -> x264-snapshot-20130723-2245.tar.bz2/common/arm/quant-a.S
Changed
@@ -35,7 +35,7 @@
 
 .text
 
-.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no
+.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
    vadd.u16    q8, q8, \bias0
    vadd.u16    q9, q9, \bias1
 .ifc \load_mf, yes
@@ -55,7 +55,7 @@
    veor        q9, q9, q15
    vsub.s16    q8, q8, q14
    vsub.s16    q9, q9, q15
-   vorr        \bias0, q8, q9
+   vorr        \mask, q8, q9
    vst1.64     {d16-d19}, [r0,:128]!
 .endm
 
@@ -89,7 +89,7 @@
    vabs.s16    q9, q15
    vdup.16     q0, r2
    vdup.16     q2, r1
-   QUANT_TWO   q0, q0, d4, d5, d4, d5
+   QUANT_TWO   q0, q0, d4, d5, d4, d5, q0
    vorr        d0, d0, d1
    QUANT_END   d0
 .endfunc
@@ -101,11 +101,52 @@
    vabs.s16    q9, q15
    vld1.64     {d0-d3}, [r2,:128]
    vld1.64     {d4-d7}, [r1,:128]
-   QUANT_TWO   q0, q1, d4, d5, d6, d7
+   QUANT_TWO   q0, q1, d4, d5, d6, d7, q0
    vorr        d0, d0, d1
    QUANT_END   d0
 .endfunc
 
+// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4x4_neon
+   vpush       {d8-d15}
+   vld1.64     {d28-d31}, [r0,:128]
+   vabs.s16    q8, q14
+   vabs.s16    q9, q15
+   vld1.64     {d0-d3}, [r2,:128]
+   vld1.64     {d4-d7}, [r1,:128]
+   QUANT_TWO   q0, q1, d4, d5, d6, d7, q4
+   vld1.64     {d28-d31}, [r0,:128]
+   vabs.s16    q8, q14
+   vabs.s16    q9, q15
+   QUANT_TWO   q0, q1, d4, d5, d6, d7, q5
+   vld1.64     {d28-d31}, [r0,:128]
+   vabs.s16    q8, q14
+   vabs.s16    q9, q15
+   QUANT_TWO   q0, q1, d4, d5, d6, d7, q6
+   vld1.64     {d28-d31}, [r0,:128]
+   vabs.s16    q8, q14
+   vabs.s16    q9, q15
+   QUANT_TWO   q0, q1, d4, d5, d6, d7, q7
+   vorr        d8, d8, d9
+   vorr        d10, d10, d11
+   vorr        d12, d12, d13
+   vorr        d14, d14, d15
+   vmov        r0, r1, d8
+   vmov        r2, r3, d10
+   orrs        r0, r1
+   movne       r0, #1
+   orrs        r2, r3
+   orrne       r0, #2
+   vmov        r1, r2, d12
+   vmov        r3, ip, d14
+   orrs        r1, r2
+   orrne       r0, #4
+   orrs        r3, ip
+   orrne       r0, #8
+   vpop        {d8-d15}
+   bx          lr
+.endfunc
+
 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
 function x264_quant_8x8_neon
    vld1.64     {d28-d31}, [r0,:128]
@@ -113,13 +154,13 @@
    vabs.s16    q9, q15
    vld1.64     {d0-d3}, [r2,:128]!
    vld1.64     {d4-d7}, [r1,:128]!
-   QUANT_TWO   q0, q1, d4, d5, d6, d7
+   QUANT_TWO   q0, q1, d4, d5, d6, d7, q0
 .rept 3
    vld1.64     {d28-d31}, [r0,:128]
    vabs.s16    q8, q14
    vabs.s16    q9, q15
    vld1.64     {d2-d5}, [r2,:128]!
-   QUANT_TWO   q1, q2, d4, d5, d6, d7, yes
+   QUANT_TWO   q1, q2, d4, d5, d6, d7, q1, yes
    vorr        q0, q0, q1
 .endr
    vorr        d0, d0, d1
x264-snapshot-20130224-2245.tar.bz2/common/arm/quant.h -> x264-snapshot-20130723-2245.tar.bz2/common/arm/quant.h
Changed
@@ -31,6 +31,7 @@
 int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
 int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
 int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
 int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
 
 void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
x264-snapshot-20130224-2245.tar.bz2/common/bitstream.c -> x264-snapshot-20130723-2245.tar.bz2/common/bitstream.c
Changed
@@ -39,11 +39,20 @@
     return dst;
 }
 
-#if HAVE_MMX
 uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
 uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
-uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end );
-#endif
+uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end );
+void x264_cabac_block_residual_rd_internal_sse2       ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_rd_internal_ssse3      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_8x8_rd_internal_sse2       ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_8x8_rd_internal_ssse3      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_sse2      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_sse2_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 
 /****************************************************************************
  * x264_nal_encode:
@@ -88,13 +97,49 @@
 
 void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
 {
+    memset( pf, 0, sizeof(*pf) );
+
     pf->nal_escape = x264_nal_escape_c;
 #if HAVE_MMX
+#if ARCH_X86_64
+    pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2;
+    pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2;
+    pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2;
+#endif
+
     if( cpu&X264_CPU_MMX2 )
         pf->nal_escape = x264_nal_escape_mmx2;
-    if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
-        pf->nal_escape = x264_nal_escape_sse2;
-    if( cpu&X264_CPU_AVX )
-        pf->nal_escape = x264_nal_escape_avx;
+    if( cpu&X264_CPU_SSE2 )
+    {
+#if ARCH_X86_64
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2_lzcnt;
+            pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2_lzcnt;
+            pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt;
+        }
+#endif
+        if( cpu&X264_CPU_SSE2_IS_FAST )
+            pf->nal_escape = x264_nal_escape_sse2;
+    }
+#if ARCH_X86_64
+    if( cpu&X264_CPU_SSSE3 )
+    {
+        pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3;
+        pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3;
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3_lzcnt;
+            pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt;
+        }
+    }
+
+    if( cpu&X264_CPU_AVX2 )
+    {
+        pf->nal_escape = x264_nal_escape_avx2;
+        if( cpu&X264_CPU_BMI2 )
+            pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2;
+    }
+#endif
 #endif
 }
x264-snapshot-20130224-2245.tar.bz2/common/bitstream.h -> x264-snapshot-20130723-2245.tar.bz2/common/bitstream.h
Changed
@@ -55,9 +55,9 @@
 
 typedef struct
 {
-    int     last;
-    int     mask;
-    dctcoef level[16];
+    int32_t last;
+    int32_t mask;
+    ALIGNED_16( dctcoef level[18] );
 } x264_run_level_t;
 
 extern const vlc_t x264_coeff0_token[6];
@@ -69,6 +69,12 @@
 typedef struct
 {
     uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
+    void (*cabac_block_residual_internal)( dctcoef *l, int b_interlaced,
+                                           intptr_t ctx_block_cat, x264_cabac_t *cb );
+    void (*cabac_block_residual_rd_internal)( dctcoef *l, int b_interlaced,
+                                              intptr_t ctx_block_cat, x264_cabac_t *cb );
+    void (*cabac_block_residual_8x8_rd_internal)( dctcoef *l, int b_interlaced,
+                                                  intptr_t ctx_block_cat, x264_cabac_t *cb );
 } x264_bitstream_function_t;
 
 void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
x264-snapshot-20130224-2245.tar.bz2/common/common.c -> x264-snapshot-20130723-2245.tar.bz2/common/common.c
Changed
@@ -171,6 +171,10 @@
     param->b_pic_struct = 0;
     param->b_fake_interlaced = 0;
     param->i_frame_packing = -1;
+    param->b_opencl = 0;
+    param->i_opencl_device = 0;
+    param->opencl_device_id = NULL;
+    param->psz_clbin_file = NULL;
 }
 
 static int x264_param_apply_preset( x264_param_t *param, const char *preset )
@@ -563,6 +567,8 @@
 }
 
 #define atobool(str) ( name_was_bool = 1, x264_atobool( str, &b_error ) )
+#undef atoi
+#undef atof
 #define atoi(str) x264_atoi( str, &b_error )
 #define atof(str) x264_atof( str, &b_error )
 
@@ -620,10 +626,8 @@
                 b_error = 1;
             }
             free( buf );
-            if( p->cpu & X264_CPU_SSSE3 )
+            if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) )
                 p->cpu |= X264_CPU_SSE2_IS_FAST;
-            if( p->cpu & X264_CPU_SSE4 )
-                p->cpu |= X264_CPU_SHUFFLE_IS_FAST;
         }
     }
     OPT("threads")
@@ -778,8 +782,12 @@
         p->i_slice_max_size = atoi(value);
     OPT("slice-max-mbs")
         p->i_slice_max_mbs = atoi(value);
+    OPT("slice-min-mbs")
+        p->i_slice_min_mbs = atoi(value);
     OPT("slices")
         p->i_slice_count = atoi(value);
+    OPT("slices-max")
+        p->i_slice_count_max = atoi(value);
     OPT("cabac")
         p->b_cabac = atobool(value);
     OPT("cabac-idc")
@@ -1029,6 +1037,14 @@
         p->b_fake_interlaced = atobool(value);
     OPT("frame-packing")
         p->i_frame_packing = atoi(value);
+    OPT("stitchable")
+        p->b_stitchable = atobool(value);
+    OPT("opencl")
+        p->b_opencl = atobool( value );
+    OPT("opencl-clbin")
+        p->psz_clbin_file = strdup( value );
+    OPT("opencl-device")
+        p->i_opencl_device = atoi( value );
     else
         return X264_PARAM_BAD_NAME;
 #undef OPT
@@ -1166,17 +1182,14 @@
 void *x264_malloc( int i_size )
 {
     uint8_t *align_buf = NULL;
-#if SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64)
-    /* Mac OS X and Win x64 always returns 16 byte aligned memory */
-    align_buf = malloc( i_size );
-#elif HAVE_MALLOC_H
-    align_buf = memalign( 16, i_size );
+#if HAVE_MALLOC_H
+    align_buf = memalign( NATIVE_ALIGN, i_size );
 #else
-    uint8_t *buf = malloc( i_size + 15 + sizeof(void **) );
+    uint8_t *buf = malloc( i_size + (NATIVE_ALIGN-1) + sizeof(void **) );
     if( buf )
     {
-        align_buf = buf + 15 + sizeof(void **);
-        align_buf -= (intptr_t) align_buf & 15;
+        align_buf = buf + (NATIVE_ALIGN-1) + sizeof(void **);
+        align_buf -= (intptr_t) align_buf & (NATIVE_ALIGN-1);
         *( (void **) ( align_buf - sizeof(void **) ) ) = buf;
     }
 #endif
@@ -1192,7 +1205,7 @@
 {
     if( p )
     {
-#if HAVE_MALLOC_H || SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64)
+#if HAVE_MALLOC_H
         free( p );
 #else
         free( *( ( ( void **) p ) - 1 ) );
@@ -1281,6 +1294,8 @@
         s += sprintf( s, "bitdepth=%d ", BIT_DEPTH );
     }
 
+    if( p->b_opencl )
+        s += sprintf( s, "opencl=%d ", p->b_opencl );
     s += sprintf( s, "cabac=%d", p->b_cabac );
     s += sprintf( s, " ref=%d", p->i_frame_reference );
     s += sprintf( s, " deblock=%d:%d:%d", p->b_deblocking_filter,
@@ -1305,14 +1320,20 @@
         s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
     if( p->i_slice_count )
         s += sprintf( s, " slices=%d", p->i_slice_count );
+    if( p->i_slice_count_max )
+        s += sprintf( s, " slices_max=%d", p->i_slice_count_max );
     if( p->i_slice_max_size )
         s += sprintf( s, " slice_max_size=%d", p->i_slice_max_size );
     if( p->i_slice_max_mbs )
         s += sprintf( s, " slice_max_mbs=%d", p->i_slice_max_mbs );
+    if( p->i_slice_min_mbs )
+        s += sprintf( s, " slice_min_mbs=%d", p->i_slice_min_mbs );
     s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction );
     s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate );
     s += sprintf( s, " interlaced=%s", p->b_interlaced ? p->b_tff ? "tff" : "bff" : p->b_fake_interlaced ? "fake" : "0" );
     s += sprintf( s, " bluray_compat=%d", p->b_bluray_compat );
+    if( p->b_stitchable )
+        s += sprintf( s, " stitchable=%d", p->b_stitchable );
 
     s += sprintf( s, " constrained_intra=%d", p->b_constrained_intra );
x264-snapshot-20130224-2245.tar.bz2/common/common.h -> x264-snapshot-20130723-2245.tar.bz2/common/common.h
Changed
@@ -40,6 +40,7 @@
 #define IS_DISPOSABLE(type) ( type == X264_TYPE_B )
 #define FIX8(f) ((int)(f*(1<<8)+.5))
 #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
+#define ARRAY_ELEMS(a) ((sizeof(a))/(sizeof(a[0])))
 
 #define CHECKED_MALLOC( var, size )\
 do {\
@@ -53,6 +54,8 @@
     memset( var, 0, size );\
 } while( 0 )
 
+#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
+
 #define X264_BFRAME_MAX 16
 #define X264_REF_MAX 16
 #define X264_THREAD_MAX 128
@@ -202,6 +205,10 @@
 };
 
 #include "x264.h"
+#if HAVE_OPENCL
+#include "opencl.h"
+#endif
+#include "cabac.h"
 #include "bitstream.h"
 #include "set.h"
 #include "predict.h"
@@ -209,7 +216,6 @@
 #include "mc.h"
 #include "frame.h"
 #include "dct.h"
-#include "cabac.h"
 #include "quant.h"
 #include "cpu.h"
 #include "threadpool.h"
@@ -291,17 +297,6 @@
     return amvd0 + (amvd1<<8);
 }
 
-static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
-{
-    for( int i = 0; i < i_mvc; i++ )
-    {
-        int mx = (mvc[i][0] + 2) >> 2;
-        int my = (mvc[i][1] + 2) >> 2;
-        dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
-        dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
-    }
-}
-
 extern const uint8_t x264_exp2_lut[64];
 extern const float x264_log2_lut[128];
 extern const float x264_log2_lz_lut[32];
@@ -614,11 +609,11 @@
     /* Current MB DCT coeffs */
     struct
     {
-        ALIGNED_16( dctcoef luma16x16_dc[3][16] );
+        ALIGNED_N( dctcoef luma16x16_dc[3][16] );
         ALIGNED_16( dctcoef chroma_dc[2][8] );
         // FIXME share memory?
-        ALIGNED_16( dctcoef luma8x8[12][64] );
-        ALIGNED_16( dctcoef luma4x4[16*3][16] );
+        ALIGNED_N( dctcoef luma8x8[12][64] );
+        ALIGNED_N( dctcoef luma4x4[16*3][16] );
     } dct;
 
     /* MB table and cache for current frame/mb */
@@ -671,8 +666,7 @@
         int     mv_miny_spel_row[3];
         int     mv_maxy_spel_row[3];
         /* Fullpel MV range for motion search */
-        int     mv_min_fpel[2];
-        int     mv_max_fpel[2];
+        ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */
         int     mv_miny_fpel_row[3];
         int     mv_maxy_fpel_row[3];
@@ -758,7 +752,7 @@
 #define FENC_STRIDE 16
 #define FDEC_STRIDE 32
         ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] );
-        ALIGNED_16( pixel fdec_buf[52*FDEC_STRIDE] );
+        ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
 
         /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
         ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
@@ -775,8 +769,8 @@
         ALIGNED_16( dctcoef fenc_dct4[16][16] );
 
         /* Psy RD SATD/SA8D scores cache */
-        ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
-        ALIGNED_16( uint32_t fenc_satd_cache[32] );
+        ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
+        ALIGNED_N( uint32_t fenc_satd_cache[32] );
 
         /* pointer over mb of the frame to be compressed */
         pixel *p_fenc[3]; /* y,u,v */
@@ -910,8 +904,8 @@
     uint32_t (*nr_residual_sum)[64];
     uint32_t *nr_count;
 
-    ALIGNED_16( udctcoef nr_offset_denoise[4][64] );
-    ALIGNED_16( uint32_t nr_residual_sum_buf[2][4][64] );
+    ALIGNED_N( udctcoef nr_offset_denoise[4][64] );
+    ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] );
     uint32_t nr_count_buf[2][4];
 
     uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
@@ -947,11 +941,48 @@
     struct visualize_t *visualize;
 #endif
     x264_lookahead_t *lookahead;
+
+#if HAVE_OPENCL
+    x264_opencl_t opencl;
+#endif
 };
 
 // included at the end because it needs x264_t
 #include "macroblock.h"
 
+static int ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
+{
+    int cnt = 0;
+    for( int i = 0; i < i_mvc; i++ )
+    {
+        int mx = (mvc[i][0] + 2) >> 2;
+        int my = (mvc[i][1] + 2) >> 2;
+        uint32_t mv = pack16to32_mask(mx, my);
+        if( !mv || mv == pmv ) continue;
+        dst[cnt][0] = x264_clip3( mx, mv_limit[0][0], mv_limit[1][0] );
+        dst[cnt][1] = x264_clip3( my, mv_limit[0][1], mv_limit[1][1] );
+        cnt++;
+    }
+    return cnt;
+}
+
+static int ALWAYS_INLINE x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
+{
+    int cnt = 0;
+    int qpel_limit[4] = {mv_limit[0][0] << 2, mv_limit[0][1] << 2, mv_limit[1][0] << 2, mv_limit[1][1] << 2};
+    for( int i = 0; i < i_mvc; i++ )
+    {
+        uint32_t mv = M32( mvc[i] );
+        int mx = mvc[i][0];
+        int my = mvc[i][1];
+        if( !mv || mv == pmv ) continue;
+        dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] );
+        dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] );
+        cnt++;
+    }
+    return cnt;
+}
+
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/util.h"
 #endif
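Aside on the ALIGNED_N buffers above: they rely on NATIVE_ALIGN-byte storage, and the x264_malloc() change in the common.c diff provides the matching heap guarantee when memalign() is unavailable. Here is a standalone sketch of that fallback technique (over-allocate, round the pointer up to the boundary, and stash the pointer malloc() returned just below the aligned block); ALIGN stands in for x264's NATIVE_ALIGN:

#include <stdint.h>
#include <stdlib.h>

#define ALIGN 32  /* stand-in for NATIVE_ALIGN; 32 bytes once AVX2 is in play */

static void *aligned_malloc( size_t size )
{
    uint8_t *buf = malloc( size + (ALIGN-1) + sizeof(void **) );
    if( !buf )
        return NULL;
    uint8_t *p = buf + (ALIGN-1) + sizeof(void **);
    p -= (uintptr_t)p & (ALIGN-1);   /* round down to an ALIGN-byte boundary */
    ((void **)p)[-1] = buf;          /* remember what malloc() actually returned */
    return p;
}

static void aligned_free( void *p )
{
    if( p )
        free( ((void **)p)[-1] );
}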
x264-snapshot-20130224-2245.tar.bz2/common/cpu.c -> x264-snapshot-20130723-2245.tar.bz2/common/cpu.c
Changed
@@ -47,18 +47,19 @@
 
 const x264_cpu_name_t x264_cpu_names[] =
 {
-    {"Altivec", X264_CPU_ALTIVEC},
-//  {"MMX",     X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
-    {"MMX2",    X264_CPU_MMX|X264_CPU_MMX2},
-    {"MMXEXT",  X264_CPU_MMX|X264_CPU_MMX2},
-//  {"SSE",     X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE}, // there are no sse1 functions in x264
-#define SSE2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE|X264_CPU_SSE2
+#if HAVE_MMX
+//  {"MMX",     X264_CPU_MMX},  // we don't support asm on mmx1 cpus anymore
+//  {"CMOV",    X264_CPU_CMOV}, // we require this unconditionally, so don't print it
+#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
+    {"MMX2",        MMX2},
+    {"MMXEXT",      MMX2},
+    {"SSE",         MMX2|X264_CPU_SSE},
+#define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2
     {"SSE2Slow",    SSE2|X264_CPU_SSE2_IS_SLOW},
     {"SSE2",        SSE2},
     {"SSE2Fast",    SSE2|X264_CPU_SSE2_IS_FAST},
     {"SSE3",        SSE2|X264_CPU_SSE3},
     {"SSSE3",       SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
-    {"FastShuffle", SSE2|X264_CPU_SHUFFLE_IS_FAST},
     {"SSE4.1",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
     {"SSE4",        SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
     {"SSE4.2",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
@@ -70,19 +71,26 @@
     {"FMA3",        AVX|X264_CPU_FMA3},
 #undef AVX
 #undef SSE2
+#undef MMX2
     {"Cache32",         X264_CPU_CACHELINE_32},
     {"Cache64",         X264_CPU_CACHELINE_64},
     {"SSEMisalign",     X264_CPU_SSE_MISALIGN},
     {"LZCNT",           X264_CPU_LZCNT},
     {"BMI1",            X264_CPU_BMI1},
     {"BMI2",            X264_CPU_BMI1|X264_CPU_BMI2},
-    {"TBM",             X264_CPU_TBM},
-    {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
-    {"ARMv6",           X264_CPU_ARMV6},
-    {"NEON",            X264_CPU_NEON},
-    {"Fast_NEON_MRC",   X264_CPU_FAST_NEON_MRC},
     {"SlowCTZ",         X264_CPU_SLOW_CTZ},
     {"SlowAtom",        X264_CPU_SLOW_ATOM},
+    {"SlowPshufb",      X264_CPU_SLOW_PSHUFB},
+    {"SlowPalignr",     X264_CPU_SLOW_PALIGNR},
+    {"SlowShuffle",     X264_CPU_SLOW_SHUFFLE},
+    {"UnalignedStack",  X264_CPU_STACK_MOD4},
+#elif ARCH_PPC
+    {"Altivec",         X264_CPU_ALTIVEC},
+#elif ARCH_ARM
+    {"ARMv6",           X264_CPU_ARMV6},
+    {"NEON",            X264_CPU_NEON},
+    {"FastNeonMRC",     X264_CPU_FAST_NEON_MRC},
+#endif
     {"", 0},
 };
 
@@ -131,9 +139,13 @@
     if( edx&0x00800000 )
         cpu |= X264_CPU_MMX;
     else
-        return 0;
+        return cpu;
     if( edx&0x02000000 )
         cpu |= X264_CPU_MMX2|X264_CPU_SSE;
+    if( edx&0x00008000 )
+        cpu |= X264_CPU_CMOV;
+    else
+        return cpu;
     if( edx&0x04000000 )
         cpu |= X264_CPU_SSE2;
     if( ecx&0x00000001 )
@@ -170,46 +182,56 @@
 
     if( cpu & X264_CPU_SSSE3 )
         cpu |= X264_CPU_SSE2_IS_FAST;
-    if( cpu & X264_CPU_SSE4 )
-        cpu |= X264_CPU_SHUFFLE_IS_FAST;
 
     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
     max_extended_cap = eax;
 
-    if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
+    if( max_extended_cap >= 0x80000001 )
     {
-        cpu |= X264_CPU_SLOW_CTZ;
         x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
-        if( edx&0x00400000 )
-            cpu |= X264_CPU_MMX2;
-        if( cpu & X264_CPU_SSE2 )
+
+        if( ecx&0x00000020 )
+            cpu |= X264_CPU_LZCNT;             /* Supported by Intel chips starting with Haswell */
+        if( ecx&0x00000040 )                   /* SSE4a, AMD only */
         {
-            if( ecx&0x00000040 ) /* SSE4a */
+            int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+            cpu |= X264_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
+            if( family == 0x14 )
             {
-                cpu |= X264_CPU_SSE2_IS_FAST;
-                cpu |= X264_CPU_LZCNT;
-                cpu |= X264_CPU_SHUFFLE_IS_FAST;
-                cpu &= ~X264_CPU_SLOW_CTZ;
+                cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
+                cpu |= X264_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
+                cpu |= X264_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
             }
-            else
-                cpu |= X264_CPU_SSE2_IS_SLOW;
-
-            if( ecx&0x00000080 ) /* Misalign SSE */
+            if( family == 0x16 )
             {
-                cpu |= X264_CPU_SSE_MISALIGN;
-                x264_cpu_mask_misalign_sse();
+                cpu |= X264_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
+                                                * compared to alternate instruction sequences that this
+                                                * is equal or faster on almost all such functions. */
             }
+        }
 
-            if( cpu & X264_CPU_AVX )
-            {
-                if( ecx&0x00000800 ) /* XOP */
-                    cpu |= X264_CPU_XOP;
-                if( ecx&0x00010000 ) /* FMA4 */
-                    cpu |= X264_CPU_FMA4;
-            }
+        if( ecx&0x00000080 ) /* Misalign SSE */
+        {
+            cpu |= X264_CPU_SSE_MISALIGN;
+            x264_cpu_mask_misalign_sse();
+        }
 
-            if( ecx&0x00200000 )
-                cpu |= X264_CPU_TBM;
+        if( cpu & X264_CPU_AVX )
+        {
+            if( ecx&0x00000800 ) /* XOP */
+                cpu |= X264_CPU_XOP;
+            if( ecx&0x00010000 ) /* FMA4 */
+                cpu |= X264_CPU_FMA4;
+        }
+
+        if( !strcmp((char*)vendor, "AuthenticAMD") )
+        {
+            if( edx&0x00400000 )
+                cpu |= X264_CPU_MMX2;
+            if( !(cpu&X264_CPU_LZCNT) )
+                cpu |= X264_CPU_SLOW_CTZ;
+            if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
+                cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
         }
     }
 
@@ -233,11 +255,12 @@
         {
             cpu |= X264_CPU_SLOW_ATOM;
             cpu |= X264_CPU_SLOW_CTZ;
+            cpu |= X264_CPU_SLOW_PSHUFB;
         }
-        /* Some Penryns and Nehalems are pointlessly crippled (SSE4 disabled), so
-         * detect them here. */
-        else if( model >= 23 )
-            cpu |= X264_CPU_SHUFFLE_IS_FAST;
+        /* Conroe has a slow shuffle unit. Check the model number to make sure not
+         * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
+        else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 )
+            cpu |= X264_CPU_SLOW_SHUFFLE;
     }
 }
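For readers tracing the new bit tests above: extended leaf 0x80000001 reports LZCNT/ABM in ECX bit 5 and SSE4a (AMD only) in ECX bit 6, and the family number is the base family plus the extended family field. A small standalone sketch of the same probe, using GCC's <cpuid.h> helper in place of x264's own x264_cpu_cpuid():

#include <cpuid.h>
#include <stdio.h>

int main( void )
{
    unsigned eax, ebx, ecx, edx;
    /* __get_cpuid() verifies the extended leaf exists before reading it */
    if( __get_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ) )
    {
        if( ecx & (1u<<5) )
            puts( "LZCNT supported" );
        if( ecx & (1u<<6) )  /* SSE4a implies an AMD part */
            printf( "SSE4a, family 0x%x\n", ((eax>>8)&0xf) + ((eax>>20)&0xff) );
    }
    return 0;
}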
x264-snapshot-20130224-2245.tar.bz2/common/cpu.h -> x264-snapshot-20130723-2245.tar.bz2/common/cpu.h
Changed
@@ -48,15 +48,17 @@
 void x264_cpu_mask_misalign_sse( void );
 void x264_safe_intel_cpu_indicator_init( void );
 
-/* kluge:
+/* kludge:
  * gcc can't give variables any greater alignment than the stack frame has.
- * We need 16 byte alignment for SSE2, so here we make sure that the stack is
- * aligned to 16 bytes.
+ * We need 32 byte alignment for AVX2, so here we make sure that the stack is
+ * aligned to 32 bytes.
  * gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this
  * problem, but I don't want to require such a new version.
- * This applies only to x86_32, since other architectures that need alignment
- * either have ABIs that ensure aligned stack, or don't support it at all. */
-#if ARCH_X86 && HAVE_MMX
+ * aligning to 32 bytes only works if the compiler supports keeping that
+ * alignment between functions (osdep.h handles manual alignment of arrays
+ * if it doesn't).
+ */
+#if (ARCH_X86 || HAVE_32B_STACK_ALIGNMENT) && HAVE_MMX
 int x264_stack_align( void (*func)(), ... );
 #define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
 #else
x264-snapshot-20130224-2245.tar.bz2/common/dct.c -> x264-snapshot-20130723-2245.tar.bz2/common/dct.c
Changed
@@ -640,23 +640,32 @@
         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 
-        dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
-        dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
-        dctf->add8x8_idct   = x264_add8x8_idct_sse2;
-        dctf->add16x16_idct = x264_add16x16_idct_sse2;
-        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
+        if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
+        {
+            dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
+            dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
+            dctf->add8x8_idct   = x264_add8x8_idct_sse2;
+            dctf->add16x16_idct = x264_add16x16_idct_sse2;
+            dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
+        }
     }
 
-    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
+    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
     {
-        dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
-        dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
-        dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
-        dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
-        dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
         dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
-        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
-        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
+        if( !(cpu&X264_CPU_SLOW_ATOM) )
+        {
+            dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
+            dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
+            dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
+            dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
+            dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
+            if( !(cpu&X264_CPU_SLOW_PSHUFB) )
+            {
+                dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
+                dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
+            }
+        }
     }
 
     if( cpu&X264_CPU_SSE4 )
@@ -681,6 +690,18 @@
         dctf->sub8x8_dct       = x264_sub8x8_dct_xop;
         dctf->sub16x16_dct     = x264_sub16x16_dct_xop;
     }
+
+    if( cpu&X264_CPU_AVX2 )
+    {
+        dctf->add8x8_idct      = x264_add8x8_idct_avx2;
+        dctf->add16x16_idct    = x264_add16x16_idct_avx2;
+        dctf->sub8x8_dct       = x264_sub8x8_dct_avx2;
+        dctf->sub16x16_dct     = x264_sub16x16_dct_avx2;
+        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
+#if ARCH_X86_64
+        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx2;
+#endif
+    }
 #endif //HAVE_MMX
 
 #if HAVE_ALTIVEC
@@ -951,7 +972,7 @@
         pf_interlaced->sub_4x4ac  = x264_zigzag_sub_4x4ac_field_ssse3;
         pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_ssse3;
         pf_progressive->scan_8x8  = x264_zigzag_scan_8x8_frame_ssse3;
-        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+        if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
     }
     if( cpu&X264_CPU_AVX )
@@ -962,8 +983,7 @@
         pf_interlaced->sub_4x4ac  = x264_zigzag_sub_4x4ac_field_avx;
         pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_avx;
 #endif
-        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
-            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
+        pf_progressive->scan_4x4  = x264_zigzag_scan_4x4_frame_avx;
     }
     if( cpu&X264_CPU_XOP )
     {
@@ -1005,7 +1025,7 @@
         pf_interlaced->interleave_8x8_cavlc =
         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
     }
-    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+    if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
     {
         pf_interlaced->interleave_8x8_cavlc =
         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
@@ -1016,6 +1036,12 @@
         pf_interlaced->interleave_8x8_cavlc =
         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
     }
+
+    if( cpu&X264_CPU_AVX2 )
+    {
+        pf_interlaced->interleave_8x8_cavlc =
+        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
+    }
 #endif // HIGH_BIT_DEPTH
 #endif
 }
x264-snapshot-20130224-2245.tar.bz2/common/deblock.c -> x264-snapshot-20130723-2245.tar.bz2/common/deblock.c
Changed
@@ -686,6 +686,9 @@
 void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
+void x264_deblock_strength_avx2( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                 int mvy_limit, int bframe );
 
 void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
@@ -816,6 +819,10 @@
 #endif
             }
         }
+        if( cpu&X264_CPU_AVX2 )
+        {
+            pf->deblock_strength = x264_deblock_strength_avx2;
+        }
     }
 #endif
x264-snapshot-20130224-2245.tar.bz2/common/display-x11.c -> x264-snapshot-20130723-2245.tar.bz2/common/display-x11.c
Changed
@@ -49,7 +49,7 @@
     abort();
 }
 
-static void disp_init_display()
+static void disp_init_display( void )
 {
     Visual *visual;
     int dpy_class;
@@ -130,7 +130,7 @@
     XFree( shint );
 }
 
-void disp_sync()
+void disp_sync( void )
 {
     XSync( disp_display, 1 );
 }
x264-snapshot-20130224-2245.tar.bz2/common/frame.c -> x264-snapshot-20130723-2245.tar.bz2/common/frame.c
Changed
@@ -72,8 +72,18 @@
     int i_mb_count = h->mb.i_mb_count;
     int i_stride, i_width, i_lines, luma_plane_count;
     int i_padv = PADV << PARAM_INTERLACED;
-    int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
-    int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;
+    int align = 16;
+#if ARCH_X86 || ARCH_X86_64
+    if( h->param.cpu&X264_CPU_CACHELINE_64 )
+        align = 64;
+    else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX2 )
+        align = 32;
+#endif
+#if ARCH_PPC
+    int disalign = 1<<9;
+#else
+    int disalign = 1<<10;
+#endif
 
     CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
 
@@ -251,6 +261,10 @@
     if( x264_pthread_cond_init( &frame->cv, NULL ) )
         goto fail;
 
+#if HAVE_OPENCL
+    frame->opencl.ocl = h->opencl.ocl;
+#endif
+
     return frame;
 
 fail:
@@ -312,6 +326,9 @@
         }
         x264_pthread_mutex_destroy( &frame->mutex );
         x264_pthread_cond_destroy( &frame->cv );
+#if HAVE_OPENCL
+        x264_opencl_frame_delete( frame );
+#endif
     }
     x264_free( frame );
 }
@@ -655,6 +672,21 @@
     x264_pthread_mutex_unlock( &h->mutex );
 }
 
+int x264_frame_new_slice( x264_t *h, x264_frame_t *frame )
+{
+    if( h->param.i_slice_count_max )
+    {
+        int slice_count;
+        if( h->param.b_sliced_threads )
+            slice_count = x264_pthread_fetch_and_add( &frame->i_slice_count, 1, &frame->mutex );
+        else
+            slice_count = frame->i_slice_count++;
+        if( slice_count >= h->param.i_slice_count_max )
+            return -1;
+    }
+    return 0;
+}
+
 /* list operators */
 
 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
@@ -717,6 +749,7 @@
     frame->b_scenecut = 1;
     frame->b_keyframe = 0;
     frame->b_corrupt = 0;
+    frame->i_slice_count = h->param.b_sliced_threads ? h->param.i_threads : 1;
 
     memset( frame->weight, 0, sizeof(frame->weight) );
     memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
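The x264_frame_new_slice() addition above caps how many slices may be spawned per frame; with sliced threads the counter must be incremented atomically, which x264 wraps as x264_pthread_fetch_and_add(). A reduced sketch of the same guard, using C11 atomics in place of that wrapper (frame_new_slice here is an illustrative stand-in, not the encoder's function):

#include <stdatomic.h>

static atomic_int slice_count;

/* returns -1 once slice_count_max slices have been claimed (0 = no limit) */
static int frame_new_slice( int slice_count_max )
{
    if( slice_count_max && atomic_fetch_add( &slice_count, 1 ) >= slice_count_max )
        return -1;
    return 0;
}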
x264-snapshot-20130224-2245.tar.bz2/common/frame.h -> x264-snapshot-20130723-2245.tar.bz2/common/frame.h
Changed
@@ -152,6 +152,7 @@
     int     i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
     x264_pthread_mutex_t mutex;
     x264_pthread_cond_t  cv;
+    int     i_slice_count; /* Atomically written to/read from with slice threads */
 
     /* periodic intra refresh */
     float   f_pir_position;
@@ -171,6 +172,10 @@
     /* user frame properties */
     uint8_t *mb_info;
     void (*mb_info_free)( void* );
+
+#if HAVE_OPENCL
+    x264_frame_opencl_t opencl;
+#endif
 } x264_frame_t;
 
 /* synchronized frame list */
@@ -230,6 +235,7 @@
 void          x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed );
 void          x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed );
+int           x264_frame_new_slice( x264_t *h, x264_frame_t *frame );
 
 void          x264_threadslice_cond_broadcast( x264_t *h, int pass );
 void          x264_threadslice_cond_wait( x264_t *h, int pass );
x264-snapshot-20130224-2245.tar.bz2/common/macroblock.c -> x264-snapshot-20130723-2245.tar.bz2/common/macroblock.c
Changed
@@ -122,8 +122,8 @@
         int mvy1   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
         int i_mode = x264_size2pixel[height][width];
         intptr_t i_stride0 = 16, i_stride1 = 16;
-        ALIGNED_ARRAY_16( pixel, tmp0,[16*16] );
-        ALIGNED_ARRAY_16( pixel, tmp1,[16*16] );
+        ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
+        ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
         pixel *src0, *src1;
 
         MC_LUMA_BI( 0 );
@@ -387,7 +387,7 @@
     int scratch_size = 0;
     if( !b_lookahead )
     {
-        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
+        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48+32) * sizeof(int16_t);
         int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
         int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
         int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
x264-snapshot-20130224-2245.tar.bz2/common/mc.c -> x264-snapshot-20130723-2245.tar.bz2/common/mc.c
Changed
@@ -469,7 +469,7 @@
     }
 }
 
-void x264_mc_init( int cpu, x264_mc_functions_t *pf )
+void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
 {
     pf->mc_luma   = mc_luma;
     pf->get_ref   = get_ref;
@@ -534,6 +534,9 @@
 #if HAVE_ARMV6
     x264_mc_init_arm( cpu, pf );
 #endif
+
+    if( cpu_independent )
+        pf->mbtree_propagate_cost = mbtree_propagate_cost;
 }
 
 void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
x264-snapshot-20130224-2245.tar.bz2/common/mc.h -> x264-snapshot-20130723-2245.tar.bz2/common/mc.h
Changed
@@ -123,6 +123,6 @@
                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 } x264_mc_functions_t;
 
-void x264_mc_init( int cpu, x264_mc_functions_t *pf );
+void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
 
 #endif
x264-snapshot-20130723-2245.tar.bz2/common/opencl
Added
+(directory)
x264-snapshot-20130723-2245.tar.bz2/common/opencl.c
Added
@@ -0,0 +1,718 @@
+/*****************************************************************************
+ * opencl.c: OpenCL initialization and kernel compilation
+ *****************************************************************************
+ * Copyright (C) 2012-2013 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *          Anton Mitrofanov <BugMaster@narod.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#define ocl_open LoadLibrary( "OpenCL" )
+#define ocl_close FreeLibrary
+#define ocl_address GetProcAddress
+#else
+#include <dlfcn.h> //dlopen, dlsym, dlclose
+#if SYS_MACOSX
+#define ocl_open dlopen( "/System/Library/Frameworks/OpenCL.framework/OpenCL", RTLD_NOW )
+#else
+#define ocl_open dlopen( "libOpenCL.so", RTLD_NOW )
+#endif
+#define ocl_close dlclose
+#define ocl_address dlsym
+#endif
+
+#define LOAD_OCL_FUNC(name, continue_on_fail)\
+{\
+    ocl->name = (void*)ocl_address( ocl->library, #name );\
+    if( !continue_on_fail && !ocl->name )\
+        goto fail;\
+}
+
+/* load the library and functions we require from it */
+x264_opencl_function_t *x264_opencl_load_library( void )
+{
+    x264_opencl_function_t *ocl;
+#undef fail
+#define fail fail0
+    CHECKED_MALLOCZERO( ocl, sizeof(x264_opencl_function_t) );
+#undef fail
+#define fail fail1
+    ocl->library = ocl_open;
+    if( !ocl->library )
+        goto fail;
+#undef fail
+#define fail fail2
+    LOAD_OCL_FUNC( clBuildProgram, 0 );
+    LOAD_OCL_FUNC( clCreateBuffer, 0 );
+    LOAD_OCL_FUNC( clCreateCommandQueue, 0 );
+    LOAD_OCL_FUNC( clCreateContext, 0 );
+    LOAD_OCL_FUNC( clCreateImage2D, 0 );
+    LOAD_OCL_FUNC( clCreateKernel, 0 );
+    LOAD_OCL_FUNC( clCreateProgramWithBinary, 0 );
+    LOAD_OCL_FUNC( clCreateProgramWithSource, 0 );
+    LOAD_OCL_FUNC( clEnqueueCopyBuffer, 0 );
+    LOAD_OCL_FUNC( clEnqueueMapBuffer, 0 );
+    LOAD_OCL_FUNC( clEnqueueNDRangeKernel, 0 );
+    LOAD_OCL_FUNC( clEnqueueReadBuffer, 0 );
+    LOAD_OCL_FUNC( clEnqueueWriteBuffer, 0 );
+    LOAD_OCL_FUNC( clFinish, 0 );
+    LOAD_OCL_FUNC( clGetCommandQueueInfo, 0 );
+    LOAD_OCL_FUNC( clGetDeviceIDs, 0 );
+    LOAD_OCL_FUNC( clGetDeviceInfo, 0 );
+    LOAD_OCL_FUNC( clGetKernelWorkGroupInfo, 0 );
+    LOAD_OCL_FUNC( clGetPlatformIDs, 0 );
+    LOAD_OCL_FUNC( clGetProgramBuildInfo, 0 );
+    LOAD_OCL_FUNC( clGetProgramInfo, 0 );
+    LOAD_OCL_FUNC( clGetSupportedImageFormats, 0 );
+    LOAD_OCL_FUNC( clReleaseCommandQueue, 0 );
+    LOAD_OCL_FUNC( clReleaseContext, 0 );
+    LOAD_OCL_FUNC( clReleaseKernel, 0 );
+    LOAD_OCL_FUNC( clReleaseMemObject, 0 );
+    LOAD_OCL_FUNC( clReleaseProgram, 0 );
+    LOAD_OCL_FUNC( clSetKernelArg, 0 );
+    return ocl;
+#undef fail
+fail2:
+    ocl_close( ocl->library );
+fail1:
+    x264_free( ocl );
+fail0:
+    return NULL;
+}
+
+void x264_opencl_close_library( x264_opencl_function_t *ocl )
+{
+    if( !ocl )
+        return;
+    ocl_close( ocl->library );
+    x264_free( ocl );
+}
+
+/* define from recent cl_ext.h, copied here in case headers are old */
+#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
+
+/* Requires full include path in case of out-of-tree builds */
+#include "common/oclobj.h"
+
+static int x264_detect_switchable_graphics( void );
+
+/* Try to load the cached compiled program binary, verify the device context is
+ * still valid before reuse */
+static cl_program x264_opencl_cache_load( x264_t *h, char *dev_name, char *dev_vendor, char *driver_version )
+{
+    /* try to load cached program binary */
+    FILE *fp = fopen( h->param.psz_clbin_file, "rb" );
+    if( !fp )
+        return NULL;
+
+    x264_opencl_function_t *ocl = h->opencl.ocl;
+    cl_program program = NULL;
+    uint8_t *binary = NULL;
+
+    fseek( fp, 0, SEEK_END );
+    size_t size = ftell( fp );
+    rewind( fp );
+    CHECKED_MALLOC( binary, size );
+
+    fread( binary, 1, size, fp );
+    const uint8_t *ptr = (const uint8_t*)binary;
+
+#define CHECK_STRING( STR )\
+    do {\
+        size_t len = strlen( STR );\
+        if( size <= len || strncmp( (char*)ptr, STR, len ) )\
+            goto fail;\
+        else {\
+            size -= (len+1); ptr += (len+1);\
+        }\
+    } while( 0 )
+
+    CHECK_STRING( dev_name );
+    CHECK_STRING( dev_vendor );
+    CHECK_STRING( driver_version );
+    CHECK_STRING( x264_opencl_source_hash );
+#undef CHECK_STRING
+
+    cl_int status;
+    program = ocl->clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
+    if( status != CL_SUCCESS )
+        program = NULL;
+
+fail:
+    fclose( fp );
+    x264_free( binary );
+    return program;
+}
+
+/* Save the compiled program binary to a file for later reuse. Device context
+ * is also saved in the cache file so we do not reuse stale binaries */
+static void x264_opencl_cache_save( x264_t *h, cl_program program, char *dev_name, char *dev_vendor, char *driver_version )
+{
+    FILE *fp = fopen( h->param.psz_clbin_file, "wb" );
+    if( !fp )
+    {
+        x264_log( h, X264_LOG_INFO, "OpenCL: unable to open clbin file for write\n" );
+        return;
+    }
+
+    x264_opencl_function_t *ocl = h->opencl.ocl;
+    uint8_t *binary = NULL;
+
+    size_t size = 0;
+    cl_int status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
+    if( status != CL_SUCCESS || !size )
+    {
+        x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary size, no cache file generated\n" );
+        goto fail;
+    }
+
+    CHECKED_MALLOC( binary, size );
+    status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &binary, NULL );
+    if( status != CL_SUCCESS )
+    {
+        x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary, no cache file generated\n" );
+        goto fail;
+    }
+
+    fputs( dev_name, fp );
+    fputc( '\n', fp );
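The loader above is what makes OpenCL an optional dependency: libOpenCL is dlopen()ed at runtime and every entry point is resolved into a function-pointer table, so x264 binaries still run on machines with no OpenCL driver installed. A minimal standalone sketch of that pattern (Linux library name, reduced error handling; the simplified typedef stands in for the full cl_platform_id-based signature):

#include <dlfcn.h>
#include <stdio.h>

typedef int (*clGetPlatformIDs_func)( unsigned num_entries, void *platforms, unsigned *num_platforms );

int main( void )
{
    void *lib = dlopen( "libOpenCL.so", RTLD_NOW );
    if( !lib )
        return 1;  /* no driver installed; the encoder would simply disable OpenCL */
    clGetPlatformIDs_func get_platforms =
        (clGetPlatformIDs_func)dlsym( lib, "clGetPlatformIDs" );
    unsigned n = 0;
    if( get_platforms && !get_platforms( 0, NULL, &n ) )
        printf( "%u OpenCL platform(s)\n", n );
    dlclose( lib );
    return 0;
}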
x264-snapshot-20130723-2245.tar.bz2/common/opencl.h
Added
@@ -0,0 +1,804 @@
+/*****************************************************************************
+ * opencl.h: OpenCL structures and defines
+ *****************************************************************************
+ * Copyright (C) 2012-2013 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *          Anton Mitrofanov <BugMaster@narod.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_OPENCL_H
+#define X264_OPENCL_H
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#include "extras/cl.h"
+
+#define OCL_API(ret, attr, name) typedef ret (attr *name##_func)
+
+/* Platform API */
+OCL_API(cl_int, CL_API_CALL, clGetPlatformIDs)
+( cl_uint          /* num_entries */,
+  cl_platform_id * /* platforms */,
+  cl_uint *        /* num_platforms */);
+
+OCL_API(cl_int, CL_API_CALL, clGetPlatformInfo)
+( cl_platform_id   /* platform */,
+  cl_platform_info /* param_name */,
+  size_t           /* param_value_size */,
+  void *           /* param_value */,
+  size_t *         /* param_value_size_ret */);
+
+/* Device APIs */
+OCL_API(cl_int, CL_API_CALL, clGetDeviceIDs)
+( cl_platform_id /* platform */,
+  cl_device_type /* device_type */,
+  cl_uint        /* num_entries */,
+  cl_device_id * /* devices */,
+  cl_uint *      /* num_devices */);
+
+OCL_API(cl_int, CL_API_CALL, clGetDeviceInfo)
+( cl_device_id   /* device */,
+  cl_device_info /* param_name */,
+  size_t         /* param_value_size */,
+  void *         /* param_value */,
+  size_t *       /* param_value_size_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clCreateSubDevices)
+( cl_device_id                         /* in_device */,
+  const cl_device_partition_property * /* properties */,
+  cl_uint                              /* num_devices */,
+  cl_device_id *                       /* out_devices */,
+  cl_uint *                            /* num_devices_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clRetainDevice)
+( cl_device_id /* device */);
+
+OCL_API(cl_int, CL_API_CALL, clReleaseDevice)
+( cl_device_id /* device */);
+
+/* Context APIs */
+OCL_API(cl_context, CL_API_CALL, clCreateContext)
+( const cl_context_properties * /* properties */,
+  cl_uint                       /* num_devices */,
+  const cl_device_id *          /* devices */,
+  void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+  void *                        /* user_data */,
+  cl_int *                      /* errcode_ret */);
+
+OCL_API(cl_context, CL_API_CALL, clCreateContextFromType)
+( const cl_context_properties * /* properties */,
+  cl_device_type                /* device_type */,
+  void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *),
+  void *                        /* user_data */,
+  cl_int *                      /* errcode_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clRetainContext)
+( cl_context /* context */);
+
+OCL_API(cl_int, CL_API_CALL, clReleaseContext)
+( cl_context /* context */);
+
+OCL_API(cl_int, CL_API_CALL, clGetContextInfo)
+( cl_context      /* context */,
+  cl_context_info /* param_name */,
+  size_t          /* param_value_size */,
+  void *          /* param_value */,
+  size_t *        /* param_value_size_ret */);
+
+/* Command Queue APIs */
+OCL_API(cl_command_queue, CL_API_CALL, clCreateCommandQueue)
+( cl_context                  /* context */,
+  cl_device_id                /* device */,
+  cl_command_queue_properties /* properties */,
+  cl_int *                    /* errcode_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clRetainCommandQueue)
+( cl_command_queue /* command_queue */);
+
+OCL_API(cl_int, CL_API_CALL, clReleaseCommandQueue)
+( cl_command_queue /* command_queue */);
+
+OCL_API(cl_int, CL_API_CALL, clGetCommandQueueInfo)
+( cl_command_queue      /* command_queue */,
+  cl_command_queue_info /* param_name */,
+  size_t                /* param_value_size */,
+  void *                /* param_value */,
+  size_t *              /* param_value_size_ret */);
+
+/* Memory Object APIs */
+OCL_API(cl_mem, CL_API_CALL, clCreateBuffer)
+( cl_context   /* context */,
+  cl_mem_flags /* flags */,
+  size_t       /* size */,
+  void *       /* host_ptr */,
+  cl_int *     /* errcode_ret */);
+
+OCL_API(cl_mem, CL_API_CALL, clCreateSubBuffer)
+( cl_mem                /* buffer */,
+  cl_mem_flags          /* flags */,
+  cl_buffer_create_type /* buffer_create_type */,
+  const void *          /* buffer_create_info */,
+  cl_int *              /* errcode_ret */);
+
+OCL_API(cl_mem, CL_API_CALL, clCreateImage)
+( cl_context              /* context */,
+  cl_mem_flags            /* flags */,
+  const cl_image_format * /* image_format */,
+  const cl_image_desc *   /* image_desc */,
+  void *                  /* host_ptr */,
+  cl_int *                /* errcode_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clRetainMemObject)
+( cl_mem /* memobj */);
+
+OCL_API(cl_int, CL_API_CALL, clReleaseMemObject)
+( cl_mem /* memobj */);
+
+OCL_API(cl_int, CL_API_CALL, clGetSupportedImageFormats)
+( cl_context         /* context */,
+  cl_mem_flags       /* flags */,
+  cl_mem_object_type /* image_type */,
+  cl_uint            /* num_entries */,
+  cl_image_format *  /* image_formats */,
+  cl_uint *          /* num_image_formats */);
+
+OCL_API(cl_int, CL_API_CALL, clGetMemObjectInfo)
+( cl_mem      /* memobj */,
+  cl_mem_info /* param_name */,
+  size_t      /* param_value_size */,
+  void *      /* param_value */,
+  size_t *    /* param_value_size_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clGetImageInfo)
+( cl_mem        /* image */,
+  cl_image_info /* param_name */,
+  size_t        /* param_value_size */,
+  void *        /* param_value */,
+  size_t *      /* param_value_size_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clSetMemObjectDestructorCallback)
+( cl_mem /* memobj */,
+  void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+  void * /*user_data */ );
+
+/* Sampler APIs */
+OCL_API(cl_sampler, CL_API_CALL, clCreateSampler)
+( cl_context         /* context */,
+  cl_bool            /* normalized_coords */,
+  cl_addressing_mode /* addressing_mode */,
+  cl_filter_mode     /* filter_mode */,
+  cl_int *           /* errcode_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clRetainSampler)
+( cl_sampler /* sampler */);
+
+OCL_API(cl_int, CL_API_CALL, clReleaseSampler)
+( cl_sampler /* sampler */);
+
+OCL_API(cl_int, CL_API_CALL, clGetSamplerInfo)
+( cl_sampler      /* sampler */,
+  cl_sampler_info /* param_name */,
+  size_t          /* param_value_size */,
+  void *          /* param_value */,
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/bidir.cl
Added
@@ -0,0 +1,265 @@ +/* Mode selection routines, select the least SATD cost mode for each lowres + * macroblock. When measuring B slices, this includes measuring the cost of + * three bidir modes. */ + +/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */ +int bidir_satd_8x8_ii_coop4( read_only image2d_t fenc_lowres, + int2 fencpos, + read_only image2d_t fref0_planes, + int2 qpos0, + read_only image2d_t fref1_planes, + int2 qpos1, + int weight, + local sum2_t *tmpp, + int idx ) +{ + volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp; + sum2_t b0, b1, b2, b3; + sum2_t sum = 0; + + // fencpos is full-pel position of original MB + // qpos0 is qpel position within reference frame 0 + // qpos1 is qpel position within reference frame 1 + + int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2); + int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2); + + int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1)); + int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2); + int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2); + + int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2); + int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2); + + int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1)); + int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2); + int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2); + + uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B; + uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B; + + uint vA, vB; + uint enc, ref0, ref1; + uint a0, a1; + const int weight2 = 64 - weight; + +#define READ_BIDIR_DIFF( OUT, X )\ + enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\ + vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\ + vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\ + ref0 = rhadd( vA, vB );\ + vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\ + vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\ + ref1 = rhadd( vA, vB );\ + OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6); + +#define READ_DIFF_EX( OUT, a, b )\ + READ_BIDIR_DIFF( a0, a );\ + READ_BIDIR_DIFF( a1, b );\ + OUT = a0 + (a1<<BITS_PER_SUM); + +#define ROW_8x4_SATD( a, b, c )\ + fencpos.y += a;\ + fref0Apos.y += b;\ + fref0Bpos.y += b;\ + fref1Apos.y += c;\ + fref1Bpos.y += c;\ + READ_DIFF_EX( b0, 0, 4 );\ + READ_DIFF_EX( b1, 1, 5 );\ + READ_DIFF_EX( b2, 2, 6 );\ + READ_DIFF_EX( b3, 3, 7 );\ + HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\ + HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\ + sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 ); + + ROW_8x4_SATD( 0, 0, 0 ); + ROW_8x4_SATD( 4, 4, 4 ); + +#undef READ_BIDIR_DIFF +#undef READ_DIFF_EX +#undef ROW_8x4_SATD + + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1; +} + +/* + * mode selection - pick the least cost partition type for each 8x8 macroblock. + * Intra, list0 or list1. When measuring a B slice, also test three bidir + * possibilities. + * + * fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that + * hold many frames worth of motion vectors. 
We must offset into the correct + * location for this frame's vectors: + * + * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1] + * GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count] + * + * global launch dimensions for P slice estimate: [mb_width, mb_height] + * global launch dimensions for B slice estimate: [mb_width * 4, mb_height] + */ +kernel void mode_selection( read_only image2d_t fenc_lowres, + read_only image2d_t fref0_planes, + read_only image2d_t fref1_planes, + const global short2 *fenc_lowres_mvs0, + const global short2 *fenc_lowres_mvs1, + const global short2 *fref1_lowres_mvs0, + const global int16_t *fenc_lowres_mv_costs0, + const global int16_t *fenc_lowres_mv_costs1, + const global uint16_t *fenc_intra_cost, + global uint16_t *lowres_costs, + global int *frame_stats, + local int16_t *cost_local, + local sum2_t *satd_local, + int mb_width, + int bipred_weight, + int dist_scale_factor, + int b, + int p0, + int p1, + int lambda ) +{ + int mb_x = get_global_id( 0 ); + int b_bidir = b < p1; + if( b_bidir ) + { + /* when mode_selection is run for B frames, it must perform BIDIR SATD + * measurements, so it is launched with four times as many threads in + * order to spread the work around more of the GPU. And it can add + * padding threads in the X direction. */ + mb_x >>= 2; + if( mb_x >= mb_width ) + return; + } + int mb_y = get_global_id( 1 ); + int mb_height = get_global_size( 1 ); + int mb_count = mb_width * mb_height; + int mb_xy = mb_x + mb_y * mb_width; + + /* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */ + if( mb_x < 4 && mb_y == 0 ) + frame_stats[mb_x] = 0; + + int bcost = COST_MAX; + int list_used = 0; + + if( !b_bidir ) + { + int icost = fenc_intra_cost[mb_xy]; + COPY2_IF_LT( bcost, icost, list_used, 0 ); + } + if( b != p0 ) + { + int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy]; + COPY2_IF_LT( bcost, mv_cost0, list_used, 1 ); + } + if( b != p1 ) + { + int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy]; + COPY2_IF_LT( bcost, mv_cost1, list_used, 2 ); + } + + if( b_bidir ) + { + int2 coord = (int2)(mb_x, mb_y) << 3; + int mb_i = get_global_id( 0 ) & 3; + int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); + cost_local += mb_in_group * 4; + satd_local += mb_in_group * 16; + +#define TRY_BIDIR( mv0, mv1, penalty )\ +{\ + int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\ + int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\ + cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\ + int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\ + COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\ +} + + /* temporal prediction */ + short2 dmv0, dmv1; + short2 mvr = fref1_lowres_mvs0[mb_xy]; + dmv0 = (mvr * (short) dist_scale_factor + (short) 128) >> (short) 8; + dmv1 = dmv0 - mvr; + TRY_BIDIR( dmv0, dmv1, 0 ) + + if( as_uint( dmv0 ) || as_uint( dmv1 ) ) + { + /* B-direct prediction */ + dmv0 = 0; dmv1 = 0; + TRY_BIDIR( dmv0, dmv1, 0 ); + } + + /* L0+L1 prediction */ + dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy]; + dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy]; + TRY_BIDIR( dmv0, dmv1, 5 ); +#undef TRY_BIDIR + } + + lowres_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
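Note: the core of READ_BIDIR_DIFF above is the weighted bidirectional blend: with bipred weights summing to 64, the two reference pixels are mixed and rounded with a half-unit bias before the 6-bit shift. The same blend for one pixel pair as scalar C, for reference:

#include <stdint.h>

/* weighted bipred blend; weight is the list0 share of 64, identical to
 * ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6) in the kernel */
static inline uint8_t bipred_blend( uint8_t ref0, uint8_t ref1, int weight )
{
    int weight2 = 64 - weight;
    return (uint8_t)( (ref0 * weight + ref1 * weight2 + 32) >> 6 );
}

With weight == 32 both references contribute equally and the expression reduces to a rounded average, the default when weighted prediction decides against asymmetric weights.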
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/downscale.cl
Added
@@ -0,0 +1,135 @@ +/* + * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image + * + * -- + * + * fenc_img is an output image (area of memory referenced through a texture + * cache). A read of any pixel location (x,y) returns four pixel values: + * + * val.s0 = P(x,y) + * val.s1 = P(x+1,y) + * val.s2 = P(x+2,y) + * val.s3 = P(x+3,y) + * + * This is a 4x replication of the lowres pixels, a trade-off between memory + * size and read latency. + * + * -- + * + * hpel_planes is an output image that contains the four HPEL planes used for + * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with + * the four planar values C | V | H | F + * + * launch dimensions: [lowres-width, lowres-height] + */ +kernel void downscale_hpel( const global pixel *fenc, + write_only image2d_t fenc_img, + write_only image2d_t hpel_planes, + int stride ) +{ + int x = get_global_id( 0 ); + int y = get_global_id( 1 ); + uint4 values; + + fenc += y * stride * 2; + const global pixel *src1 = fenc + stride; + const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride; + int2 pos = (int2)(x, y); + pixel right, left; + + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s0 = rhadd( right, left ); // F + + right = rhadd( fenc[2*x+1], src1[2*x+1] ); + left = rhadd( fenc[2*x+2], src1[2*x+2] ); + values.s1 = rhadd( right, left ); // H + + right = rhadd( src1[2*x], src2[2*x] ); + left = rhadd( src1[2*x+1], src2[2*x+1] ); + values.s2 = rhadd( right, left ); // V + + right = rhadd( src1[2*x+1], src2[2*x+1] ); + left = rhadd( src1[2*x+2], src2[2*x+2] ); + values.s3 = rhadd( right, left ); // C + + uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff); + write_imageui( hpel_planes, pos, val ); + + x = select( x, x+1, x+1 < get_global_size( 0 ) ); + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s1 = rhadd( right, left ); + + x = select( x, x+1, x+1 < get_global_size( 0 ) ); + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s2 = rhadd( right, left ); + + x = select( x, x+1, x+1 < get_global_size( 0 ) ); + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s3 = rhadd( right, left ); + + write_imageui( fenc_img, pos, values ); +} + +/* + * downscale lowres hierarchical motion search image, copy from one image to + * another decimated image. This kernel is called iteratively to generate all + * of the downscales. + * + * launch dimensions: [lower_res width, lower_res height] + */ +kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res ) +{ + int x = get_global_id( 0 ); + int y = get_global_id( 1 ); + int2 pos = (int2)(x, y); + int gs = get_global_size( 0 ); + uint4 top, bot, values; + top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) ); + values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ); + + /* these select statements appear redundant, and they should be, but tests break when + * they are not here. 
I believe this was caused by a driver bug + */ + values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) ); + top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) ); + values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) ); + values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) ); + write_imageui( lower_res, pos, (uint4)(values) ); +} + +/* + * Second copy of downscale kernel, no differences. This is a (no perf loss) + * workaround for a scheduling bug in current Tahiti drivers. This bug has + * theoretically been fixed in the July 2012 driver release from AMD. + */ +kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res ) +{ + int x = get_global_id( 0 ); + int y = get_global_id( 1 ); + int2 pos = (int2)(x, y); + int gs = get_global_size( 0 ); + uint4 top, bot, values; + top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) ); + values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ); + + // see comment in above function copy + values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) ); + top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) ); + values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) ); + values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) ); + write_imageui( lower_res, pos, (uint4)(values) ); +} + +/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */ +kernel void memset_int16( global int16_t *buf, int16_t value ) +{ + buf[get_global_id( 0 )] = value; +}
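Note: everything in downscale_hpel is built from rhadd(), OpenCL's rounding halving add: rhadd(a,b) = (a+b+1)>>1 computed without intermediate overflow. Two nested rhadd()s over a 2x2 block produce one lowres pixel. A scalar C sketch of the same decimation; note the nested form can differ from a true (a+b+c+d+2)>>2 mean by one LSB, exactly as in the kernel:

#include <stdint.h>

static inline uint8_t rhadd8( uint8_t a, uint8_t b )
{
    return (uint8_t)( ((unsigned)a + b + 1) >> 1 );  /* rounding halving add */
}

/* one lowres pixel from a full-res 2x2 block: vertical pairs first,
 * then the horizontal pair, mirroring the kernel's order */
static uint8_t downscale_pixel( const uint8_t *src, int stride, int x, int y )
{
    const uint8_t *row0 = src + 2 * y * stride;
    const uint8_t *row1 = row0 + stride;
    return rhadd8( rhadd8( row0[2*x],   row1[2*x] ),
                   rhadd8( row0[2*x+1], row1[2*x+1] ) );
}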
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/intra.cl
Added
@@ -0,0 +1,1072 @@ +/* Lookahead lowres intra analysis + * + * Each intra analysis function has been implemented twice, once for scalar GPUs + * (NV) and once for vectorized GPUs (AMD pre-Southern Islands). x264 detects + * the GPU type and sets the -DVECTORIZE compile flag accordingly. + * + * All the intra analysis functions were based on their C versions in pixel.c + * and produce the exact same results. + */ + +/* force all clamp arguments and return value to int, prevent ambiguous types */ +#define clamp_int( X, MIN, MAX ) (int) clamp( (int)(X), (int)(MIN), (int)(MAX) ) + +#if VECTORIZE +int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 ) +{ + int8 a_v, d_v; + int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13; + int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33; + + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr0).s04152637; + HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + data += data_stride; + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr1).s04152637; + HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + data += data_stride; + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr2).s04152637; + HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + data += data_stride; + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr3).s04152637; + HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + uint8 sum_v; + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 ); + sum_v = abs( a_v ); + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 ); + sum_v += abs( a_v ); + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 ); + sum_v += abs( a_v ); + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 ); + sum_v += abs( a_v ); + + uint4 sum2 = sum_v.hi + sum_v.lo; + uint2 sum3 = sum2.hi + sum2.lo; + return ( sum3.hi + sum3.lo ) >> 1; +} +#else +SATD_C_8x4_Q( satd_8x4_lp, const local, private ) +#endif + +/**************************************************************************** + * 8x8 prediction for intra luma block + ****************************************************************************/ + +#define F1 rhadd +#define F2( a, b, c ) ( a+2*b+c+2 )>>2 + +#if VECTORIZE +int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2; + pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2; + pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2; + pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + + pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2; + pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2; + pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + + pr2.s0 = ( 2 + top[2] + 
2*top[3] + top[4] ) >> 2; + pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + + pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + + pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; + + pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; + pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2; + + pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; + pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2; + pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2; + + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr3.s0 = F2( left[1], left[2], left[3] ); + pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] ); + pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top ); + pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] ); + pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] ); + pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] ); + pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] ); + pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] ); + pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] ); + pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] ); + pr0.s7 = F2( 
top[5], top[6], top[7] ); + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr3.s0 = F2( left[5], left[6], left[7] ); + pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] ); + pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] ); + pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] ); + pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] ); + pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] ); + pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top ); + pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] ); + pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] ); + pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] ); + pr0.s7 = F2( top[1], top[2], top[3] ); + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr2.s0 = F2( left[1], left[0], left_top ); + pr3.s0 = F2( left[2], left[1], left[0] ); + pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] ); + pr0.s0 = pr2.s1 = F1( left_top, top[0] ); + pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] ); + pr0.s1 = pr2.s2 = F1( top[0], top[1] ); + pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] ); + pr0.s2 = pr2.s3 = F1( top[1], top[2] ); + pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] ); + pr0.s3 = pr2.s4 = F1( top[2], top[3] );
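Note: all the directional predictors above reduce to the two standard H.264 intra smoothing filters named F1 and F2: a rounded 2-tap average for half-sample positions and a [1 2 1]/4 filter for full-sample positions. In plain C they look like this:

#include <stdint.h>

typedef uint8_t pixel;

/* F1: 2-tap rounded average, same as OpenCL rhadd() */
static inline pixel f1( pixel a, pixel b )
{
    return (pixel)( ((unsigned)a + b + 1) >> 1 );
}

/* F2: 3-tap [1 2 1] filter with rounding, matching ( a+2*b+c+2 )>>2 above */
static inline pixel f2( pixel a, pixel b, pixel c )
{
    return (pixel)( ((unsigned)a + 2*b + c + 2) >> 2 );
}

Each directional mode simply walks these filters along its prediction angle; the unrolled pr0..pr3 assignments above are F2 applied to successive triples of the top edge.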
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/motionsearch.cl
Added
@@ -0,0 +1,249 @@ +/* Hierarchical (iterative) OpenCL lowres motion search */ + +inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height ) +{ + /* edge macroblocks might not have a direct descendant, use nearest */ + x = select( x >> 1, (x - (mb_width&1)) >> 1, x == mb_width-1 ); + y = select( y >> 1, (y - (mb_height&1)) >> 1, y == mb_height-1 ); + return (mb_width>>1) * y + x; +} + +/* Four threads calculate an 8x8 SAD. Each does two rows */ +int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs ) +{ + frefpos.y += idx << 1; + fencpos.y += idx << 1; + int cost = 0; + if( frefpos.x < 0 ) + { + /* slow path when MV goes past left edge. The GPU clamps reads from + * (-1, 0) to (0,0), so you get pixels [0, 1, 2, 3] when what you really + * want are [0, 0, 1, 2] + */ + for( int y = 0; y < 2; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0; + pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0; + cost += abs_diff( enc, ref ); + } + } + } + else + { + uint4 enc, ref, costs = 0; + enc = read_imageui( fenc, sampler, fencpos ); + ref = read_imageui( fref, sampler, frefpos ); + costs += abs_diff( enc, ref ); + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) ); + costs += abs_diff( enc, ref ); + enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) ); + costs += abs_diff( enc, ref ); + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) ); + costs += abs_diff( enc, ref ); + cost = costs.s0 + costs.s1 + costs.s2 + costs.s3; + } + costs[idx] = cost; + return costs[0] + costs[1] + costs[2] + costs[3]; +} + +/* One thread performs 8x8 SAD */ +int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos ) +{ + if( frefpos.x < 0 ) + { + /* slow path when MV goes past left edge */ + int cost = 0; + for( int y = 0; y < 8; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0; + uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0; + cost += abs_diff( enc, ref ); + } + } + return cost; + } + else + { + uint4 enc, ref, cost = 0; + for( int y = 0; y < 8; y++ ) + { + for( int x = 0; x < 8; x += 4 ) + { + enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ); + cost += abs_diff( enc, ref ); + } + } + return cost.s0 + cost.s1 + cost.s2 + cost.s3; + } +} +/* + * hierarchical motion estimation + * + * Each kernel launch is a single iteration + * + * MB per work group is determined by lclx / 4 * lcly + * + * global launch dimensions: [mb_width * 4, mb_height] + */ +kernel void hierarchical_motion( read_only image2d_t fenc, + read_only image2d_t fref, + const global short2 *in_mvs, + global short2 *out_mvs, + global int16_t *out_mv_costs, + global short2 *mvp_buffer, + local int16_t *cost_local, + local short2 *mvc_local, + int mb_width, + int lambda, + int me_range, + int scale, + int b_shift_index, + int b_first_iteration, + int b_reverse_references ) +{ + int mb_x = get_global_id( 0 ) >> 2; + if( mb_x >= mb_width ) + return; + int mb_height = get_global_size( 1 ); + int mb_i = get_global_id( 0 ) & 3; + int mb_y = get_global_id( 1 ); + int mb_xy = mb_y * 
mb_width + mb_x; + const int mb_size = 8; + int2 coord = (int2)(mb_x, mb_y) * mb_size; + + const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); + cost_local += 4 * mb_in_group; + + int i_mvc = 0; + mvc_local += 4 * mb_in_group; + mvc_local[mb_i] = 0; + int2 mvp =0; + + if( !b_first_iteration ) + { +#define MVC( DX, DY )\ + {\ + int px = mb_x + DX;\ + int py = mb_y + DY;\ + mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \ + in_mvs[mb_width * py + px];\ + mvc_local[i_mvc] >>= (short) scale;\ + i_mvc++;\ + } + /* Find MVP from median of MVCs */ + if( b_reverse_references ) + { + /* odd iterations: derive MVP from down and right */ + if( mb_x < mb_width - 1 ) + MVC( 1, 0 ); + if( mb_y < mb_height - 1 ) + { + MVC( 0, 1 ); + if( mb_x > b_shift_index ) + MVC( -1, 1 ); + if( mb_x < mb_width - 1 ) + MVC( 1, 1 ); + } + } + else + { + /* even iterations: derive MVP from up and left */ + if( mb_x > 0 ) + MVC( -1, 0 ); + if( mb_y > 0 ) + { + MVC( 0, -1 ); + if( mb_x < mb_width - 1 ) + MVC( 1, -1 ); + if( mb_x > b_shift_index ) + MVC( -1, -1 ); + } + } +#undef MVC + mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] ); + } + /* current mvp matches the previous mvp and we have not changed scale. We know + * we're going to arrive at the same MV again, so just copy the previous + * result to our output. */ + if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y ) + { + out_mvs[mb_xy] = in_mvs[mb_xy]; + return; + } + mvp_buffer[mb_xy] = convert_short2_sat(mvp); + int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4; + int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4; + + int2 bestmv = clamp(mvp, mv_min, mv_max); + int2 refcrd = coord + bestmv; + + /* measure cost at bestmv */ + int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) + + lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) ); + + do + { + /* measure costs at offsets from bestmv */ + refcrd = coord + bestmv + dia_offs[mb_i]; + int2 trymv = bestmv + dia_offs[mb_i]; + int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) + + lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) ); +
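Note: the MVP used as the search center is the component-wise median of up to three neighboring candidate vectors, computed without branches in x264_median_mv (defined in x264-cl.h further down) via max(min(a,b), min(max(a,b), c)). The same identity for one scalar component, as a sketch:

/* median of three by min/max composition: max(min(a,b), min(max(a,b), c)) */
static inline int median3( int a, int b, int c )
{
    int lo  = a < b ? a : b;
    int hi  = a > b ? a : b;
    int mid = hi < c ? hi : c;
    return lo > mid ? lo : mid;
}

The min/max formulation maps directly onto GPU select hardware, which is why the kernel prefers it over a comparison ladder.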
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/subpel.cl
Added
@@ -0,0 +1,242 @@ +/* OpenCL lowres subpel Refine */ + +/* Each thread performs 8x8 SAD. 4 threads per MB, so the 4 DIA HPEL offsets are + * calculated simultaneously */ +int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos ) +{ + int2 frefpos = qpos >> 2; + int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2); + uint mask_shift = 8 * hpel_idx; + + uint4 cost4 = 0; + + for( int y = 0; y < 8; y++ ) + { + uint4 enc, val4; + enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y)); + val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF; + val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF; + val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF; + val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF; + cost4 += abs_diff( enc, val4 ); + + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y)); + val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF; + val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF; + val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF; + val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF; + cost4 += abs_diff( enc, val4 ); + } + + return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3; +} + +/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */ +int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos ) +{ + int2 frefApos = qpos >> 2; + int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2); + + int2 qposB = qpos + ((qpos & 1) << 1); + int2 frefBpos = qposB >> 2; + int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2); + + uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB; + + int cost = 0; + + for( int y = 0; y < 8; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0; + uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF; + uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF; + cost += abs_diff( enc, rhadd( vA, vB ) ); + } + } + + return cost; +} + +/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane + * + * Each thread collects 1/4 of the rows of diffs and processes one quarter of + * the transforms + */ +int satd_8x8_ii_qpel_coop4( read_only image2d_t fenc, + int2 fencpos, + read_only image2d_t fref_planes, + int2 qpos, + local sum2_t *tmpp, + int idx ) +{ + volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp; + sum2_t b0, b1, b2, b3; + + // fencpos is full-pel position of original MB + // qpos is qpel position within reference frame + int2 frefApos = qpos >> 2; + int hpelA = ((qpos.x&2)>>1) + (qpos.y&2); + + int2 qposB = qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1)); + int2 frefBpos = qposB >> 2; + int hpelB = ((qposB.x&2)>>1) + (qposB.y&2); + + uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB; + + uint vA, vB; + uint a0, a1; + uint enc; + sum2_t sum = 0; + +#define READ_DIFF( OUT, X )\ + enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\ + vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\ + vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) 
).s0 >> mask_shift1) & 0xFF;\ + OUT = enc - rhadd( vA, vB ); + +#define READ_DIFF_EX( OUT, a, b )\ + {\ + READ_DIFF( a0, a );\ + READ_DIFF( a1, b );\ + OUT = a0 + (a1<<BITS_PER_SUM);\ + } +#define ROW_8x4_SATD( a, b )\ + {\ + fencpos.y += a;\ + frefApos.y += b;\ + frefBpos.y += b;\ + READ_DIFF_EX( b0, 0, 4 );\ + READ_DIFF_EX( b1, 1, 5 );\ + READ_DIFF_EX( b2, 2, 6 );\ + READ_DIFF_EX( b3, 3, 7 );\ + HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\ + HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\ + sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\ + } + ROW_8x4_SATD( 0, 0 ); + ROW_8x4_SATD( 4, 4 ); + +#undef READ_DIFF +#undef READ_DIFF_EX +#undef ROW_8x4_SATD + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1; +} + +constant int2 hpoffs[4] = +{ + {0, -2}, {-2, 0}, {2, 0}, {0, 2} +}; + +/* sub pixel refinement of motion vectors, output MVs and costs are moved from + * temporary buffers into final per-frame buffer + * + * global launch dimensions: [mb_width * 4, mb_height] + * + * With X being the source 16x16 pixels, F is the lowres pixel used by the + * motion search. We will now utilize the H V and C pixels (stored in separate + * planes) to search at half-pel increments. + * + * X X X X X X + * F H F H F + * X X X X X X + * V C V C V + * X X X X X X + * F H F H F + * X X X X X X + * + * The YX HPEL bits of the motion vector selects the plane we search in. The + * four planes are packed in the fref_planes 2D image buffer. Each sample + * returns: s0 = F, s1 = H, s2 = V, s3 = C */ +kernel void subpel_refine( read_only image2d_t fenc, + read_only image2d_t fref_planes, + const global short2 *in_mvs, + const global int16_t *in_sad_mv_costs, + local int16_t *cost_local, + local sum2_t *satd_local, + local short2 *mvc_local, + global short2 *fenc_lowres_mv, + global int16_t *fenc_lowres_mv_costs, + int mb_width, + int lambda, + int b, + int ref, + int b_islist1 ) +{ + int mb_x = get_global_id( 0 ) >> 2; + if( mb_x >= mb_width ) + return; + int mb_height = get_global_size( 1 ); + + int mb_i = get_global_id( 0 ) & 3; + int mb_y = get_global_id( 1 ); + int mb_xy = mb_y * mb_width + mb_x; + + /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that + * hold many frames worth of motion vectors. We must offset into the correct + * location for this frame's vectors. The kernel will be passed the correct + * directional buffer for the direction of the search: list1 or list0 + * + * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1] + * GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count] */ + fenc_lowres_mv += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height; + fenc_lowres_mv_costs += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height; + + /* Adjust pointers into local memory buffers for this thread's data */ + int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); + cost_local += mb_in_group * 4; + satd_local += mb_in_group * 16; + mvc_local += mb_in_group * 4; + + int i_mvc = 0; + + mvc_local[0] = mvc_local[1] = mvc_local[2] = mvc_local[3] = 0; + +#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)]; + if( mb_x > 0 ) + MVC( -1, 0 ); + if( mb_y > 0 )
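Note: the plane select at the top of each function is worth spelling out. The four half-pel planes are packed one per byte of a 32-bit texel (s0 = F, s1 = H, s2 = V, s3 = C, per the diagram above), and the two HPEL bits of the motion vector pick the byte. In scalar C the lookup is just a shift and mask:

#include <stdint.h>

/* packed planes: bits 0-7 = F, 8-15 = H, 16-23 = V, 24-31 = C */
static inline unsigned hpel_plane_sample( uint32_t texel, int qpel_x, int qpel_y )
{
    /* HPEL bit of x selects H, HPEL bit of y selects V, both together C */
    int plane = ((qpel_x & 2) >> 1) + (qpel_y & 2);   /* 0, 1, 2 or 3 */
    return (texel >> (8 * plane)) & 0xFF;
}

Quarter-pel positions are then served by rhadd() of the two nearest half-pel samples, which is what the A/B position pairs in sad_8x8_ii_qpel compute.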
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/weightp.cl
Added
@@ -0,0 +1,48 @@ +/* Weightp filter a downscaled image into a temporary output buffer. + * This kernel is launched once for each scale. + * + * Launch dimensions: width x height (in pixels) + */ +kernel void weightp_scaled_images( read_only image2d_t in_plane, + write_only image2d_t out_plane, + uint offset, + uint scale, + uint denom ) +{ + int gx = get_global_id( 0 ); + int gy = get_global_id( 1 ); + uint4 input_val; + uint4 output_val; + + input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)); + output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) ); + write_imageui( out_plane, (int2)(gx, gy), output_val ); +} + +/* Weightp filter for the half-pel interpolated image + * + * Launch dimensions: width x height (in pixels) + */ +kernel void weightp_hpel( read_only image2d_t in_plane, + write_only image2d_t out_plane, + uint offset, + uint scale, + uint denom ) +{ + int gx = get_global_id( 0 ); + int gy = get_global_id( 1 ); + uint input_val; + uint output_val; + + input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0; + //Unpack + uint4 temp; + temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff; + temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff; + + temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) ); + + //Pack + output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24); + write_imageui( out_plane, (int2)(gx, gy), output_val ); +}
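Note: both kernels apply the same explicit weighted-prediction transform from H.264, out = offset + ((scale * in) >> denom); weightp_hpel merely unpacks the four byte-packed HPEL planes, weights each, and repacks. The transform for a single sample as a C sketch, with no clipping, matching the kernels:

#include <stdint.h>

/* explicit weightp: out = offset + ((scale * in) >> denom) */
static inline uint32_t weightp_sample( uint32_t in, uint32_t offset,
                                       uint32_t scale, uint32_t denom )
{
    return offset + ((scale * in) >> denom);
}

scale = 1 << denom with offset = 0 is the identity; a lower scale models a fade to black, which is the case weightp typically catches.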
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/x264-cl.h
Added
@@ -0,0 +1,132 @@ +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable + +constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; + +/* 7.18.1.1 Exact-width integer types */ +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef short int16_t; +typedef unsigned short uint16_t; +typedef int int32_t; +typedef unsigned uint32_t; + +typedef uint8_t pixel; +typedef uint16_t sum_t; +typedef uint32_t sum2_t; + +#define LOWRES_COST_MASK ((1<<14)-1) +#define LOWRES_COST_SHIFT 14 +#define COST_MAX (1<<28) + +#define PIXEL_MAX 255 +#define BITS_PER_SUM (8 * sizeof(sum_t)) + +/* Constants for offsets into frame statistics buffer */ +#define COST_EST 0 +#define COST_EST_AQ 1 +#define INTRA_MBS 2 + +#define COPY2_IF_LT( x, y, a, b )\ + if((y)<(x))\ + {\ + (x) = (y);\ + (a) = (b);\ + } + +constant int2 dia_offs[4] = +{ + {0, -1}, {-1, 0}, {1, 0}, {0, 1}, +}; + +inline pixel x264_clip_pixel( int x ) +{ + return (pixel) clamp( x, (int) 0, (int) PIXEL_MAX ); +} + +inline int2 x264_median_mv( short2 a, short2 b, short2 c ) +{ + short2 t1 = min(a, b); + short2 t2 = min(max(a, b), c); + return convert_int2(max(t1, t2)); +} + +inline sum2_t abs2( sum2_t a ) +{ + sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1); + return (a + s) ^ s; +} + +#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\ + sum2_t t0 = s0 + s1;\ + sum2_t t1 = s0 - s1;\ + sum2_t t2 = s2 + s3;\ + sum2_t t3 = s2 - s3;\ + d0 = t0 + t2;\ + d2 = t0 - t2;\ + d1 = t1 + t3;\ + d3 = t1 - t3;\ +} + +#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\ + int2 t0 = s0 + s1;\ + int2 t1 = s0 - s1;\ + int2 t2 = s2 + s3;\ + int2 t3 = s2 - s3;\ + d0 = t0 + t2;\ + d2 = t0 - t2;\ + d1 = t1 + t3;\ + d3 = t1 - t3;\ +} + +#define SATD_C_8x4_Q( name, q1, q2 )\ + int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\ + {\ + sum2_t tmp[4][4];\ + sum2_t a0, a1, a2, a3;\ + sum2_t sum = 0;\ + for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\ + {\ + a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\ + a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\ + a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\ + a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\ + HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\ + }\ + for( int i = 0; i < 4; i++ )\ + {\ + HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\ + sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\ + }\ + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\ + } + +/* + * Utility function to perform a parallel sum reduction of an array of integers + */ +int parallel_sum( int value, int x, volatile local int *array ) +{ + array[x] = value; + barrier( CLK_LOCAL_MEM_FENCE ); + + int dim = get_local_size( 0 ); + + while( dim > 1 ) + { + dim >>= 1; + + if( x < dim ) + array[x] += array[x + dim]; + + if( dim > 32 ) + barrier( CLK_LOCAL_MEM_FENCE ); + } + + return array[0]; +} + +int mv_cost( uint2 mvd ) +{ + float2 mvdf = (float2)(mvd.x, mvd.y) + 1.0f; + float2 cost = round( log2(mvdf) * 2.0f + 0.718f + (float2)(!!mvd.x, !!mvd.y) ); + return (int) (cost.x + cost.y); +}
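Note: mv_cost() at the end is a cheap rate model rather than a real entropy coder: the bits needed to code a motion-vector difference grow roughly logarithmically with magnitude, so each component costs about 2*log2(|mvd|+1) + 0.718 plus one unit if nonzero (the 0.718 constant is taken as-is from the kernel). A scalar C rendering of the same estimate, as a sketch:

#include <math.h>   /* link with -lm */

/* approximate coding cost of one MV component difference */
static int mv_component_bits( int mvd )
{
    double bits = log2( fabs( mvd ) + 1.0 ) * 2.0 + 0.718 + (mvd != 0);
    return (int)round( bits );
}

/* cost of a full MV delta; callers scale the result by lambda */
static int mv_cost_scalar( int mvdx, int mvdy )
{
    return mv_component_bits( mvdx ) + mv_component_bits( mvdy );
}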
View file
x264-snapshot-20130224-2245.tar.bz2/common/osdep.h -> x264-snapshot-20130723-2245.tar.bz2/common/osdep.h
Changed
@@ -79,6 +79,7 @@ #else #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n))) #endif +#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 ) #define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 ) #define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 ) #define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 ) @@ -110,9 +111,26 @@ #define EXPAND(x) x +#if HAVE_32B_STACK_ALIGNMENT +#define ALIGNED_ARRAY_32( type, name, sub1, ... )\ + ALIGNED_32( type name sub1 __VA_ARGS__ ) +#else #define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) ) +#endif + #define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) ) +/* For AVX2 */ +#if ARCH_X86 || ARCH_X86_64 +#define NATIVE_ALIGN 32 +#define ALIGNED_N ALIGNED_32 +#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32 +#else +#define NATIVE_ALIGN 16 +#define ALIGNED_N ALIGNED_16 +#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16 +#endif + #define UNINIT(x) x=x #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0) @@ -204,6 +222,25 @@ #define x264_threading_init() 0 #endif +static ALWAYS_INLINE int x264_pthread_fetch_and_add( int *val, int add, x264_pthread_mutex_t *mutex ) +{ +#if HAVE_THREAD +#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ > 0) && ARCH_X86 + return __sync_fetch_and_add( val, add ); +#else + x264_pthread_mutex_lock( mutex ); + int res = *val; + *val += add; + x264_pthread_mutex_unlock( mutex ); + return res; +#endif +#else + int res = *val; + *val += add; + return res; +#endif +} + #define WORD_SIZE sizeof(void*) #define asm __asm__ @@ -254,6 +291,13 @@ } #endif +/* For values with 4 bits or less. */ +static int ALWAYS_INLINE x264_ctz_4bit( uint32_t x ) +{ + static uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0}; + return lut[x]; +} + #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3) #define x264_clz(x) __builtin_clz(x) #define x264_ctz(x) __builtin_ctz(x)
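Note: the new x264_pthread_fetch_and_add() gives the codebase one portable atomic increment: GCC's __sync_fetch_and_add on x86, a mutex round-trip elsewhere, and plain arithmetic when built without threads. Its natural consumer is a shared counter from which each thread atomically claims the next row or macroblock. A sketch of that usage pattern with the GCC builtin; the worker scaffolding is illustrative, not x264 code:

#include <stddef.h>

static int next_row = 0;   /* shared among workers */

static void *worker( void *arg )
{
    int total_rows = *(int *)arg;
    for( ;; )
    {
        /* returns the previous value, so each row is claimed exactly once */
        int row = __sync_fetch_and_add( &next_row, 1 );
        if( row >= total_rows )
            break;
        /* ... analyse/encode row ... */
    }
    return NULL;
}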
View file
x264-snapshot-20130224-2245.tar.bz2/common/pixel.c -> x264-snapshot-20130723-2245.tar.bz2/common/pixel.c
Changed
@@ -370,7 +370,6 @@ return (sum+2)>>2; } - static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride ) { sum2_t tmp[32]; @@ -501,6 +500,7 @@ #if !HIGH_BIT_DEPTH SATD_X_DECL6( _sse2 ) SATD_X_DECL7( _ssse3 ) +SATD_X_DECL6( _ssse3_atom ) SATD_X_DECL7( _sse4 ) SATD_X_DECL7( _avx ) SATD_X_DECL7( _xop ) @@ -528,6 +528,7 @@ INTRA_MBCMP_8x8( sad,, _c ) INTRA_MBCMP_8x8(sa8d,, _c ) #if HIGH_BIT_DEPTH && HAVE_MMX +#define x264_predict_8x8_v_sse2 x264_predict_8x8_v_sse INTRA_MBCMP_8x8( sad, _mmx2, _c ) INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 ) #endif @@ -554,6 +555,9 @@ #if HAVE_MMX #if HIGH_BIT_DEPTH +#define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse +#define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse +#define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 ) @@ -841,6 +845,7 @@ if( cpu&X264_CPU_MMX2 ) { INIT7( sad, _mmx2 ); + INIT7_NAME( sad_aligned, sad, _mmx2 ); INIT7( sad_x3, _mmx2 ); INIT7( sad_x4, _mmx2 ); INIT8( satd, _mmx2 ); @@ -870,11 +875,14 @@ { INIT4_NAME( sad_aligned, sad, _sse2_aligned ); INIT5( ssd, _sse2 ); + INIT6( satd, _sse2 ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2; #endif pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; @@ -916,10 +924,14 @@ if( cpu&X264_CPU_SSSE3 ) { INIT4_NAME( sad_aligned, sad, _ssse3_aligned ); + pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_ssse3; + pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_ssse3; INIT7( sad, _ssse3 ); INIT7( sad_x3, _ssse3 ); INIT7( sad_x4, _ssse3 ); INIT_ADS( _ssse3 ); + INIT6( satd, _ssse3 ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3; if( !(cpu&X264_CPU_STACK_MOD4) ) { @@ -930,6 +942,9 @@ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3; +#endif pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3; @@ -937,16 +952,24 @@ } if( cpu&X264_CPU_SSE4 ) { + INIT6( satd, _sse4 ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse4; if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _sse4 ); } pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4; +#endif } if( cpu&X264_CPU_AVX ) { + INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs doesn't benefit from an aligned version */ INIT_ADS( _avx ); + INIT6( satd, _avx ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _avx ); @@ -959,12 +982,26 @@ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; pixf->ssim_end4 = x264_pixel_ssim_end4_avx; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx; +#endif } if( cpu&X264_CPU_XOP ) { pixf->vsad = x264_pixel_vsad_xop; pixf->asd8 = x264_pixel_asd8_xop; } + if( cpu&X264_CPU_AVX2 ) 
+ { + INIT2( ssd, _avx2 ); + INIT2( sad, _avx2 ); + INIT2_NAME( sad_aligned, sad, _avx2 ); + INIT2( sad_x3, _avx2 ); + INIT2( sad_x4, _avx2 ); + pixf->vsad = x264_pixel_vsad_avx2; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX @@ -1003,14 +1040,14 @@ INIT4( sad_x3, _cache32_mmx2 ); INIT4( sad_x4, _cache32_mmx2 ); } - else if( cpu&X264_CPU_CACHELINE_64 ) + else if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) ) { INIT5( sad, _cache64_mmx2 ); INIT4( sad_x3, _cache64_mmx2 ); INIT4( sad_x4, _cache64_mmx2 ); } #else - if( cpu&X264_CPU_CACHELINE_64 ) + if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) ) { pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2; pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmx2; @@ -1044,6 +1081,7 @@ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2; #endif pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2; @@ -1060,10 +1098,7 @@ pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; INIT6( satd_x3, _sse2 ); INIT6( satd_x4, _sse2 ); - if( !(cpu&X264_CPU_STACK_MOD4) ) - { - INIT4( hadamard_ac, _sse2 ); - } + INIT4( hadamard_ac, _sse2 ); INIT_ADS( _sse2 ); pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2; @@ -1113,9 +1148,9 @@ if( cpu&X264_CPU_SSSE3 ) { + INIT4( hadamard_ac, _ssse3 ); if( !(cpu&X264_CPU_STACK_MOD4) ) { - INIT4( hadamard_ac, _ssse3 ); pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_ssse3; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3; pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_ssse3; @@ -1124,7 +1159,20 @@ #endif } INIT_ADS( _ssse3 ); - if( !(cpu&X264_CPU_SLOW_ATOM) ) + if( cpu&X264_CPU_SLOW_ATOM ) + { + pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3_atom; + INIT6( satd, _ssse3_atom ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3_atom; + INIT6( satd_x3, _ssse3_atom ); + INIT6( satd_x4, _ssse3_atom ); + INIT4( hadamard_ac, _ssse3_atom ); +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom; +#endif + }
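Note: all of these hunks follow the same initialization idiom: the function-pointer table starts with C fallbacks, then each successive capability block (MMX2, SSE2, SSSE3, SSE4, AVX, and now AVX2) overwrites only the entries it can improve, so the final table holds the fastest implementation the detected CPU supports. A stripped-down sketch of the idiom; the flag bits, names, and stub bodies here are illustrative:

#include <stdint.h>

#define CPU_SSE2 (1u<<0)
#define CPU_AVX2 (1u<<1)

typedef int (*satd_fn)( const uint8_t *p1, intptr_t s1,
                        const uint8_t *p2, intptr_t s2 );

/* stand-ins for the C and asm versions */
static int satd_c( const uint8_t *p1, intptr_t s1, const uint8_t *p2, intptr_t s2 )    { return 0; }
static int satd_sse2( const uint8_t *p1, intptr_t s1, const uint8_t *p2, intptr_t s2 ) { return 1; }
static int satd_avx2( const uint8_t *p1, intptr_t s1, const uint8_t *p2, intptr_t s2 ) { return 2; }

static void pixel_init( uint32_t cpu, satd_fn *pf )
{
    *pf = satd_c;          /* portable baseline first */
    if( cpu & CPU_SSE2 )
        *pf = satd_sse2;   /* each level simply overwrites the previous */
    if( cpu & CPU_AVX2 )
        *pf = satd_avx2;
}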
View file
x264-snapshot-20130224-2245.tar.bz2/common/pixel.h -> x264-snapshot-20130723-2245.tar.bz2/common/pixel.h
Changed
@@ -90,6 +90,7 @@ x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */ int (*vsad)( pixel *, intptr_t, int ); int (*asd8)( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); + uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); uint64_t (*var[4])( pixel *pix, intptr_t stride ); int (*var2[4])( pixel *pix1, intptr_t stride1,
View file
x264-snapshot-20130224-2245.tar.bz2/common/quant.c -> x264-snapshot-20130723-2245.tar.bz2/common/quant.c
Changed
@@ -63,6 +63,19 @@ return !!nz; } +static int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ) +{ + int nza = 0; + for( int j = 0; j < 4; j++ ) + { + int nz = 0; + for( int i = 0; i < 16; i++ ) + QUANT_ONE( dct[j][i], mf[i], bias[i] ); + nza |= (!!nz)<<j; + } + return nza; +} + static int quant_4x4_dc( dctcoef dct[16], int mf, int bias ) { int nz = 0; @@ -405,6 +418,7 @@ { pf->quant_8x8 = quant_8x8; pf->quant_4x4 = quant_4x4; + pf->quant_4x4x4 = quant_4x4x4; pf->quant_4x4_dc = quant_4x4_dc; pf->quant_2x2_dc = quant_2x2_dc; @@ -442,11 +456,6 @@ pf->denoise_dct = x264_denoise_dct_mmx; pf->decimate_score15 = x264_decimate_score15_mmx2; pf->decimate_score16 = x264_decimate_score16_mmx2; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz; - pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz; - } pf->decimate_score64 = x264_decimate_score64_mmx2; pf->coeff_last8 = x264_coeff_last8_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; @@ -464,6 +473,7 @@ if( cpu&X264_CPU_SSE2 ) { pf->quant_4x4 = x264_quant_4x4_sse2; + pf->quant_4x4x4 = x264_quant_4x4x4_sse2; pf->quant_8x8 = x264_quant_8x8_sse2; pf->quant_2x2_dc = x264_quant_2x2_dc_sse2; pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; @@ -474,11 +484,6 @@ pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; pf->decimate_score64 = x264_decimate_score64_sse2; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_sse2_slowctz; - pf->decimate_score16 = x264_decimate_score16_sse2_slowctz; - } pf->coeff_last8 = x264_coeff_last8_sse2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2; @@ -501,17 +506,13 @@ if( cpu&X264_CPU_SSSE3 ) { pf->quant_4x4 = x264_quant_4x4_ssse3; + pf->quant_4x4x4 = x264_quant_4x4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3; pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; pf->decimate_score15 = x264_decimate_score15_ssse3; pf->decimate_score16 = x264_decimate_score16_ssse3; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz; - pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz; - } pf->decimate_score64 = x264_decimate_score64_ssse3; INIT_TRELLIS( ssse3 ); } @@ -520,6 +521,7 @@ pf->quant_2x2_dc = x264_quant_2x2_dc_sse4; pf->quant_4x4_dc = x264_quant_4x4_dc_sse4; pf->quant_4x4 = x264_quant_4x4_sse4; + pf->quant_4x4x4 = x264_quant_4x4x4_sse4; pf->quant_8x8 = x264_quant_8x8_sse4; } if( cpu&X264_CPU_AVX ) @@ -535,6 +537,17 @@ pf->dequant_8x8 = x264_dequant_8x8_xop; } } + if( cpu&X264_CPU_AVX2 ) + { + pf->quant_4x4 = x264_quant_4x4_avx2; + pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; + pf->quant_8x8 = x264_quant_8x8_avx2; + pf->quant_4x4x4 = x264_quant_4x4x4_avx2; + pf->dequant_4x4 = x264_dequant_4x4_avx2; + pf->dequant_8x8 = x264_dequant_8x8_avx2; + pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; + pf->denoise_dct = x264_denoise_dct_avx2; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX @@ -543,6 +556,7 @@ { #if ARCH_X86 pf->quant_4x4 = x264_quant_4x4_mmx; + pf->quant_4x4x4 = x264_quant_4x4x4_mmx; pf->quant_8x8 = x264_quant_8x8_mmx; pf->dequant_4x4 = x264_dequant_4x4_mmx; pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2; @@ -563,11 +577,6 @@ pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2; pf->decimate_score15 = x264_decimate_score15_mmx2; pf->decimate_score16 = x264_decimate_score16_mmx2; - if( 
cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz; - pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz; - } pf->decimate_score64 = x264_decimate_score64_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2; @@ -592,6 +601,7 @@ { pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; pf->quant_4x4 = x264_quant_4x4_sse2; + pf->quant_4x4x4 = x264_quant_4x4x4_sse2; pf->quant_8x8 = x264_quant_8x8_sse2; pf->dequant_4x4 = x264_dequant_4x4_sse2; pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2; @@ -606,11 +616,6 @@ pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; pf->decimate_score64 = x264_decimate_score64_sse2; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_sse2_slowctz; - pf->decimate_score16 = x264_decimate_score16_sse2_slowctz; - } pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2; @@ -631,18 +636,25 @@ pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3; pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->quant_4x4 = x264_quant_4x4_ssse3; + pf->quant_4x4x4 = x264_quant_4x4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; pf->decimate_score15 = x264_decimate_score15_ssse3; pf->decimate_score16 = x264_decimate_score16_ssse3; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz; - pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz; - } pf->decimate_score64 = x264_decimate_score64_ssse3; INIT_TRELLIS( ssse3 ); + pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; + pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3; + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; + pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt; + } } if( cpu&X264_CPU_SSE4 ) @@ -673,6 +685,30 @@ pf->dequant_8x8 = x264_dequant_8x8_xop; } } + + if( cpu&X264_CPU_AVX2 ) + { + pf->quant_4x4 = x264_quant_4x4_avx2; + pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; + pf->quant_8x8 = x264_quant_8x8_avx2; + pf->quant_4x4x4 = x264_quant_4x4x4_avx2; + pf->dequant_4x4 = x264_dequant_4x4_avx2; + pf->dequant_8x8 = x264_dequant_8x8_avx2; + pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; + if( h->param.i_cqm_preset == X264_CQM_FLAT ) + { + pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2; + pf->dequant_8x8 = x264_dequant_8x8_flat16_avx2; + } + pf->decimate_score64 = x264_decimate_score64_avx2; + pf->denoise_dct = x264_denoise_dct_avx2; + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
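Note: the new quant_4x4x4() quantizes the four 4x4 sub-blocks of an 8x8 region in one call and packs the result into a 4-bit mask, with bit j set when sub-block j kept any nonzero level, so callers can test (nz_mask >> j) & 1 and skip coding empty sub-blocks without rescanning. The per-coefficient step (QUANT_ONE in quant.c, not shown in this hunk) is a deadzone quantizer; a scalar sketch of that shape, with typedefs simplified:

#include <stdint.h>

typedef int16_t  dctcoef;
typedef uint16_t udctcoef;

/* deadzone quantizer for one coefficient: add bias to the magnitude,
 * scale by mf, shift, restore the sign; returns a nonzero flag */
static inline int quant_one( dctcoef *coef, udctcoef mf, udctcoef bias )
{
    if( *coef > 0 )
        *coef = (dctcoef)( (bias + *coef) * mf >> 16 );
    else
        *coef = (dctcoef)( -((bias - *coef) * mf >> 16) );
    return *coef != 0;
}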
View file
x264-snapshot-20130224-2245.tar.bz2/common/quant.h -> x264-snapshot-20130723-2245.tar.bz2/common/quant.h
Changed
@@ -29,8 +29,9 @@ typedef struct { - int (*quant_8x8)( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); - int (*quant_4x4)( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); + int (*quant_8x8) ( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); + int (*quant_4x4) ( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); + int (*quant_4x4x4)( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int (*quant_4x4_dc)( dctcoef dct[16], int mf, int bias ); int (*quant_2x2_dc)( dctcoef dct[4], int mf, int bias );
View file
x264-snapshot-20130224-2245.tar.bz2/common/set.c -> x264-snapshot-20130723-2245.tar.bz2/common/set.c
Changed
@@ -85,44 +85,49 @@ int max_qp_err = -1; int max_chroma_qp_err = -1; int min_qp_err = QP_MAX+1; - int num_8x8_lists = h->sps->i_chroma_format_idc == CHROMA_444 ? 4 : 2; /* Checkasm may segfault if optimized out by --chroma-format */ + int num_8x8_lists = h->sps->i_chroma_format_idc == CHROMA_444 ? 4 + : h->param.analyse.b_transform_8x8 ? 2 : 0; /* Checkasm may segfault if optimized out by --chroma-format */ - for( int i = 0; i < 4 + num_8x8_lists; i++ ) - { - int size = i<4 ? 16 : 64; - int j; - for( j = (i<4 ? 0 : 4); j < i; j++ ) - if( !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) ) - break; - if( j < i ) - { - h-> quant4_mf[i] = h-> quant4_mf[j]; - h->dequant4_mf[i] = h->dequant4_mf[j]; - h->unquant4_mf[i] = h->unquant4_mf[j]; - } - else - { - CHECKED_MALLOC( h-> quant4_mf[i], (QP_MAX+1)*size*sizeof(udctcoef) ); - CHECKED_MALLOC( h->dequant4_mf[i], 6*size*sizeof(int) ); - CHECKED_MALLOC( h->unquant4_mf[i], (QP_MAX+1)*size*sizeof(int) ); - } - - for( j = (i<4 ? 0 : 4); j < i; j++ ) - if( deadzone[j&3] == deadzone[i&3] && - !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) ) - break; - if( j < i ) - { - h->quant4_bias[i] = h->quant4_bias[j]; - h->quant4_bias0[i] = h->quant4_bias0[j]; - } - else - { - CHECKED_MALLOC( h->quant4_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) ); - CHECKED_MALLOC( h->quant4_bias0[i], (QP_MAX+1)*size*sizeof(udctcoef) ); - } +#define CQM_ALLOC( w, count )\ + for( int i = 0; i < count; i++ )\ + {\ + int size = w*w;\ + int start = w == 8 ? 4 : 0;\ + int j;\ + for( j = 0; j < i; j++ )\ + if( !memcmp( h->pps->scaling_list[i+start], h->pps->scaling_list[j+start], size*sizeof(uint8_t) ) )\ + break;\ + if( j < i )\ + {\ + h-> quant##w##_mf[i] = h-> quant##w##_mf[j];\ + h->dequant##w##_mf[i] = h->dequant##w##_mf[j];\ + h->unquant##w##_mf[i] = h->unquant##w##_mf[j];\ + }\ + else\ + {\ + CHECKED_MALLOC( h-> quant##w##_mf[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + CHECKED_MALLOC( h->dequant##w##_mf[i], 6*size*sizeof(int) );\ + CHECKED_MALLOC( h->unquant##w##_mf[i], (QP_MAX+1)*size*sizeof(int) );\ + }\ + for( j = 0; j < i; j++ )\ + if( deadzone[j] == deadzone[i] &&\ + !memcmp( h->pps->scaling_list[i+start], h->pps->scaling_list[j+start], size*sizeof(uint8_t) ) )\ + break;\ + if( j < i )\ + {\ + h->quant##w##_bias[i] = h->quant##w##_bias[j];\ + h->quant##w##_bias0[i] = h->quant##w##_bias0[j];\ + }\ + else\ + {\ + CHECKED_MALLOC( h->quant##w##_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + CHECKED_MALLOC( h->quant##w##_bias0[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + }\ } + CQM_ALLOC( 4, 4 ) + CQM_ALLOC( 8, num_8x8_lists ) + for( int q = 0; q < 6; q++ ) { for( int i = 0; i < 16; i++ ) @@ -204,6 +209,9 @@ for( int cat = 0; cat < 3 + CHROMA444; cat++ ) { int dct8x8 = cat&1; + if( !h->param.analyse.b_transform_8x8 && dct8x8 ) + continue; + int size = dct8x8 ? 64 : 16; udctcoef *nr_offset = h->nr_offset_emergency[q][cat]; /* Denoise chroma first (due to h264's chroma QP offset), then luma, then DC. */
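Note: the CQM_ALLOC macro folds the old open-coded loop into a reusable pattern: before allocating quant/dequant tables for scaling list i, memcmp() it against every earlier list of the same size and, on a match, alias the already-allocated pointers instead of allocating again. The dedup pattern in isolation, as a sketch:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* allocate one table per distinct 16-byte key; duplicates share storage.
 * The caller must free only the first occurrence of each distinct key. */
static int alloc_dedup( const uint8_t keys[][16], void *tables[],
                        int count, size_t table_size )
{
    for( int i = 0; i < count; i++ )
    {
        int j;
        for( j = 0; j < i; j++ )
            if( !memcmp( keys[i], keys[j], 16 ) )
                break;
        if( j < i )
            tables[i] = tables[j];             /* alias, no new allocation */
        else if( !(tables[i] = malloc( table_size )) )
            return -1;
    }
    return 0;
}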
View file
x264-snapshot-20130224-2245.tar.bz2/common/win32thread.c -> x264-snapshot-20130723-2245.tar.bz2/common/win32thread.c
Changed
@@ -279,7 +279,7 @@
     memset( &thread_control, 0, sizeof(x264_win32thread_control_t) );
 }

-int x264_pthread_num_processors_np()
+int x264_pthread_num_processors_np( void )
 {
     DWORD_PTR system_cpus, process_cpus = 0;
     int cpus = 0;
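The only change here is adding void to the parameter list. In C (before C23), an empty list in a function declaration is an old-style form that leaves the parameters unspecified and unchecked at call sites; ( void ) makes the zero-argument prototype explicit. A minimal illustration:

/* In C, an empty parameter list is an old-style declaration. */
int f() { return 0; }      /* callers' arguments are not type-checked */
int g(void) { return 0; }  /* explicit prototype: takes no arguments  */

int main(void)
{
    f(1, 2);   /* accepted under the old-style declaration */
    g();       /* g(1) here would be a compile-time error  */
    return 0;
}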
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/bitstream-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/bitstream-a.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2010-2013 x264 project ;* ;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -32,100 +32,105 @@ ;----------------------------------------------------------------------------- ; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end ) ;----------------------------------------------------------------------------- - %macro NAL_LOOP 2 -%1_escape: +%%escape: ; Detect false positive to avoid unneccessary escape loop xor r3d, r3d cmp byte [r0+r1-1], 0 setnz r3b - xor r3d, r4d + xor k3, k4 jnz .escape - jmp %1_continue + jmp %%continue ALIGN 16 %1: - pcmpeqb m3, m1, m4 - pcmpeqb m2, m0, m4 - pmovmskb r3d, m3 - %2 [r0+r1], m0 + mova [r0+r1+mmsize], m1 + pcmpeqb m1, m0 + mova [r0+r1], m2 + pcmpeqb m2, m0 + pmovmskb r3d, m1 + %2 m1, [r1+r2+3*mmsize] pmovmskb r4d, m2 - shl r3d, mmsize - mova m0, [r1+r2+2*mmsize] - or r4d, r3d - %2 [r0+r1+mmsize], m1 - lea r3d, [r4+r4+1] - mova m1, [r1+r2+3*mmsize] - and r4d, r3d - jnz %1_escape -%1_continue: + %2 m2, [r1+r2+2*mmsize] + shl k3, mmsize + or k3, k4 + lea k4, [2*r3+1] + and k4, k3 + jnz %%escape +%%continue: add r1, 2*mmsize jl %1 %endmacro %macro NAL_ESCAPE 0 +%if mmsize == 32 + %xdefine k3 r3 + %xdefine k4 r4 +%else + %xdefine k3 r3d + %xdefine k4 r4d +%endif cglobal nal_escape, 3,5 - mov r3w, [r1] + movzx r3d, byte [r1] sub r1, r2 ; r1 = offset of current src pointer from end of src - pxor m4, m4 + pxor m0, m0 + mov [r0], r3b sub r0, r1 ; r0 = projected end of dst, assuming no more escapes - mov [r0+r1], r3w - add r1, 2 - jge .ret + or r3d, 0xffffff00 ; ignore data before src - ; Start off by jumping into the escape loop in - ; case there's an escape at the start. - ; And do a few more in scalar until src is aligned again. - jmp .first_escape + ; Start off by jumping into the escape loop in case there's an escape at the start. + ; And do a few more in scalar until dst is aligned. + jmp .escape_loop +%if mmsize == 16 NAL_LOOP .loop_aligned, mova -%if mmsize==16 jmp .ret - NAL_LOOP .loop_unaligned, movu %endif + NAL_LOOP .loop_unaligned, movu .ret: movifnidn rax, r0 RET -ALIGN 16 .escape: ; Skip bytes that are known to be valid - and r4d, r3d - tzcnt r3d, r4d - add r1, r3 + and k4, k3 + tzcnt k4, k4 + xor r3d, r3d ; the last two bytes are known to be zero + add r1, r4 .escape_loop: inc r1 jge .ret -.first_escape: - movzx r3d, byte [r1+r2] - lea r4, [r1+r2] - cmp r3d, 3 - jna .escape_check -.no_escape: + movzx r4d, byte [r1+r2] + shl r3d, 8 + or r3d, r4d + test r3d, 0xfffffc ; if the last two bytes are 0 and the current byte is <=3 + jz .add_escape_byte +.escaped: + lea r4d, [r0+r1] mov [r0+r1], r3b - test r4d, mmsize-1 ; Do SIMD when src is aligned + test r4d, mmsize-1 ; Do SIMD when dst is aligned jnz .escape_loop - mova m0, [r4] - mova m1, [r4+mmsize] -%if mmsize==16 - lea r4d, [r0+r1] + movu m1, [r1+r2+mmsize] + movu m2, [r1+r2] +%if mmsize == 16 + lea r4d, [r1+r2] test r4d, mmsize-1 - jnz .loop_unaligned + jz .loop_aligned %endif - jmp .loop_aligned + jmp .loop_unaligned -ALIGN 16 -.escape_check: - cmp word [r0+r1-2], 0 - jnz .no_escape +.add_escape_byte: mov byte [r0+r1], 3 - inc r0 - jmp .no_escape + inc r0 + or r3d, 0x0300 + jmp .escaped %endmacro INIT_MMX mmx2 NAL_ESCAPE INIT_XMM sse2 NAL_ESCAPE -INIT_XMM avx +%if ARCH_X86_64 +INIT_YMM avx2 NAL_ESCAPE +%endif
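nal_escape implements H.264 Annex B start-code emulation prevention: whenever two zero bytes would be followed by a byte <= 3, an 0x03 escape byte is inserted so the payload can never contain a 00 00 0x sequence that looks like a start code. The rewritten asm above vectorizes the common no-escape case; the transform itself reduces to this plain-C sketch (close in spirit to the C fallback in x264's bitstream code, not copied from it):

#include <stdint.h>

/* Copy [src,end) to dst, inserting an emulation-prevention 0x03 after
 * every pair of zero bytes that precedes a byte <= 0x03.
 * Returns the new end of dst. */
static uint8_t *nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
{
    if( src < end ) *dst++ = *src++;   /* the first two bytes can */
    if( src < end ) *dst++ = *src++;   /* never need an escape    */
    while( src < end )
    {
        if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
            *dst++ = 0x03;
        *dst++ = *src++;
    }
    return dst;
}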
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/cabac-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/cabac-a.asm
Changed
@@ -26,22 +26,69 @@ ;***************************************************************************** %include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +coeff_abs_level1_ctx: db 1, 2, 3, 4, 0, 0, 0, 0 +coeff_abs_levelgt1_ctx: db 5, 5, 5, 5, 6, 7, 8, 9 +coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7 + db 4, 4, 4, 4, 5, 6, 7, 7 + +%if ARCH_X86_64 +%macro COEFF_LAST_TABLE 17 + %define funccpu1 %1 + %define funccpu2 %2 + %define funccpu3 %3 + %rep 14 + %ifidn %4, 4 + dq mangle(x264_coeff_last%4_ %+ funccpu1) + %elifidn %4, 64 + dq mangle(x264_coeff_last%4_ %+ funccpu2) + %else + dq mangle(x264_coeff_last%4_ %+ funccpu3) + %endif + %rotate 1 + %endrep +%endmacro + +cextern coeff_last4_mmx2 +cextern coeff_last4_mmx2_lzcnt +cextern coeff_last15_sse2 +cextern coeff_last15_sse2_lzcnt +cextern coeff_last16_sse2 +cextern coeff_last16_sse2_lzcnt +cextern coeff_last64_sse2 +cextern coeff_last64_sse2_lzcnt +cextern coeff_last64_avx2_lzcnt + +%ifdef PIC +SECTION .data +%endif +coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +%endif SECTION .text cextern cabac_range_lps cextern cabac_transition cextern cabac_renorm_shift +cextern cabac_entropy +cextern cabac_size_unary +cextern cabac_transition_unary +cextern significant_coeff_flag_offset +cextern significant_coeff_flag_offset_8x8 +cextern last_coeff_flag_offset +cextern last_coeff_flag_offset_8x8 +cextern coeff_abs_level_m1_offset +cextern count_cat_m1 +cextern cabac_encode_ue_bypass -; t3 must be ecx, since it's used for shift. -%if WIN64 - DECLARE_REG_TMP 3,1,2,0,6,5,4,2 - %define pointer resq -%elif ARCH_X86_64 - DECLARE_REG_TMP 0,1,2,3,4,5,6,6 +%if ARCH_X86_64 %define pointer resq %else - DECLARE_REG_TMP 0,4,2,1,3,5,6,2 %define pointer resd %endif @@ -58,24 +105,34 @@ .state: resb 1024 endstruc -%macro LOAD_GLOBAL 4 +%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp %ifdef PIC - ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea - lea r7, [%2] - %ifnidn %3, 0 - add r7, %3 + %ifidn %4, 0 + movzx %1, byte [%2+%3+r7-$$] + %else + lea %5, [r7+%4] + movzx %1, byte [%2+%3+%5-$$] %endif - movzx %1, byte [r7+%4] %else movzx %1, byte [%2+%3+%4] %endif %endmacro -cglobal cabac_encode_decision_asm, 0,7 - movifnidn t0, r0mp +%macro CABAC 1 +; t3 must be ecx, since it's used for shift. 
+%if WIN64 + DECLARE_REG_TMP 3,1,2,0,5,6,4,4 +%elif ARCH_X86_64 + DECLARE_REG_TMP 0,1,2,3,4,5,6,6 +%else + DECLARE_REG_TMP 0,4,2,1,3,5,6,2 +%endif + +cglobal cabac_encode_decision_%1, 1,7 movifnidn t1d, r1m - mov t5d, [t0+cb.range] - movzx t6d, byte [t0+cb.state+t1] + mov t5d, [r0+cb.range] + movzx t6d, byte [r0+cb.state+t1] + movifnidn t0, r0 ; WIN64 mov t4d, ~1 mov t3d, t5d and t4d, t6d @@ -84,8 +141,11 @@ %if WIN64 PUSH r7 %endif - LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2 - LOAD_GLOBAL t4d, cabac_transition, t2, t6*2 +%ifdef PIC + lea r7, [$$] +%endif + LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4 + LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4 and t6d, 1 sub t3d, t5d cmp t6d, t2d @@ -96,66 +156,82 @@ mov [t0+cb.state+t1], t4b ;cabac_encode_renorm mov t4d, t3d +%ifidn %1, bmi2 + lzcnt t3d, t3d + sub t3d, 23 + shlx t4d, t4d, t3d + shlx t6d, t6d, t3d +%else shr t3d, 3 - LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3 + LOAD_GLOBAL t3d, cabac_renorm_shift, t3 + shl t4d, t3b + shl t6d, t3b +%endif %if WIN64 POP r7 %endif - shl t4d, t3b - shl t6d, t3b mov [t0+cb.range], t4d add t3d, [t0+cb.queue] - jge cabac_putbyte + jge cabac_putbyte_%1 .update_queue_low: mov [t0+cb.low], t6d mov [t0+cb.queue], t3d RET -cglobal cabac_encode_bypass_asm, 0,3 - movifnidn t0, r0mp - movifnidn t3d, r1m - mov t7d, [t0+cb.low] - and t3d, [t0+cb.range] - lea t7d, [t7*2+t3] - mov t3d, [t0+cb.queue] +cglobal cabac_encode_bypass_%1, 2,3 + mov t7d, [r0+cb.low] + and r1d, [r0+cb.range] + lea t7d, [t7*2+r1] + movifnidn t0, r0 ; WIN64 + mov t3d, [r0+cb.queue] inc t3d -%if UNIX64 ; .putbyte compiles to nothing but a jmp - jge cabac_putbyte +%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp + jge cabac_putbyte_%1 %else jge .putbyte %endif mov [t0+cb.low], t7d mov [t0+cb.queue], t3d RET +%if ARCH_X86_64 == 0 .putbyte: PROLOGUE 0,7 movifnidn t6d, t7d - jmp cabac_putbyte + jmp cabac_putbyte_%1 +%endif -cglobal cabac_encode_terminal_asm, 0,3 - movifnidn t0, r0mp
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/const-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/const-a.asm
Changed
@@ -26,39 +26,53 @@ %include "x86inc.asm" -SECTION_RODATA +SECTION_RODATA 32 + +const pb_1, times 32 db 1 +const hsub_mul, times 16 db 1, -1 +const pw_1, times 16 dw 1 +const pw_16, times 16 dw 16 +const pw_32, times 16 dw 32 +const pw_512, times 16 dw 512 +const pw_00ff, times 16 dw 0x00ff +const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1) +const pd_1, times 8 dd 1 +const deinterleave_shufd, dd 0,4,1,5,2,6,3,7 +const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 +const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 const pb_01, times 8 db 0,1 const pb_0, times 16 db 0 const pb_a1, times 16 db 0xa1 -const pb_1, times 16 db 1 const pb_3, times 16 db 3 -const hsub_mul, times 8 db 1, -1 const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 -const pw_1, times 8 dw 1 const pw_2, times 8 dw 2 const pw_m2, times 8 dw -2 const pw_4, times 8 dw 4 const pw_8, times 8 dw 8 -const pw_16, times 8 dw 16 -const pw_32, times 8 dw 32 const pw_64, times 8 dw 64 +const pw_256, times 8 dw 256 const pw_32_0, times 4 dw 32, times 4 dw 0 const pw_8000, times 8 dw 0x8000 const pw_3fff, times 8 dw 0x3fff -const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1) const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1 const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1 const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0 -const pd_1, times 4 dd 1 const pd_32, times 4 dd 32 const pd_1024, times 4 dd 1024 const pd_ffff, times 4 dd 0xffff -const pw_00ff, times 8 dw 0x00ff const pw_ff00, times 8 dw 0xff00 +const popcnt_table +%assign x 0 +%rep 256 +; population count +db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1) +%assign x x+1 +%endrep + const sw_64, dd 64
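The new popcnt_table is a 256-entry byte table whose entry x is the number of set bits in x; the %rep loop computes each entry at assembly time with the shift-and-mask expression in the db line. The same table built in C, for reference:

#include <stdint.h>

static uint8_t popcnt_table[256];

static void init_popcnt_table( void )
{
    for( int x = 0; x < 256; x++ )
    {
        uint8_t n = 0;
        for( int b = 0; b < 8; b++ )   /* same ((x>>b)&1) terms as the db line */
            n += (x >> b) & 1;
        popcnt_table[x] = n;
    }
}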
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/cpu-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/cpu-a.asm
Changed
@@ -66,7 +66,27 @@ mov [r4], edx RET -%if ARCH_X86_64 == 0 +%if ARCH_X86_64 + +;----------------------------------------------------------------------------- +; void stack_align( void (*func)(void*), void *arg ); +;----------------------------------------------------------------------------- +cglobal stack_align + push rbp + mov rbp, rsp +%if WIN64 + sub rsp, 32 ; shadow space +%endif + and rsp, ~31 + mov rax, r0 + mov r0, r1 + mov r1, r2 + mov r2, r3 + call rax + leave + ret + +%else ;----------------------------------------------------------------------------- ; int cpu_cpuid_test( void ) @@ -94,14 +114,11 @@ popfd ret -;----------------------------------------------------------------------------- -; void stack_align( void (*func)(void*), void *arg ); -;----------------------------------------------------------------------------- cglobal stack_align push ebp mov ebp, esp sub esp, 12 - and esp, ~15 + and esp, ~31 mov ecx, [ebp+8] mov edx, [ebp+12] mov [esp], edx @@ -165,7 +182,10 @@ %endif push rbp mov rbp, rsp - and rsp, ~15 +%if WIN64 + sub rsp, 32 ; shadow space +%endif + and rsp, ~31 call intel_cpu_indicator_init leave %if ARCH_X86_64
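Two ABI details drive this hunk: the alignment mask grows from ~15 to ~31 so that 256-bit YMM spills in the called function can use aligned loads and stores, and on Win64 the caller must reserve 32 bytes of shadow space above the return address before a call. The masking itself is ordinary round-down arithmetic, shown here as a small C check:

#include <assert.h>
#include <stdint.h>

int main( void )
{
    char probe;
    uintptr_t sp = (uintptr_t)&probe;        /* stand-in for the incoming rsp */
    uintptr_t aligned = sp & ~(uintptr_t)31; /* the asm's "and rsp, ~31"      */
    assert( aligned % 32 == 0 && sp - aligned < 32 );
    return 0;
}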
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/dct-64.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/dct-64.asm
Changed
@@ -311,6 +311,42 @@ INIT_XMM xop DCT_SUB8 +INIT_YMM avx2 +cglobal sub16x16_dct8, 3,3,10 + add r0, 128 + add r2, 4*FDEC_STRIDE + call .sub16x8_dct8 + add r0, 256 + add r1, FENC_STRIDE*8 + add r2, FDEC_STRIDE*8 + call .sub16x8_dct8 + RET +.sub16x8_dct8: + LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1 + LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3 + LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5 + LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7 + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 + TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 + mova [r0-0x80+0x00], xm0 + vextracti128 [r0+0x00], m0, 1 + mova [r0-0x80+0x10], xm1 + vextracti128 [r0+0x10], m1, 1 + mova [r0-0x80+0x20], xm2 + vextracti128 [r0+0x20], m2, 1 + mova [r0-0x80+0x30], xm3 + vextracti128 [r0+0x30], m3, 1 + mova [r0-0x80+0x40], xm4 + vextracti128 [r0+0x40], m4, 1 + mova [r0-0x80+0x50], xm5 + vextracti128 [r0+0x50], m5, 1 + mova [r0-0x80+0x60], xm6 + vextracti128 [r0+0x60], m6, 1 + mova [r0-0x80+0x70], xm7 + vextracti128 [r0+0x70], m7, 1 + ret + ;----------------------------------------------------------------------------- ; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- @@ -390,4 +426,5 @@ ADD8x8 INIT_XMM avx ADD8x8 + %endif ; !HIGH_BIT_DEPTH
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/dct-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/dct-a.asm
Changed
@@ -30,7 +30,7 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15 @@ -39,8 +39,6 @@ pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7 pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9 pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15 -pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 -pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 pb_scan8framet1: SHUFFLE_MASK_W 0, 1, 6, 7, 8, 9, 13, 14 pb_scan8framet2: SHUFFLE_MASK_W 2 , 3, 4, 7, 9, 15, 10, 14 @@ -74,6 +72,7 @@ cextern pw_32_0 cextern pw_32 +cextern pw_512 cextern pw_8000 cextern pw_pixel_max cextern hsub_mul @@ -83,6 +82,9 @@ cextern pd_32 cextern pw_ppppmmmm cextern pw_pmpmpmpm +cextern deinterleave_shufd +cextern pb_unpackbd1 +cextern pb_unpackbd2 %macro WALSH4_1D 6 SUMSUB_BADC %1, %5, %4, %3, %2, %6 @@ -377,6 +379,135 @@ ADD4x4 INIT_XMM avx ADD4x4 + +%macro STOREx2_AVX2 9 + movq xm%3, [r0+%5*FDEC_STRIDE] + vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1 + movq xm%4, [r0+%7*FDEC_STRIDE] + vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1 + punpcklbw m%3, m%9 + punpcklbw m%4, m%9 + psraw m%1, 6 + psraw m%2, 6 + paddsw m%1, m%3 + paddsw m%2, m%4 + packuswb m%1, m%2 + vextracti128 xm%2, m%1, 1 + movq [r0+%5*FDEC_STRIDE], xm%1 + movq [r0+%6*FDEC_STRIDE], xm%2 + movhps [r0+%7*FDEC_STRIDE], xm%1 + movhps [r0+%8*FDEC_STRIDE], xm%2 +%endmacro + +INIT_YMM avx2 +cglobal add8x8_idct, 2,3,8 + add r0, 4*FDEC_STRIDE + pxor m7, m7 + TAIL_CALL .skip_prologue, 0 +global current_function %+ .skip_prologue +.skip_prologue: + ; TRANSPOSE4x4Q + mova xm0, [r1+ 0] + mova xm1, [r1+32] + mova xm2, [r1+16] + mova xm3, [r1+48] + vinserti128 m0, m0, [r1+ 64], 1 + vinserti128 m1, m1, [r1+ 96], 1 + vinserti128 m2, m2, [r1+ 80], 1 + vinserti128 m3, m3, [r1+112], 1 + SBUTTERFLY qdq, 0, 1, 4 + SBUTTERFLY qdq, 2, 3, 4 + IDCT4_1D w,0,1,2,3,4,5 + TRANSPOSE2x4x4W 0,1,2,3,4 + paddw m0, [pw_32] + IDCT4_1D w,0,1,2,3,4,5 + STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7 + STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7 + ret + +; 2xdst, 2xtmp, 4xsrcrow, 1xzero +%macro LOAD_DIFF8x2_AVX2 9 + movq xm%1, [r1+%5*FENC_STRIDE] + movq xm%2, [r1+%6*FENC_STRIDE] + vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1 + vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1 + punpcklbw m%1, m%9 + punpcklbw m%2, m%9 + movq xm%3, [r2+(%5-4)*FDEC_STRIDE] + movq xm%4, [r2+(%6-4)*FDEC_STRIDE] + vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1 + vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1 + punpcklbw m%3, m%9 + punpcklbw m%4, m%9 + psubw m%1, m%3 + psubw m%2, m%4 +%endmacro + +; 4x src, 1x tmp +%macro STORE8_DCT_AVX2 5 + SBUTTERFLY qdq, %1, %2, %5 + SBUTTERFLY qdq, %3, %4, %5 + mova [r0+ 0], xm%1 + mova [r0+ 16], xm%3 + mova [r0+ 32], xm%2 + mova [r0+ 48], xm%4 + vextracti128 [r0+ 64], m%1, 1 + vextracti128 [r0+ 80], m%3, 1 + vextracti128 [r0+ 96], m%2, 1 + vextracti128 [r0+112], m%4, 1 +%endmacro + +%macro STORE16_DCT_AVX2 5 + SBUTTERFLY qdq, %1, %2, %5 + SBUTTERFLY qdq, %3, %4, %5 + mova [r0+ 0-128], xm%1 + mova [r0+16-128], xm%3 + mova [r0+32-128], xm%2 + mova [r0+48-128], xm%4 + vextracti128 [r0+ 0], m%1, 1 + vextracti128 [r0+16], m%3, 1 + vextracti128 [r0+32], m%2, 1 + vextracti128 [r0+48], m%4, 1 +%endmacro + +INIT_YMM avx2 +cglobal sub8x8_dct, 3,3,7 + pxor m6, m6 + add r2, 4*FDEC_STRIDE + LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6 + LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6 + DCT4_1D 0, 1, 2, 3, 4 + TRANSPOSE2x4x4W 0, 1, 2, 3, 4 + 
DCT4_1D 0, 1, 2, 3, 4 + STORE8_DCT_AVX2 0, 1, 2, 3, 4 + RET + +INIT_YMM avx2 +cglobal sub16x16_dct, 3,3,6 + add r0, 128 + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + add r0, 64 + add r1, 4*FENC_STRIDE + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + add r0, 256-64 + add r1, 4*FENC_STRIDE + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + add r0, 64 + add r1, 4*FENC_STRIDE + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + RET +.sub16x4_dct: + LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1 + LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3 + DCT4_1D 0, 1, 2, 3, 4 + TRANSPOSE2x4x4W 0, 1, 2, 3, 4 + DCT4_1D 0, 1, 2, 3, 4 + STORE16_DCT_AVX2 0, 1, 2, 3, 4 + ret %endif ; HIGH_BIT_DEPTH INIT_MMX @@ -422,7 +553,7 @@ cglobal %1, 2,2,11 pxor m7, m7 %endif -%if mmsize==16 && %3!=256 +%if mmsize>=16 && %3!=256 add r0, 4*FDEC_STRIDE %endif .skip_prologue: @@ -497,6 +628,9 @@ SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11 SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11 SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11 + +INIT_YMM +ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0 %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH @@ -607,10 +741,9 @@ movh m0, [r1] pxor m1, m1 add r0, FDEC_STRIDE*4 - paddw m0, [pw_32] - psraw m0, 6 + pmulhrsw m0, [pw_512] psubw m1, m0 - mova m5, [pb_idctdc_unpack]
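Note the idct_dc rounding change at the end of this diff: paddw with pw_32 followed by psraw 6 becomes a single pmulhrsw against the new pw_512 constant. pmulhrsw computes (a*b*2 + 0x8000) >> 16 per 16-bit lane, and with b = 512 that is exactly (a + 32) >> 6, so the substitution is bit-exact. A scalar check, assuming arithmetic right shift of negative ints (true on the targets x264 supports):

#include <assert.h>
#include <stdint.h>

/* One lane of pmulhrsw: round(a*b / 32768) in fixed point. */
static int16_t pmulhrsw_lane( int16_t a, int16_t b )
{
    return (int16_t)( ((int32_t)a * b * 2 + 0x8000) >> 16 );
}

int main( void )
{
    for( int a = -32768; a <= 32767; a++ )
        assert( pmulhrsw_lane( (int16_t)a, 512 ) == (int16_t)((a + 32) >> 6) );
    return 0;
}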
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/dct.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/dct.h
Changed
@@ -40,6 +40,8 @@ void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); @@ -56,14 +58,17 @@ void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] ); void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] ); void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] ); +void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] ); void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] ); void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] ); +void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] ); void x264_add8x8_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [ 4] ); void x264_add16x16_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [16] ); void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] ); void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] ); void x264_add8x8_idct_dc_avx ( pixel *p_dst, dctcoef dct [ 4] ); void x264_add16x16_idct_dc_avx ( pixel *p_dst, dctcoef dct [16] ); +void x264_add16x16_idct_dc_avx2 ( uint8_t *p_dst, int16_t dct [16] ); void x264_dct4x4dc_mmx ( int16_t d[16] ); void x264_dct4x4dc_sse2 ( int32_t d[16] ); @@ -82,6 +87,7 @@ void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 ); void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 ); void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); +void x264_sub16x16_dct8_avx2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] ); @@ -118,5 +124,6 @@ void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz ); void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz ); void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz ); #endif
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/deblock-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/deblock-a.asm
Changed
@@ -28,8 +28,10 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 +load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15 +insert_top_shuf: dd 0,1,4,5,7,2,3,6 transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15 SECTION .text @@ -42,6 +44,7 @@ cextern pw_4 cextern pw_00ff cextern pw_pixel_max +cextern pb_unpackbd1 %if HIGH_BIT_DEPTH ; out: %4 = |%1-%2|-%3 @@ -162,14 +165,12 @@ ;----------------------------------------------------------------------------- ; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_luma, 5,5,8 - %assign pad 5*mmsize+12-(stack_offset&15) +cglobal deblock_v_luma, 5,5,8,0-5*mmsize %define tcm [rsp] %define ms1 [rsp+mmsize] %define ms2 [rsp+mmsize*2] %define am [rsp+mmsize*3] %define bm [rsp+mmsize*4] - SUB rsp, pad add r1, r1 LOAD_AB m4, m5, r2d, r3d mov r3, 32/mmsize @@ -213,11 +214,9 @@ add r4, mmsize/8 dec r3 jg .loop - ADD rsp, pad RET -cglobal deblock_h_luma, 5,6,8 - %assign pad 7*mmsize+12-(stack_offset&15) +cglobal deblock_h_luma, 5,6,8,0-7*mmsize %define tcm [rsp] %define ms1 [rsp+mmsize] %define ms2 [rsp+mmsize*2] @@ -225,7 +224,6 @@ %define p2m [rsp+mmsize*4] %define am [rsp+mmsize*5] %define bm [rsp+mmsize*6] - SUB rsp, pad add r1, r1 LOAD_AB m4, m5, r2d, r3d mov r3, r1 @@ -302,7 +300,6 @@ lea r2, [r2+r1*(mmsize/2)] dec r5 jg .loop - ADD rsp, pad RET %endmacro @@ -485,7 +482,6 @@ %endmacro %macro LUMA_INTRA_INIT 1 - %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) %define t0 m4 %define t1 m5 %define t2 m6 @@ -495,7 +491,6 @@ CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] %assign i i+1 %endrep - SUB rsp, pad add r1, r1 %endmacro @@ -724,7 +719,7 @@ ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_v_luma_intra, 4,7,8 +cglobal deblock_v_luma_intra, 4,7,8,0-3*mmsize LUMA_INTRA_INIT 3 lea r4, [r1*4] lea r5, [r1*3] @@ -744,13 +739,12 @@ add r4, mmsize dec r6 jg .loop - ADD rsp, pad RET ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra, 4,7,8 +cglobal deblock_h_luma_intra, 4,7,8,0-8*mmsize LUMA_INTRA_INIT 8 %if mmsize == 8 lea r4, [r1*3] @@ -785,7 +779,6 @@ dec r6 %endif jg .loop - ADD rsp, pad RET %endmacro @@ -871,6 +864,19 @@ movh %8, m4 %endmacro +; in: 8 rows of 4 bytes in %9..%10 +; out: 8 rows of 4 bytes in %1..%8 +%macro STORE_8x4B 10 + movd %1, %9 + pextrd %2, %9, 1 + pextrd %3, %9, 2 + pextrd %4, %9, 3 + movd %5, %10 + pextrd %6, %10, 1 + pextrd %7, %10, 2 + pextrd %8, %10, 3 +%endmacro + %macro TRANSPOSE4x8B_LOAD 8 TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 %endmacro @@ -925,6 +931,45 @@ ; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] %macro TRANSPOSE6x8_MEM 9 RESET_MM_PERMUTATION +%if cpuflag(avx) + ; input: + ; _ABCDEF_ + ; _GHIJKL_ + ; _MNOPQR_ + ; _STUVWX_ + ; _YZabcd_ + ; _efghij_ + ; _klmnop_ + ; _qrstuv_ + + movh m0, %1 + movh m2, %2 + movh m1, %3 + movh m3, %4 + punpcklbw m0, m2 ; __ AG BH CI DJ EK FL __ + punpcklbw m1, m3 ; __ MS NT OU PV QW RX __ + movh m2, %5 + movh m3, %6 + punpcklbw m2, m3 ; __ Ye Zf ag bh ci dj __ + movh m3, %7 + movh m4, %8 + punpcklbw m3, m4 ; __ kq lr ms nt ou pv __ + + SBUTTERFLY wd, 0, 1, 4 ; __ __ AG MS BH NT CI OU + ; DJ PV EK QW FL RX __ __ + SBUTTERFLY wd, 2, 3, 4 ; __ __ Ye kq Zf lr ag ms + ; bh nt ci ou dj pv __ __ + SBUTTERFLY dq, 0, 2, 4 ; __ __ __ __ AG MS Ye kq + ; BH NT Zf lr CI FL OU RX + SBUTTERFLY dq, 1, 3, 4 ; DJ PV bh nt EK QW Zf lr + ; FL RX dj pv __ __ __ __ + movhps [%9+0x00], m0 + movh [%9+0x10], m2 + movhps [%9+0x20], m2 + movh [%9+0x30], m1 + movhps [%9+0x40], m1 + movh [%9+0x50], m3 +%else movq m0, %1 movq m1, %2 movq m2, %3 @@ -951,13 +996,41 @@ movq [%9+0x30], m1 movq [%9+0x40], m5 movq [%9+0x50], m3 +%endif RESET_MM_PERMUTATION %endmacro + ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 RESET_MM_PERMUTATION +%if cpuflag(avx) + movh m0, %1 + movh m4, %2 + movh m1, %3 + movh m5, %4 + movh m2, %5 + movh m3, %7 + punpcklbw m0, m4
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/mc-a.asm
Changed
@@ -34,7 +34,7 @@ SECTION_RODATA 32 -ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 ch_shuf_adj: times 8 db 0 times 8 db 2 times 8 db 4 @@ -49,10 +49,12 @@ cextern pw_8 cextern pw_32 cextern pw_64 +cextern pw_512 cextern pw_00ff cextern pw_pixel_max cextern sw_64 cextern pd_32 +cextern deinterleave_shufd ;============================================================================= ; implicit weighted biprediction @@ -141,8 +143,7 @@ movh m1, %2 punpcklbw m0, m1 pmaddubsw m0, m3 - paddw m0, m4 - psraw m0, 6 + pmulhrsw m0, m4 %endmacro %macro BIWEIGHT_START_SSSE3 0 @@ -151,9 +152,13 @@ sub t7d, t6d shl t7d, 8 add t6d, t7d - movd m3, t6d - mova m4, [pw_32] + mova m4, [pw_512] + movd xm3, t6d +%if cpuflag(avx2) + vpbroadcastw m3, xm3 +%else SPLATW m3, m3 ; weight_dst,src +%endif %endmacro %if HIGH_BIT_DEPTH @@ -244,6 +249,25 @@ INIT_XMM ssse3 AVG_WEIGHT 8, 7 AVG_WEIGHT 16, 7 + +INIT_YMM avx2 +cglobal pixel_avg_weight_w16 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu xm0, [t2] + movu xm1, [t4] + vinserti128 m0, m0, [t2+t3], 1 + vinserti128 m1, m1, [t4+t5], 1 + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], xm0 + vextracti128 [t0+t1], m0, 1 + AVG_END %endif ;HIGH_BIT_DEPTH ;============================================================================= @@ -274,7 +298,7 @@ %endmacro ; src, dst, width -%macro WEIGHT_TWO_ROW 3 +%macro WEIGHT_TWO_ROW 4 %assign x 0 %rep (%3+mmsize/2-1)/(mmsize/2) %if %3-x/2 <= 4 && mmsize == 16 @@ -298,16 +322,21 @@ %else ; !HIGH_BIT_DEPTH %macro WEIGHT_START 1 +%if cpuflag(avx2) + vbroadcasti128 m3, [r4] + vbroadcasti128 m4, [r4+16] +%else mova m3, [r4] mova m4, [r4+16] %if notcpuflag(ssse3) movd m5, [r4+32] %endif +%endif pxor m2, m2 %endmacro -; src1, src2, dst1, dst2 -%macro WEIGHT_ROWx2 4 +; src1, src2, dst1, dst2, fast +%macro WEIGHT_ROWx2 5 movh m0, [%1 ] movh m1, [%1+mmsize/2] movh m6, [%2 ] @@ -317,10 +346,12 @@ punpcklbw m6, m2 punpcklbw m7, m2 %if cpuflag(ssse3) +%if %5==0 psllw m0, 7 psllw m1, 7 psllw m6, 7 psllw m7, 7 +%endif pmulhrsw m0, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 @@ -349,15 +380,54 @@ mova [%4], m6 %endmacro -; src1, src2, dst1, dst2, width -%macro WEIGHT_COL 5 +; src1, src2, dst1, dst2, width, fast +%macro WEIGHT_COL 6 +%if cpuflag(avx2) +%if %5==16 + movu xm0, [%1] + vinserti128 m0, m0, [%2], 1 + punpckhbw m1, m0, m2 + punpcklbw m0, m0, m2 +%if %6==0 + psllw m0, 7 + psllw m1, 7 +%endif + pmulhrsw m0, m3 + pmulhrsw m1, m3 + paddw m0, m4 + paddw m1, m4 + packuswb m0, m1 + mova [%3], xm0 + vextracti128 [%4], m0, 1 +%else + movq xm0, [%1] + vinserti128 m0, m0, [%2], 1 + punpcklbw m0, m2 +%if %6==0 + psllw m0, 7 +%endif + pmulhrsw m0, m3 + paddw m0, m4 + packuswb m0, m0 + vextracti128 xm1, m0, 1 +%if %5 == 8 + movq [%3], xm0 + movq [%4], xm1 +%else + movd [%3], xm0 + movd [%4], xm1 +%endif +%endif +%else movh m0, [%1] movh m1, [%2] punpcklbw m0, m2 punpcklbw m1, m2 %if cpuflag(ssse3) +%if %6==0 psllw m0, 7 psllw m1, 7 +%endif pmulhrsw m0, m3 pmulhrsw m1, m3 paddw m0, m4 @@ -380,18 +450,22 @@ movd [%3], m0 ; width 2 can write garbage for the last 2 bytes movd [%4], m1 %endif +%endif %endmacro - ; src, dst, width -%macro WEIGHT_TWO_ROW 3 +%macro WEIGHT_TWO_ROW 4 %assign x 0 %rep %3 %if (%3-x) >= mmsize - WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x + WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4 %assign x (x+mmsize) %else - WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, %3-x - %exitrep + %assign w %3-x +%if w == 20 + 
%assign w 16 +%endif
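Both the biweight and the explicit-weight paths in this file are fixed-point versions of H.264 weighted prediction; the ssse3/avx2 code pre-shifts by 7 (or skips the psllw entirely when the new fifth "fast" argument is set) so pmulhrsw performs the multiply, round, and downshift in one op. The operation being vectorized, as a scalar sketch of the spec formula rather than the asm's exact fixed-point staging:

#include <stdint.h>

static uint8_t clip_u8( int x ) { return x < 0 ? 0 : x > 255 ? 255 : x; }

/* Explicit weighted prediction:
 * out = clip( ((in*scale + 2^(denom-1)) >> denom) + offset ) */
static uint8_t weight_px( uint8_t in, int scale, int denom, int offset )
{
    int v = denom ? (( in * scale + (1 << (denom - 1)) ) >> denom) + offset
                  : in * scale + offset;
    return clip_u8( v );
}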
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-a2.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/mc-a2.asm
Changed
@@ -30,13 +30,14 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 + +filt_mul20: times 32 db 20 +filt_mul15: times 16 db 1, -5 +filt_mul51: times 16 db -5, 1 +hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 +deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 -filt_mul20: times 16 db 20 -filt_mul15: times 8 db 1, -5 -filt_mul51: times 8 db -5, 1 -hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 -deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 %if HIGH_BIT_DEPTH deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 @@ -44,6 +45,7 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 %endif +pw_1024: times 16 dw 1024 pd_16: times 4 dd 16 pd_0f: times 4 dd 0xffff @@ -64,6 +66,7 @@ cextern pw_1 cextern pw_16 cextern pw_32 +cextern pw_512 cextern pw_00ff cextern pw_3fff cextern pw_pixel_max @@ -127,19 +130,24 @@ paddw %4, %6 %endmacro -%macro FILT_PACK 4-6 b - paddw %1, %4 - paddw %2, %4 -%if %0 == 6 - psubusw %1, %6 - psubusw %2, %6 - psrlw %1, %3 - psrlw %2, %3 +%macro FILT_PACK 3-5 +%if cpuflag(ssse3) + pmulhrsw %1, %3 + pmulhrsw %2, %3 +%else + paddw %1, %3 + paddw %2, %3 +%if %0 == 5 + psubusw %1, %5 + psubusw %2, %5 + psrlw %1, %4 + psrlw %2, %4 %else - psraw %1, %3 - psraw %2, %3 + psraw %1, %4 + psraw %2, %4 %endif -%ifnidn w, %5 +%endif +%if HIGH_BIT_DEPTH == 0 packuswb %1, %2 %endif %endmacro @@ -203,7 +211,7 @@ mova [r2+r4+mmsize], m4 paddw m1, s30 paddw m4, s30 - FILT_PACK m1, m4, 5, m6, w, s10 + FILT_PACK m1, m4, m6, 5, s10 CLIPW m1, m0, m7 CLIPW m4, m0, m7 mova [r0+r4], m1 @@ -295,7 +303,7 @@ FILT_H2 m1, m2, m3, m4, m5, m6 mova m7, [pw_1] pxor m2, m2 - FILT_PACK m1, m4, 1, m7, w + FILT_PACK m1, m4, m7, 1 CLIPW m1, m2, m0 CLIPW m4, m2, m0 mova [r0+r2], m1 @@ -349,17 +357,25 @@ paddw m4, m5 paddw m1, m3 paddw m4, m6 + mova m7, [pw_1024] %else LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1 LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1 LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0 LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1 FILT_V2 m1, m2, m3, m4, m5, m6 + mova m7, [pw_16] %endif - mova m7, [pw_16] +%if mmsize==32 + mova [r2+r4*2], xm1 + mova [r2+r4*2+mmsize/2], xm4 + vextracti128 [r2+r4*2+mmsize], m1, 1 + vextracti128 [r2+r4*2+mmsize*3/2], m4, 1 +%else mova [r2+r4*2], m1 mova [r2+r4*2+mmsize], m4 - FILT_PACK m1, m4, 5, m7 +%endif + FILT_PACK m1, m4, m7, 5 movnta [r0+r4], m1 add r1, mmsize add r5, mmsize @@ -371,8 +387,8 @@ ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- -INIT_MMX -cglobal hpel_filter_c_mmx2, 3,3 +INIT_MMX mmx2 +cglobal hpel_filter_c, 3,3 add r0, r2 lea r1, [r1+r2*2] neg r2 @@ -392,7 +408,7 @@ paddw m5, [src+12] ; b1 paddw m6, [src+10] ; c1 FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 6, m7 + FILT_PACK m1, m4, m7, 6 movntq [r0+r2], m1 add r2, 8 jl .loop @@ -401,7 +417,8 @@ ;----------------------------------------------------------------------------- ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- -cglobal hpel_filter_h_mmx2, 3,3 +INIT_MMX mmx2 +cglobal hpel_filter_h, 3,3 add r0, r2 add r1, r2 neg r2 @@ -436,14 +453,12 @@ paddw m6, m7 ; a1 movq m7, 
[pw_1] FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 1, m7 + FILT_PACK m1, m4, m7, 1 movntq [r0+r2], m1 add r2, 8 jl .loop RET -INIT_XMM - %macro HPEL_C 0 ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); @@ -454,29 +469,33 @@ neg r2 %define src r1+r2*2 %ifnidn cpuname, sse2 +%if cpuflag(ssse3) + mova m7, [pw_512] +%else mova m7, [pw_32] - %define tpw_32 m7 +%endif + %define pw_rnd m7 %elif ARCH_X86_64 mova m8, [pw_32] - %define tpw_32 m8 + %define pw_rnd m8 %else - %define tpw_32 [pw_32] + %define pw_rnd [pw_32] %endif ; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer... -%if cpuflag(misalign) +%if cpuflag(misalign) || mmsize==32 .loop: movu m4, [src-4] movu m5, [src-2] - mova m6, [src] - movu m3, [src+12] - movu m2, [src+14] - mova m1, [src+16] + mova m6, [src+0] + movu m3, [src-4+mmsize] + movu m2, [src-2+mmsize] + mova m1, [src+0+mmsize] paddw m4, [src+6] paddw m5, [src+4]
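The hpel_filter_* kernels implement the 6-tap [1,-5,20,20,-5,1] half-pel interpolation filter from the H.264 spec; the FILT_PACK rework above passes the rounding constant in a register and lets ssse3 and later replace the add-and-shift rounding with pmulhrsw here as well. The underlying per-sample filter, sketched for the 8-bit horizontal case:

#include <stdint.h>

static uint8_t clip_u8( int x ) { return x < 0 ? 0 : x > 255 ? 255 : x; }

/* p points between the 3rd and 4th of six neighboring samples:
 * taps p[-2..3], weights 1,-5,20,20,-5,1, rounded and clipped. */
static uint8_t hpel_tap6( const uint8_t *p )
{
    int v = p[-2] - 5*p[-1] + 20*p[0] + 20*p[1] - 5*p[2] + p[3];
    return clip_u8( (v + 16) >> 5 );
}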
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-c.c -> x264-snapshot-20130723-2245.tar.bz2/common/x86/mc-c.c
Changed
@@ -35,7 +35,8 @@ #define DECL_SUF( func, args )\ void func##_mmx2 args;\ void func##_sse2 args;\ - void func##_ssse3 args; + void func##_ssse3 args;\ + void func##_avx2 args; DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) @@ -72,15 +73,20 @@ MC_WEIGHT( 12, ssse3 ) MC_WEIGHT( 16, ssse3 ) MC_WEIGHT( 20, ssse3 ) +MC_WEIGHT( 8, avx2 ) +MC_WEIGHT( 16, avx2 ) +MC_WEIGHT( 20, avx2 ) #undef MC_OFFSET #undef MC_WEIGHT -void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w16_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w16_sse2( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w16_aligned_sse2( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w8_sse ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_mmx( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_sse( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_aligned_sse( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int ); +void x264_mc_copy_w16_aligned_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int ); void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); @@ -121,18 +127,23 @@ void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); -void *x264_memcpy_aligned_mmx ( void *dst, const void *src, size_t n ); -void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n ); -void x264_memzero_aligned_mmx ( void *dst, size_t n ); -void x264_memzero_aligned_sse2( void *dst, size_t n ); +void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n ); +void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n ); +void x264_memzero_aligned_mmx( void *dst, size_t n ); +void x264_memzero_aligned_sse( void *dst, size_t n ); +void x264_memzero_aligned_avx( void *dst, size_t n ); void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init8h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init4v_mmx ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); +void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride ); void 
x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride ); +void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride ); void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, @@ -151,7 +162,7 @@ MC_CHROMA(ssse3) MC_CHROMA(ssse3_cache64) MC_CHROMA(avx) -MC_CHROMA(avx_cache64) +MC_CHROMA(avx2) #define LOWRES(cpu)\ void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\ @@ -162,6 +173,7 @@ LOWRES(ssse3) LOWRES(avx) LOWRES(xop) +LOWRES(avx2) #define PIXEL_AVG_W(width,cpu)\ void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ); @@ -176,6 +188,7 @@ PIXEL_AVG_WALL(sse2) PIXEL_AVG_WALL(sse2_misalign) PIXEL_AVG_WALL(cache64_ssse3) +PIXEL_AVG_WALL(avx2) #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\ static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\ @@ -194,6 +207,8 @@ #define x264_pixel_avg2_w20_mmx2 x264_pixel_avg2_w18_mmx2 #define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w10_sse2 #define x264_pixel_avg2_w20_sse2 x264_pixel_avg2_w18_sse2 +#define x264_pixel_avg2_w12_avx2 x264_pixel_avg2_w16_avx2 +#define x264_pixel_avg2_w20_avx2 x264_pixel_avg2_w18_avx2 #else /* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */ #define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3 @@ -205,6 +220,7 @@ PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2) #if HIGH_BIT_DEPTH PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2) +PIXEL_AVG_WTAB(avx2, mmx2, sse2, avx2, avx2, avx2) #else // !HIGH_BIT_DEPTH #if ARCH_X86 PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2) @@ -214,6 +230,8 @@ PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign) PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2) PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2) +PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2) +PIXEL_AVG_WTAB(avx2, mmx2, mmx2, sse2, sse2, avx2) #endif // HIGH_BIT_DEPTH #define MC_COPY_WTAB(instr, name1, name2, name3)\ @@ -228,9 +246,10 @@ MC_COPY_WTAB(mmx,mmx,mmx,mmx) #if HIGH_BIT_DEPTH -MC_COPY_WTAB(sse2,mmx,sse2,sse2) +MC_COPY_WTAB(sse,mmx,sse,sse) +MC_COPY_WTAB(avx,mmx,sse,avx) #else -MC_COPY_WTAB(sse2,mmx,mmx,sse2) +MC_COPY_WTAB(sse,mmx,mmx,sse) #endif #define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\ @@ -282,6 +301,7 @@ MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16) MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16) MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16) +MC_WEIGHT_WTAB(weight,avx2,ssse3,avx2,16) static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w ) { @@ -357,14 +377,17 @@ } MC_LUMA(mmx2,mmx2,mmx) -MC_LUMA(sse2,sse2,sse2) -#if !HIGH_BIT_DEPTH +MC_LUMA(sse2,sse2,sse) +#if HIGH_BIT_DEPTH +MC_LUMA(avx2,avx2,avx) +#else #if ARCH_X86 MC_LUMA(cache32_mmx2,cache32_mmx2,mmx) MC_LUMA(cache64_mmx2,cache64_mmx2,mmx) #endif -MC_LUMA(cache64_sse2,cache64_sse2,sse2) -MC_LUMA(cache64_ssse3,cache64_ssse3,sse2) +MC_LUMA(cache64_sse2,cache64_sse2,sse) +MC_LUMA(cache64_ssse3,cache64_ssse3,sse) +MC_LUMA(cache64_ssse3_atom,cache64_ssse3_atom,sse) #endif // !HIGH_BIT_DEPTH #define GET_REF(name)\ @@ -400,6 +423,7 @@ 
GET_REF(mmx2) GET_REF(sse2) +GET_REF(avx2) #if !HIGH_BIT_DEPTH #if ARCH_X86 GET_REF(cache32_mmx2) @@ -408,6 +432,7 @@ GET_REF(sse2_misalign) GET_REF(cache64_sse2) GET_REF(cache64_ssse3) +GET_REF(cache64_ssse3_atom) #endif // !HIGH_BIT_DEPTH #define HPEL(align, cpu, cpuv, cpuc, cpuh)\ @@ -425,8 +450,8 @@ width += realign;\ while( height-- )\ {\ - x264_hpel_filter_v_##cpuv( dstv, src, buf+8, stride, width );\ - x264_hpel_filter_c_##cpuc( dstc, buf+8, width );\ + x264_hpel_filter_v_##cpuv( dstv, src, buf+16, stride, width );\ + x264_hpel_filter_c_##cpuc( dstc, buf+16, width );\ x264_hpel_filter_h_##cpuh( dsth, src, width );\ dsth += stride;\ dstv += stride;\ @@ -445,10 +470,12 @@ void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); void x264_hpel_filter_avx ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); +void x264_hpel_filter_avx2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); #else HPEL(16, sse2, sse2, sse2, sse2) HPEL(16, ssse3, ssse3, ssse3, ssse3) HPEL(16, avx, avx, avx, avx) +HPEL(32, avx2, avx2, avx2, avx2) #endif HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2) #endif // HIGH_BIT_DEPTH @@ -545,6 +572,12 @@
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/pixel-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/pixel-a.asm
Changed
@@ -32,8 +32,17 @@ %include "x86util.asm" SECTION_RODATA 32 +hmul_16p: times 16 db 1 + times 8 db 1, -1 +hmul_8p: times 8 db 1 + times 4 db 1, -1 + times 8 db 1 + times 4 db 1, -1 mask_ff: times 16 db 0xff times 16 db 0 +mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1 +mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1 +mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1 %if BIT_DEPTH == 10 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63 @@ -46,12 +55,7 @@ ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 %endif -mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1 -mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1 -mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1 hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 -hmul_8p: times 8 db 1 - times 4 db 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 pb_pppm: times 4 db 1,1,1,-1 @@ -85,6 +89,7 @@ intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1 intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0 +ALIGN 32 intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5 intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4 intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1 @@ -120,9 +125,29 @@ transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 sw_f0: dq 0xfff0, 0 -sq_0f: dq 0xffffffff, 0 pd_f0: times 4 dd 0xffff0000 +pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7 + +ads_mvs_shuffle: +%macro ADS_MVS_SHUFFLE 8 + %assign y x + %rep 8 + %rep 7 + %rotate (~y)&1 + %assign y y>>((~y)&1) + %endrep + db %1*2, %1*2+1 + %rotate 1 + %assign y y>>1 + %endrep +%endmacro +%assign x 0 +%rep 256 + ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7 +%assign x x+1 +%endrep + SECTION .text cextern pb_0 @@ -136,7 +161,9 @@ cextern pw_ppmmppmm cextern pw_pmpmpmpm cextern pw_pmmpzzzz +cextern pd_1 cextern hsub_mul +cextern popcnt_table ;============================================================================= ; SSD @@ -144,69 +171,67 @@ %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; int pixel_ssd_MxN( uint16_t *, intptr_t, uint16_t *, intptr_t ) +; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SSD_ONE 2 -cglobal pixel_ssd_%1x%2, 4,5,6 - mov r4, %1*%2/mmsize +cglobal pixel_ssd_%1x%2, 4,7,6 + FIX_STRIDES r1, r3 +%if mmsize == %1*2 + %define offset0_1 r1 + %define offset0_2 r1*2 + %define offset0_3 r5 + %define offset1_1 r3 + %define offset1_2 r3*2 + %define offset1_3 r6 + lea r5, [3*r1] + lea r6, [3*r3] +%elif mmsize == %1 + %define offset0_1 mmsize + %define offset0_2 r1 + %define offset0_3 r1+mmsize + %define offset1_1 mmsize + %define offset1_2 r3 + %define offset1_3 r3+mmsize +%elif mmsize == %1/2 + %define offset0_1 mmsize + %define offset0_2 mmsize*2 + %define offset0_3 mmsize*3 + %define offset1_1 mmsize + %define offset1_2 mmsize*2 + %define offset1_3 mmsize*3 +%endif + %assign %%n %2/(2*mmsize/%1) +%if %%n > 1 + mov r4d, %%n +%endif pxor m0, m0 .loop mova m1, [r0] -%if %1 <= mmsize/2 - mova m3, [r0+r1*2] - %define offset r3*2 - %define num_rows 2 -%else - mova m3, [r0+mmsize] - %define offset mmsize - %define num_rows 1 -%endif - lea r0, [r0+r1*2*num_rows] + mova m2, [r0+offset0_1] + mova m3, [r0+offset0_2] + mova m4, [r0+offset0_3] psubw m1, [r2] - psubw m3, [r2+offset] - lea r2, [r2+r3*2*num_rows] + psubw m2, [r2+offset1_1] + psubw m3, [r2+offset1_2] + psubw m4, 
[r2+offset1_3] +%if %%n > 1 + lea r0, [r0+r1*(%2/%%n)] + lea r2, [r2+r3*(%2/%%n)] +%endif pmaddwd m1, m1 + pmaddwd m2, m2 pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 paddd m0, m1 paddd m0, m3 - dec r4 +%if %%n > 1 + dec r4d jg .loop +%endif HADDD m0, m5 - movd eax, m0 - RET -%endmacro - -%macro SSD_16_MMX 2 -cglobal pixel_ssd_%1x%2, 4,5 - mov r4, %1*%2/mmsize/2 - pxor m0, m0 -.loop - mova m1, [r0] - mova m2, [r2] - mova m3, [r0+mmsize] - mova m4, [r2+mmsize] - mova m5, [r0+mmsize*2] - mova m6, [r2+mmsize*2] - mova m7, [r0+mmsize*3] - psubw m1, m2 - psubw m3, m4 - mova m2, [r2+mmsize*3] - psubw m5, m6 - pmaddwd m1, m1 - psubw m7, m2 - pmaddwd m3, m3 - pmaddwd m5, m5 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - pmaddwd m7, m7 - paddd m1, m3 - paddd m5, m7 - paddd m0, m1 - paddd m0, m5 - dec r4 - jg .loop - HADDD m0, m7 - movd eax, m0 + movd eax, xm0 RET %endmacro @@ -217,14 +242,17 @@
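SSD_ONE is rewritten to process four loads per iteration with the offset0_*/offset1_* defines picking the addressing scheme for each vector width, but the function contract is unchanged. A scalar reference for what pixel_ssd_WxH returns, in the HIGH_BIT_DEPTH (uint16_t pixel) flavor shown:

#include <stdint.h>
#include <stddef.h>

static int ssd_wxh( const uint16_t *pix1, ptrdiff_t i_pix1,
                    const uint16_t *pix2, ptrdiff_t i_pix2, int w, int h )
{
    int ssd = 0;
    for( int y = 0; y < h; y++ )
    {
        for( int x = 0; x < w; x++ )
        {
            int d = pix1[x] - pix2[x];
            ssd += d * d;   /* sum of squared differences */
        }
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
    return ssd;
}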
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/pixel.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/pixel.h
Changed
@@ -52,10 +52,12 @@ DECL_X1( sad, sse2_aligned ) DECL_X1( sad, ssse3 ) DECL_X1( sad, ssse3_aligned ) +DECL_X1( sad, avx2 ) DECL_X4( sad, mmx2 ) DECL_X4( sad, sse2 ) DECL_X4( sad, sse3 ) DECL_X4( sad, ssse3 ) +DECL_X4( sad, avx2 ) DECL_X1( ssd, mmx ) DECL_X1( ssd, mmx2 ) DECL_X1( ssd, sse2slow ) @@ -63,18 +65,23 @@ DECL_X1( ssd, ssse3 ) DECL_X1( ssd, avx ) DECL_X1( ssd, xop ) +DECL_X1( ssd, avx2 ) DECL_X1( satd, mmx2 ) DECL_X1( satd, sse2 ) DECL_X1( satd, ssse3 ) +DECL_X1( satd, ssse3_atom ) DECL_X1( satd, sse4 ) DECL_X1( satd, avx ) DECL_X1( satd, xop ) +DECL_X1( satd, avx2 ) DECL_X1( sa8d, mmx2 ) DECL_X1( sa8d, sse2 ) DECL_X1( sa8d, ssse3 ) +DECL_X1( sa8d, ssse3_atom ) DECL_X1( sa8d, sse4 ) DECL_X1( sa8d, avx ) DECL_X1( sa8d, xop ) +DECL_X1( sa8d, avx2 ) DECL_X1( sad, cache32_mmx2 ); DECL_X1( sad, cache64_mmx2 ); DECL_X1( sad, cache64_sse2 ); @@ -88,12 +95,15 @@ DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, ssse3_atom, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, avx2, ( pixel *pix, intptr_t i_stride )) void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * ); @@ -106,16 +116,19 @@ void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8c_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8c_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8c_avx2 ( pixel *, pixel *, int * ); void x264_intra_satd_x3_16x16_mmx2 ( pixel *, pixel *, int * ); void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * ); void x264_intra_sad_x3_16x16_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_16x16_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_16x16_avx2 ( pixel *, pixel *, int * ); void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * ); void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8_avx2 ( uint16_t*, uint16_t*, int * ); int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * ); @@ -129,6 +142,7 @@ int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); +int x264_intra_sad_x9_8x8_avx2 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, 
intptr_t stride2, int width, @@ -139,6 +153,9 @@ void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); +void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1, + pixel *pixuv2, intptr_t stride2, int width, + int height, uint64_t *ssd_u, uint64_t *ssd_v ); void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, intptr_t stride1, const uint8_t *pix2, intptr_t stride2, int sums[2][4] ); void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, intptr_t stride1, @@ -151,17 +168,28 @@ int x264_pixel_var2_8x8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_var2_8x8_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x8_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_var2_8x16_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x16_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_xop ( pixel *src, intptr_t stride, int height ); +int x264_pixel_vsad_avx2 ( uint16_t *src, intptr_t stride, int height ); int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); int x264_pixel_asd8_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); +uint64_t x264_pixel_sa8d_satd_16x16_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_ssse3 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_ssse3_atom( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_avx ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_avx2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); + #define DECL_ADS( size, suffix ) \ int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\ @@ -178,6 +206,9 @@ DECL_ADS( 4, avx ) DECL_ADS( 2, avx ) DECL_ADS( 1, avx ) +DECL_ADS( 4, avx2 ) +DECL_ADS( 2, avx2 ) +DECL_ADS( 1, avx2 ) #undef DECL_PIXELS #undef DECL_X1
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/predict-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/predict-a.asm
Changed
@@ -6,6 +6,7 @@ ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org> ;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -28,13 +29,12 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 -pw_76543210: -pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7 -pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4 -pw_m3: times 8 dw -3 -pw_m7: times 8 dw -7 +pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4 +pw_m3: times 16 dw -3 +pw_m7: times 16 dw -7 pb_00s_ff: times 8 db 0 pb_0s_ff: times 7 db 0 db 0xff @@ -57,109 +57,106 @@ cextern pw_00ff cextern pw_pixel_max -%macro STORE8x8 2-4 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %2 - mova [r0 + 1*FDEC_STRIDEB], %2 - mova [r0 + 2*FDEC_STRIDEB], %2 - mova [r0 + 3*FDEC_STRIDEB], %2 +%macro STORE8 1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + mova [r0+2*FDEC_STRIDEB], %1 + mova [r0+3*FDEC_STRIDEB], %1 %endmacro -%macro STORE8x16 4 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %2 - mova [r0 + -3*FDEC_STRIDEB], %2 - mova [r0 + -2*FDEC_STRIDEB], %2 - mova [r0 + -1*FDEC_STRIDEB], %2 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %3 - mova [r0 + -3*FDEC_STRIDEB], %3 - mova [r0 + -2*FDEC_STRIDEB], %3 - mova [r0 + -1*FDEC_STRIDEB], %3 - mova [r0 + 0*FDEC_STRIDEB], %4 - mova [r0 + 1*FDEC_STRIDEB], %4 - mova [r0 + 2*FDEC_STRIDEB], %4 - mova [r0 + 3*FDEC_STRIDEB], %4 +%macro STORE16 1-4 +%if %0 > 1 + mov r1d, 2*%0 +.loop: + mova [r0+0*FDEC_STRIDEB+0*mmsize], %1 + mova [r0+0*FDEC_STRIDEB+1*mmsize], %2 + mova [r0+1*FDEC_STRIDEB+0*mmsize], %1 + mova [r0+1*FDEC_STRIDEB+1*mmsize], %2 +%ifidn %0, 4 + mova [r0+0*FDEC_STRIDEB+2*mmsize], %3 + mova [r0+0*FDEC_STRIDEB+3*mmsize], %4 + mova [r0+1*FDEC_STRIDEB+2*mmsize], %3 + mova [r0+1*FDEC_STRIDEB+3*mmsize], %4 + add r0, 2*FDEC_STRIDEB +%else ; %0 == 2 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB+0*mmsize], %1 + mova [r0-2*FDEC_STRIDEB+1*mmsize], %2 + mova [r0-1*FDEC_STRIDEB+0*mmsize], %1 + mova [r0-1*FDEC_STRIDEB+1*mmsize], %2 +%endif + dec r1d + jg .loop +%else ; %0 == 1 + STORE8 %1 +%if HIGH_BIT_DEPTH ; Different code paths to reduce code size + add r0, 6*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 +%else + add r0, 8*FDEC_STRIDE + mova [r0-4*FDEC_STRIDE], %1 + mova [r0-3*FDEC_STRIDE], %1 + mova [r0-2*FDEC_STRIDE], %1 + mova [r0-1*FDEC_STRIDE], %1 + mova [r0+0*FDEC_STRIDE], %1 + mova [r0+1*FDEC_STRIDE], %1 + mova [r0+2*FDEC_STRIDE], %1 + mova [r0+3*FDEC_STRIDE], %1 +%endif ; HIGH_BIT_DEPTH +%endif %endmacro -%macro STORE16x16 2-4 -%ifidn %0, 4 - mov r1d, 8 -.loop: - mova [r0 + 0*FDEC_STRIDEB + 0], %1 - 
mova [r0 + 1*FDEC_STRIDEB + 0], %1 - mova [r0 + 0*FDEC_STRIDEB + 8], %2 - mova [r0 + 1*FDEC_STRIDEB + 8], %2 - mova [r0 + 0*FDEC_STRIDEB +16], %3 - mova [r0 + 1*FDEC_STRIDEB +16], %3 - mova [r0 + 0*FDEC_STRIDEB +24], %4 - mova [r0 + 1*FDEC_STRIDEB +24], %4 - add r0, 2*FDEC_STRIDEB - dec r1d - jg .loop +%macro PRED_H_LOAD 2 ; reg, offset +%if cpuflag(avx2) + vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL] +%elif HIGH_BIT_DEPTH + movd %1, [r0+(%2)*FDEC_STRIDEB-4] + SPLATW %1, %1, 1 %else - mov r1d, 4 -.loop: - mova [r0 + 0*FDEC_STRIDE], %1 - mova [r0 + 1*FDEC_STRIDE], %1 - mova [r0 + 2*FDEC_STRIDE], %1 - mova [r0 + 3*FDEC_STRIDE], %1 - mova [r0 + 0*FDEC_STRIDE + 8], %2 - mova [r0 + 1*FDEC_STRIDE + 8], %2 - mova [r0 + 2*FDEC_STRIDE + 8], %2 - mova [r0 + 3*FDEC_STRIDE + 8], %2 - add r0, 4*FDEC_STRIDE - dec r1d - jg .loop + SPLATB_LOAD %1, r0+(%2)*FDEC_STRIDE-1, m2 %endif %endmacro -%macro STORE16x16_SSE2 1-2 -%ifidn %0,2 - mov r1d, 4 -.loop - mova [r0+0*FDEC_STRIDEB+ 0], %1 - mova [r0+0*FDEC_STRIDEB+16], %2 - mova [r0+1*FDEC_STRIDEB+ 0], %1 - mova [r0+1*FDEC_STRIDEB+16], %2 - mova [r0+2*FDEC_STRIDEB+ 0], %1 - mova [r0+2*FDEC_STRIDEB+16], %2 - mova [r0+3*FDEC_STRIDEB+ 0], %1 - mova [r0+3*FDEC_STRIDEB+16], %2 - add r0, 4*FDEC_STRIDEB - dec r1d - jg .loop +%macro PRED_H_STORE 3 ; reg, offset, width +%assign %%w %3*SIZEOF_PIXEL +%if %%w == 8 + movq [r0+(%2)*FDEC_STRIDEB], %1 %else - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %1 - mova [r0 + 1*FDEC_STRIDEB], %1 - mova [r0 + 2*FDEC_STRIDEB], %1 - mova [r0 + 3*FDEC_STRIDEB], %1 - add r0, 8*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %1 - mova [r0 + 1*FDEC_STRIDEB], %1 - mova [r0 + 2*FDEC_STRIDEB], %1 - mova [r0 + 3*FDEC_STRIDEB], %1
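Context for the predict-a.asm hunk above: the old STORE8x8/STORE8x16/STORE16x16 triplet collapses into generic STORE8/STORE16 macros, and PRED_H_LOAD abstracts the per-row left-pixel broadcast (vpbroadcastb/w on AVX2, movd plus SPLATW at high bit depth, SPLATB_LOAD otherwise). A minimal C model of the horizontal predictor these macros vectorize; the function name, 8-bit pixel type and explicit stride are illustrative, not actual x264 helpers:

    #include <stdint.h>

    /* Horizontal intra prediction: every pixel of row y is a copy of
     * the reconstructed neighbor at src[y*stride - 1]. */
    static void predict_h_ref( uint8_t *src, int stride, int width, int height )
    {
        for( int y = 0; y < height; y++ )
        {
            uint8_t left = src[y*stride - 1];  /* pixel left of this row */
            for( int x = 0; x < width; x++ )
                src[y*stride + x] = left;      /* splat across the row */
        }
    }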
x264-snapshot-20130224-2245.tar.bz2/common/x86/predict-c.c -> x264-snapshot-20130723-2245.tar.bz2/common/x86/predict-c.c
Changed
@@ -43,6 +43,7 @@ PREDICT_16x16_DC( mmx2 ) PREDICT_16x16_DC( sse2 ) +PREDICT_16x16_DC( avx2 ) #define PREDICT_16x16_DC_LEFT(name)\ static void x264_predict_16x16_dc_left_##name( pixel *src )\ @@ -58,10 +59,11 @@ PREDICT_16x16_DC_LEFT( mmx2 ) PREDICT_16x16_DC_LEFT( sse2 ) +PREDICT_16x16_DC_LEFT( avx2 ) #define PREDICT_P_SUM(j,i)\ H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\ - V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\ + V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] ); ALIGNED_16( static const int16_t pw_12345678[8] ) = {1,2,3,4,5,6,7,8}; ALIGNED_16( static const int16_t pw_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1}; @@ -70,178 +72,181 @@ ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1}; ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4}; -#if !HIGH_BIT_DEPTH -#define PREDICT_16x16_P(name)\ -static void x264_predict_16x16_p_##name( pixel *src )\ -{\ - int a, b, c;\ +#define PREDICT_16x16_P_CORE\ int H = 0;\ int V = 0;\ - int i00;\ - PREDICT_P_SUM(7,1) \ - PREDICT_P_SUM(7,2) \ - PREDICT_P_SUM(7,3) \ - PREDICT_P_SUM(7,4) \ - PREDICT_P_SUM(7,5) \ - PREDICT_P_SUM(7,6) \ - PREDICT_P_SUM(7,7) \ - PREDICT_P_SUM(7,8) \ - a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\ - b = ( 5 * H + 32 ) >> 6;\ - c = ( 5 * V + 32 ) >> 6;\ - i00 = a - b * 7 - c * 7 + 16;\ - x264_predict_16x16_p_core_##name( src, i00, b, c );\ -} -#ifndef ARCH_X86_64 -PREDICT_16x16_P( mmx2 ) -#endif -PREDICT_16x16_P( sse2 ) -PREDICT_16x16_P( avx ) -#endif //!HIGH_BIT_DEPTH + PREDICT_P_SUM(7,1)\ + PREDICT_P_SUM(7,2)\ + PREDICT_P_SUM(7,3)\ + PREDICT_P_SUM(7,4)\ + PREDICT_P_SUM(7,5)\ + PREDICT_P_SUM(7,6)\ + PREDICT_P_SUM(7,7)\ + PREDICT_P_SUM(7,8) -#define PREDICT_8x16C_P_CORE \ - int H = 0, V = 0;\ - for( int i = 0; i < 4; i++ )\ - H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\ - for( int i = 0; i < 8; i++ )\ - V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );\ - int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\ - int b = ( 17 * H + 16 ) >> 5;\ - int c = ( 5 * V + 32 ) >> 6; - -#if HIGH_BIT_DEPTH -#define PREDICT_8x16_P(name)\ -static void x264_predict_8x16c_p_##name( uint16_t *src )\ -{\ - PREDICT_8x16C_P_CORE \ - x264_predict_8x16c_p_core_##name( src, a, b, c );\ -} +#define PREDICT_16x16_P_END(name)\ + int a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\ + int b = ( 5 * H + 32 ) >> 6;\ + int c = ( 5 * V + 32 ) >> 6;\ + int i00 = a - b * 7 - c * 7 + 16;\ + /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case + * than to try to consider it in the asm. 
*/\ + if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )\ + x264_predict_16x16_p_c( src );\ + else\ + x264_predict_16x16_p_core_##name( src, i00, b, c ); -PREDICT_8x16_P(sse2) -PREDICT_8x16_P(avx) -#else -#define PREDICT_8x16_P(name)\ -static void x264_predict_8x16c_p_##name( uint8_t *src )\ +#define PREDICT_16x16_P(name, name2)\ +static void x264_predict_16x16_p_##name( pixel *src )\ {\ - PREDICT_8x16C_P_CORE \ - int i00 = a -3*b -7*c + 16;\ - x264_predict_8x16c_p_core_##name( src, i00, b, c );\ + PREDICT_16x16_P_CORE\ + PREDICT_16x16_P_END(name2)\ } -#ifndef ARCH_X86_64 -PREDICT_8x16_P(mmx2) -#endif -PREDICT_8x16_P(sse2) -PREDICT_8x16_P(avx) -#endif #if HAVE_X86_INLINE_ASM #if HIGH_BIT_DEPTH -static void x264_predict_16x16_p_sse2( uint16_t *src ) -#else -static void x264_predict_16x16_p_ssse3( uint8_t *src ) -#endif -{ - int a, b, c, i00; - int H, V; -#if HIGH_BIT_DEPTH - asm ( - "movdqu %1, %%xmm1 \n" - "movdqa %2, %%xmm0 \n" - "pmaddwd %3, %%xmm0 \n" - "pmaddwd %4, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "movhlps %%xmm0, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "pshuflw $14, %%xmm0, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "movd %%xmm0, %0 \n" - :"=r"(H) - :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]), - "m"(*pw_12345678), "m"(*pw_m87654321) +#define PREDICT_16x16_P_ASM\ + asm (\ + "movdqu %1, %%xmm1 \n"\ + "movdqa %2, %%xmm0 \n"\ + "pmaddwd %3, %%xmm0 \n"\ + "pmaddwd %4, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "movhlps %%xmm0, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "pshuflw $14, %%xmm0, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "movd %%xmm0, %0 \n"\ + :"=r"(H)\ + :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]),\ + "m"(*pw_12345678), "m"(*pw_m87654321)\ ); -#else - asm ( - "movq %1, %%mm1 \n" - "movq %2, %%mm0 \n" - "palignr $7, %3, %%mm1 \n" - "pmaddubsw %4, %%mm0 \n" - "pmaddubsw %5, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "pshufw $14, %%mm0, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "pshufw $1, %%mm0, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "movd %%mm0, %0 \n" - "movswl %w0, %0 \n" - :"=r"(H) - :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]), - "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321) +#else // !HIGH_BIT_DEPTH +#define PREDICT_16x16_P_ASM\ + asm (\ + "movq %1, %%mm1 \n"\ + "movq %2, %%mm0 \n"\ + "palignr $7, %3, %%mm1 \n"\ + "pmaddubsw %4, %%mm0 \n"\ + "pmaddubsw %5, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "pshufw $14, %%mm0, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "pshufw $1, %%mm0, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "movd %%mm0, %0 \n"\ + "movswl %w0, %0 \n"\ + :"=r"(H)\ + :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]),\ + "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321)\ ); -#endif - V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] ) - + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] ) - + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] ) - + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] ) - + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] ) - + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] ) - + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] ) +#endif // HIGH_BIT_DEPTH + +#define PREDICT_16x16_P_CORE_INLINE\
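On the overflow guard added above: the plane predictor's per-pixel value is (i00 + b*x + c*y) >> 5 with x,y in 0..15, and the SIMD core evaluates it in signed 16-bit lanes. The 1092 bound is presumably floor(32767/30): it keeps the combined b and c contributions, up to 15 steps each, inside signed 16-bit range (1092*30 = 32760), and inputs failing that or with i00 > 0x7fff branch to the plain C path at >8-bit depth instead. A C statement of the semantics both paths must produce, with illustrative names and a high-bit-depth uint16_t pixel assumed:

    #include <stdint.h>

    /* pixel_max is (1 << BIT_DEPTH) - 1. */
    static void predict_16x16_p_ref( uint16_t *src, int stride,
                                     int i00, int b, int c, int pixel_max )
    {
        for( int y = 0; y < 16; y++ )
            for( int x = 0; x < 16; x++ )
            {
                int v = (i00 + b*x + c*y) >> 5;
                src[y*stride + x] = v < 0 ? 0 : v > pixel_max ? pixel_max : v;
            }
    }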
x264-snapshot-20130224-2245.tar.bz2/common/x86/predict.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/predict.h
Changed
@@ -34,48 +34,57 @@ void x264_predict_8x8_init_mmx ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ); void x264_predict_16x16_v_mmx2( pixel *src ); -void x264_predict_16x16_v_sse2( pixel *src ); +void x264_predict_16x16_v_sse ( pixel *src ); +void x264_predict_16x16_v_avx ( uint16_t *src ); void x264_predict_16x16_h_mmx2( pixel *src ); void x264_predict_16x16_h_sse2( uint16_t *src ); void x264_predict_16x16_h_ssse3( uint8_t *src ); +void x264_predict_16x16_h_avx2( uint16_t *src ); void x264_predict_16x16_dc_mmx2( pixel *src ); void x264_predict_16x16_dc_sse2( pixel *src ); void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left ); +void x264_predict_16x16_dc_core_avx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left ); +void x264_predict_16x16_dc_left_core_avx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_top_mmx2( pixel *src ); void x264_predict_16x16_dc_top_sse2( pixel *src ); -void x264_predict_16x16_dc_top_ssse3( uint16_t *src ); +void x264_predict_16x16_dc_top_avx2( pixel *src ); void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c ); void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_16x16_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x16c_dc_mmx2( pixel *src ); void x264_predict_8x16c_dc_sse2( uint16_t *src ); void x264_predict_8x16c_dc_top_mmx2( uint8_t *src ); void x264_predict_8x16c_dc_top_sse2( uint16_t *src ); void x264_predict_8x16c_v_mmx( uint8_t *src ); -void x264_predict_8x16c_v_sse2( uint16_t *src ); +void x264_predict_8x16c_v_sse( uint16_t *src ); void x264_predict_8x16c_h_mmx2( pixel *src ); -void x264_predict_8x16c_h_sse2( pixel *src ); +void x264_predict_8x16c_h_sse2( uint16_t *src ); void x264_predict_8x16c_h_ssse3( uint8_t *src ); +void x264_predict_8x16c_h_avx2( uint16_t *src ); void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c ); -void x264_predict_8x16c_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_8x16c_p_core_avx ( pixel *src, int i00, int b, int c ); +void x264_predict_8x16c_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c ); -void x264_predict_8x8c_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_8x8c_p_core_avx ( pixel *src, int i00, int b, int c ); +void x264_predict_8x8c_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x8c_dc_mmx2( pixel *src ); void x264_predict_8x8c_dc_sse2( uint16_t *src ); void x264_predict_8x8c_dc_top_mmx2( uint8_t *src ); void x264_predict_8x8c_dc_top_sse2( uint16_t *src ); void x264_predict_8x8c_v_mmx( pixel *src ); -void x264_predict_8x8c_v_sse2( uint16_t *src ); +void x264_predict_8x8c_v_sse( uint16_t *src ); void x264_predict_8x8c_h_mmx2( pixel *src ); -void x264_predict_8x8c_h_sse2( pixel *src ); +void x264_predict_8x8c_h_sse2( uint16_t *src ); void x264_predict_8x8c_h_ssse3( uint8_t *src ); +void x264_predict_8x8c_h_avx2( uint16_t *src ); void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_v_sse2( 
uint16_t *src, uint16_t edge[36] ); +void x264_predict_8x8_v_sse ( uint16_t *src, uint16_t edge[36] ); void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] ); void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] ); @@ -114,6 +123,7 @@ void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters ); void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[36], int i_neighbor, int i_filters ); void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters ); +void x264_predict_4x4_h_avx2( uint16_t *src ); void x264_predict_4x4_ddl_mmx2( pixel *src ); void x264_predict_4x4_ddl_sse2( uint16_t *src ); void x264_predict_4x4_ddl_avx( uint16_t *src );
x264-snapshot-20130224-2245.tar.bz2/common/x86/quant-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/quant-a.asm
Changed
@@ -7,7 +7,7 @@ ;* Jason Garrett-Glaser <darkshikari@gmail.com> ;* Christian Heine <sennindemokrit@gmx.net> ;* Oskar Arvidsson <oskar@irock.se> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -30,7 +30,7 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 %macro DQM4 3 dw %1, %2, %1, %2, %2, %3, %2, %3 @@ -39,8 +39,7 @@ dw %1, %4, %5, %4, %1, %4, %5, %4 dw %4, %2, %6, %2, %4, %2, %6, %2 dw %5, %6, %3, %6, %5, %6, %3, %6 - ; last line not used, just padding for power-of-2 stride - times 8 dw 0 + dw %4, %2, %6, %2, %4, %2, %6, %2 %endmacro dequant4_scale: @@ -75,27 +74,55 @@ chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1 chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1 +%if HIGH_BIT_DEPTH==0 +dct_coef_shuffle: +%macro DCT_COEF_SHUFFLE 8 + %assign y x + %rep 8 + %rep 7 + %rotate (~(y>>7))&1 + %assign y y<<((~(y>>7))&1) + %endrep + db %1*2 + %rotate 1 + %assign y y<<1 + %endrep +%endmacro +%assign x 0 +%rep 256 + DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0 +%assign x x+1 +%endrep +%endif + SECTION .text cextern pb_1 cextern pw_1 +cextern pw_2 +cextern pw_256 cextern pd_1 cextern pb_01 cextern pd_1024 - -%macro QUANT_DC_START 0 - movd m6, r1m ; mf - movd m7, r2m ; bias -%if HIGH_BIT_DEPTH - SPLATD m6, m6 - SPLATD m7, m7 +cextern deinterleave_shufd +cextern popcnt_table + +%macro QUANT_DC_START 2 + movd xm%1, r1m ; mf + movd xm%2, r2m ; bias +%if cpuflag(avx2) + vpbroadcastdct m%1, xm%1 + vpbroadcastdct m%2, xm%2 +%elif HIGH_BIT_DEPTH + SPLATD m%1, m%1 + SPLATD m%2, m%2 %elif cpuflag(sse4) ; ssse3, but not faster on conroe mova m5, [pb_01] - pshufb m6, m5 - pshufb m7, m5 + pshufb m%1, m5 + pshufb m%2, m5 %else - SPLATW m6, m6 - SPLATW m7, m7 + SPLATW m%1, m%1 + SPLATW m%2, m%2 %endif %endmacro @@ -175,7 +202,7 @@ %endif ; cpuflag %endmacro -%macro QUANT_ONE_AC_MMX 4 +%macro QUANT_ONE_AC_MMX 5 mova m0, [%1] mova m2, [%2] ABSD m1, m0 @@ -191,10 +218,10 @@ psrad m1, 16 PSIGND m1, m0 mova [%1], m1 - ACCUM por, 5, 1, %4 + ACCUM por, %5, 1, %4 %endmacro -%macro QUANT_TWO_AC 4 +%macro QUANT_TWO_AC 5 %if cpuflag(sse4) mova m0, [%1 ] mova m1, [%1+mmsize] @@ -210,11 +237,11 @@ PSIGND m3, m1 mova [%1 ], m2 mova [%1+mmsize], m3 - ACCUM por, 5, 2, %4 - por m5, m3 + ACCUM por, %5, 2, %4 + por m%5, m3 %else ; !sse4 - QUANT_ONE_AC_MMX %1, %2, %3, %4 - QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize + QUANT_ONE_AC_MMX %1, %2, %3, %4, %5 + QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, 1, %5 %endif ; cpuflag %endmacro @@ -223,7 +250,7 @@ ;----------------------------------------------------------------------------- %macro QUANT_DC 2 cglobal quant_%1x%2_dc, 3,3,8 - QUANT_DC_START + QUANT_DC_START 6,7 %if %1*%2 <= mmsize/4 QUANT_ONE_DC r0, m6, m7, 0 %else @@ -244,35 +271,87 @@ cglobal quant_%1x%2, 3,3,8 %assign x 0 %rep %1*%2/(mmsize/2) - QUANT_TWO_AC r0+x, r1+x, r2+x, x + QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5 %assign x x+mmsize*2 %endrep QUANT_END RET %endmacro +%macro QUANT_4x4 2 + QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, 0, %2 + QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, 1, %2 +%endmacro + +%macro QUANT_4x4x4 0 +cglobal quant_4x4x4, 3,3,8 + QUANT_4x4 0, 5 + QUANT_4x4 64, 6 + add r0, 128 + packssdw m5, m6 + QUANT_4x4 0, 6 + QUANT_4x4 64, 7 + packssdw m6, m7 + packssdw m5, m6 + packssdw m5, m5 ; AA BB CC DD + packsswb m5, m5 ; A B C D + pxor m4, m4 + pcmpeqb m5, m4 + 
pmovmskb eax, m5 + not eax + and eax, 0xf + RET +%endmacro + INIT_XMM sse2 QUANT_DC 2, 2 QUANT_DC 4, 4 QUANT_AC 4, 4 QUANT_AC 8, 8 +QUANT_4x4x4 INIT_XMM ssse3 QUANT_DC 2, 2 QUANT_DC 4, 4 QUANT_AC 4, 4 QUANT_AC 8, 8 +QUANT_4x4x4 INIT_XMM sse4 QUANT_DC 2, 2 QUANT_DC 4, 4 QUANT_AC 4, 4 QUANT_AC 8, 8 +QUANT_4x4x4 +
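The new QUANT_4x4x4 entry point quantizes the four 4x4 blocks of an 8x8 region in one call and returns a 4-bit nonzero mask, bit i set iff block i kept any coefficient, which the packssdw/packsswb/pmovmskb tail computes branchlessly. A hedged C reference for the 8-bit build; the per-coefficient formula matches x264's scalar QUANT_ONE:

    #include <stdint.h>
    #include <stdlib.h>

    typedef int16_t  dctcoef;   /* 8-bit-depth build */
    typedef uint16_t udctcoef;

    static int quant_4x4x4_ref( dctcoef dct[4][16],
                                udctcoef mf[16], udctcoef bias[16] )
    {
        int nz_mask = 0;
        for( int i = 0; i < 4; i++ )
        {
            int nz = 0;
            for( int j = 0; j < 16; j++ )
            {
                int coef = dct[i][j];
                int q = (int)(((uint32_t)(abs( coef ) + bias[j]) * mf[j]) >> 16);
                dct[i][j] = coef < 0 ? -q : q;   /* restore the sign */
                nz |= q;
            }
            nz_mask |= (nz != 0) << i;
        }
        return nz_mask;
    }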
x264-snapshot-20130224-2245.tar.bz2/common/x86/quant.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/quant.h
Changed
@@ -31,19 +31,27 @@ int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias ); int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_mmx( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_sse2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_sse2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_ssse3( dctcoef dct[4], int mf, int bias ); int x264_quant_4x4_dc_ssse3( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_ssse3( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_ssse3( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_ssse3( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_sse4( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_dc_sse4( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_sse4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); +int x264_quant_4x4_avx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4_dc_avx2( dctcoef dct[16], int mf, int bias ); +int x264_quant_8x8_avx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); +int x264_quant_4x4x4_avx2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); @@ -56,10 +64,15 @@ void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf ); int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf ); int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf ); @@ -68,21 +81,17 @@ void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_avx ( dctcoef 
*dct, uint32_t *sum, udctcoef *offset, int size ); +void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); int x264_decimate_score15_mmx2( dctcoef *dct ); int x264_decimate_score15_sse2( dctcoef *dct ); int x264_decimate_score15_ssse3( dctcoef *dct ); int x264_decimate_score16_mmx2( dctcoef *dct ); int x264_decimate_score16_sse2( dctcoef *dct ); int x264_decimate_score16_ssse3( dctcoef *dct ); -int x264_decimate_score15_mmx2_slowctz( dctcoef *dct ); -int x264_decimate_score15_sse2_slowctz( dctcoef *dct ); -int x264_decimate_score15_ssse3_slowctz( dctcoef *dct ); -int x264_decimate_score16_mmx2_slowctz( dctcoef *dct ); -int x264_decimate_score16_sse2_slowctz( dctcoef *dct ); -int x264_decimate_score16_ssse3_slowctz( dctcoef *dct ); int x264_decimate_score64_mmx2( dctcoef *dct ); int x264_decimate_score64_sse2( dctcoef *dct ); int x264_decimate_score64_ssse3( dctcoef *dct ); +int x264_decimate_score64_avx2( int16_t *dct ); int x264_coeff_last4_mmx2( dctcoef *dct ); int x264_coeff_last8_mmx2( dctcoef *dct ); int x264_coeff_last15_mmx2( dctcoef *dct ); @@ -98,18 +107,29 @@ int x264_coeff_last15_sse2_lzcnt( dctcoef *dct ); int x264_coeff_last16_sse2_lzcnt( dctcoef *dct ); int x264_coeff_last64_sse2_lzcnt( dctcoef *dct ); +int x264_coeff_last64_avx2_lzcnt( dctcoef *dct ); int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac ); int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac ); int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced );
x264-snapshot-20130224-2245.tar.bz2/common/x86/sad-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/sad-a.asm
Changed
@@ -29,6 +29,12 @@ %include "x86inc.asm" %include "x86util.asm" +SECTION_RODATA 32 + +pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1 +deinterleave_sadx4: dd 0,4,2,6 +hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11 + SECTION .text cextern pb_3 @@ -556,6 +562,65 @@ INIT_MMX ssse3 INTRA_SAD_8x8C +INIT_YMM avx2 +cglobal intra_sad_x3_8x8c, 3,3,7 + vpbroadcastq m2, [r1 - FDEC_STRIDE] ; V pred + add r1, FDEC_STRIDE*4-1 + pxor xm5, xm5 + punpckldq xm3, xm2, xm5 ; V0 _ V1 _ + movd xm0, [r1 + FDEC_STRIDE*-1 - 3] + movd xm1, [r1 + FDEC_STRIDE* 3 - 3] + pinsrb xm0, [r1 + FDEC_STRIDE*-4], 0 + pinsrb xm1, [r1 + FDEC_STRIDE* 0], 0 + pinsrb xm0, [r1 + FDEC_STRIDE*-3], 1 + pinsrb xm1, [r1 + FDEC_STRIDE* 1], 1 + pinsrb xm0, [r1 + FDEC_STRIDE*-2], 2 + pinsrb xm1, [r1 + FDEC_STRIDE* 2], 2 + punpcklqdq xm0, xm1 ; H0 _ H1 _ + vinserti128 m3, m3, xm0, 1 ; V0 V1 H0 H1 + pshufb xm0, [hpred_shuf] ; H00224466 H11335577 + psadbw m3, m5 ; s0 s1 s2 s3 + vpermq m4, m3, q3312 ; s2 s1 s3 s3 + vpermq m3, m3, q1310 ; s0 s1 s3 s1 + paddw m3, m4 + psrlw m3, 2 + pavgw m3, m5 ; s0+s2 s1 s3 s1+s3 + pshufb m3, [pb_shuf8x8c2] ; DC0 _ DC1 _ + vpblendd m3, m3, m2, 11001100b ; DC0 V DC1 V + vinserti128 m1, m3, xm3, 1 ; DC0 V DC0 V + vperm2i128 m6, m3, m3, q0101 ; DC1 V DC1 V + vpermq m0, m0, q3120 ; H00224466 _ H11335577 _ + movddup m2, [r0+FENC_STRIDE*0] + movddup m4, [r0+FENC_STRIDE*2] + pshuflw m3, m0, q0000 + psadbw m3, m2 + psadbw m2, m1 + pshuflw m5, m0, q1111 + psadbw m5, m4 + psadbw m4, m1 + paddw m2, m4 + paddw m3, m5 + movddup m4, [r0+FENC_STRIDE*4] + pshuflw m5, m0, q2222 + psadbw m5, m4 + psadbw m4, m6 + paddw m2, m4 + paddw m3, m5 + movddup m4, [r0+FENC_STRIDE*6] + pshuflw m5, m0, q3333 + psadbw m5, m4 + psadbw m4, m6 + paddw m2, m4 + paddw m3, m5 + vextracti128 xm0, m2, 1 + vextracti128 xm1, m3, 1 + paddw xm2, xm0 ; DC V + paddw xm3, xm1 ; H + pextrd [r2+8], xm2, 2 ; V + movd [r2+4], xm3 ; H + movd [r2+0], xm2 ; DC + RET + ;----------------------------------------------------------------------------- ; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] ); @@ -648,7 +713,50 @@ INIT_XMM ssse3 INTRA_SAD16 - +INIT_YMM avx2 +cglobal intra_sad_x3_16x16, 3,5,6 + pxor xm0, xm0 + psadbw xm0, [r1-FDEC_STRIDE] + movhlps xm1, xm0 + paddw xm0, xm1 + movd r3d, xm0 +%assign x 0 +%rep 16 + movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)] +%if (x&3)==3 && x!=15 + add r1, FDEC_STRIDE*4 +%endif + add r3d, r4d +%assign x x+1 +%endrep + sub r1, FDEC_STRIDE*12 + add r3d, 16 + shr r3d, 5 + movd xm5, r3d + vpbroadcastb xm5, xm5 + vinserti128 m5, m5, [r1-FDEC_STRIDE], 1 ; m5 contains DC and V prediction + + pxor m4, m4 ; DC / V accumulator + pxor xm3, xm3 ; H accumulator + mov r3d, 15*FENC_STRIDE +.vloop: + vpbroadcastb xm2, [r1+r3*2-1] + vbroadcasti128 m0, [r0+r3] + psadbw m1, m0, m5 + psadbw xm0, xm2 + paddw m4, m1 + paddw xm3, xm0 + add r3d, -FENC_STRIDE + jge .vloop + punpckhqdq m5, m4, m4 + movhlps xm2, xm3 + paddw m4, m5 ; DC / V + paddw xm3, xm2 ; H + vextracti128 xm2, m4, 1 + movd [r2+0], xm2 + movd [r2+4], xm3 + movd [r2+8], xm4 + RET ;============================================================================= ; SAD x3/x4 MMX @@ -944,17 +1052,27 @@ %endif %endmacro -%macro SAD_X3_2x16P_SSE2 1 -%if %1 +%macro SAD_X3_4x16P_SSE2 2 +%if %1==0 +%if UNIX64 + mov r6, r5 +%endif + lea r5, [r4*3] SAD_X3_START_1x16P_SSE2 %else - SAD_X3_1x16P_SSE2 0, 0 + SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0 +%endif + SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1 + SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2 + 
SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r5 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] %endif - SAD_X3_1x16P_SSE2 FENC_STRIDE, r4 - add r0, 2*FENC_STRIDE - lea r1, [r1+2*r4] - lea r2, [r2+2*r4] - lea r3, [r3+2*r4] %endmacro %macro SAD_X3_START_2x8P_SSE2 0 @@ -971,15 +1089,15 @@ psadbw xmm2, xmm7 %endmacro -%macro SAD_X3_2x8P_SSE2 0 - movq xmm7, [r0] - movq xmm3, [r1] - movq xmm4, [r2] - movq xmm5, [r3] - movhps xmm7, [r0+FENC_STRIDE] - movhps xmm3, [r1+r4] - movhps xmm4, [r2+r4] - movhps xmm5, [r3+r4] +%macro SAD_X3_2x8P_SSE2 4 + movq xmm7, [r0+%1] + movq xmm3, [r1+%2] + movq xmm4, [r2+%2] + movq xmm5, [r3+%2] + movhps xmm7, [r0+%3] + movhps xmm3, [r1+%4] + movhps xmm4, [r2+%4] + movhps xmm5, [r3+%4] psadbw xmm3, xmm7 psadbw xmm4, xmm7 psadbw xmm5, xmm7 @@ -1005,18 +1123,18 @@ psadbw xmm3, xmm7 %endmacro -%macro SAD_X4_2x8P_SSE2 0 - movq xmm7, [r0] - movq xmm4, [r1] - movq xmm5, [r2]
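Both new AVX2 kernels above implement the same contract as their SSE predecessors: one pass over the encoded block yielding its SADs against the three whole-block intra predictions. For 16x16 luma the result order is V, H, DC; for the 8x8 chroma kernel it is DC, H, V, as the final stores show. A plain-C statement of the 16x16 contract, 8-bit pixels and x264's fixed buffer strides assumed:

    #include <stdint.h>
    #include <stdlib.h>

    #define FENC_STRIDE 16
    #define FDEC_STRIDE 32

    static void intra_sad_x3_16x16_ref( uint8_t *fenc, uint8_t *fdec, int res[3] )
    {
        int dc = 16;                                   /* rounding term */
        for( int i = 0; i < 16; i++ )
            dc += fdec[i - FDEC_STRIDE] + fdec[i*FDEC_STRIDE - 1];
        dc >>= 5;

        res[0] = res[1] = res[2] = 0;
        for( int y = 0; y < 16; y++ )
            for( int x = 0; x < 16; x++ )
            {
                int e = fenc[y*FENC_STRIDE + x];
                res[0] += abs( e - fdec[x - FDEC_STRIDE] );    /* V: top row  */
                res[1] += abs( e - fdec[y*FDEC_STRIDE - 1] );  /* H: left col */
                res[2] += abs( e - dc );                       /* DC: average */
            }
    }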
x264-snapshot-20130224-2245.tar.bz2/common/x86/sad16-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/sad16-a.asm
Changed
@@ -4,6 +4,7 @@ ;* Copyright (C) 2010-2013 x264 project ;* ;* Authors: Oskar Arvidsson <oskar@irock.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -90,11 +91,18 @@ ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SAD_MMX 3 -cglobal pixel_sad_%1x%2, 4,4 +cglobal pixel_sad_%1x%2, 4,5-(%2&4/4) pxor m0, m0 -%rep %2/%3 +%if %2 == 4 SAD_INC_%3x%1P_MMX -%endrep + SAD_INC_%3x%1P_MMX +%else + mov r4d, %2/%3 +.loop: + SAD_INC_%3x%1P_MMX + dec r4d + jg .loop +%endif %if %1*%2 == 256 HADDUW m0, m1 %else @@ -120,7 +128,8 @@ ; SAD XMM ;============================================================================= -%macro SAD_INC_2x16P_XMM 0 +%macro SAD_INC_2ROW 1 +%if 2*%1 > mmsize movu m1, [r2+ 0] movu m2, [r2+16] movu m3, [r2+2*r3+ 0] @@ -137,9 +146,7 @@ paddw m3, m4 paddw m0, m1 paddw m0, m3 -%endmacro - -%macro SAD_INC_2x8P_XMM 0 +%else movu m1, [r2] movu m2, [r2+2*r3] psubw m1, [r0] @@ -149,44 +156,55 @@ lea r2, [r2+4*r3] paddw m0, m1 paddw m0, m2 +%endif %endmacro ;----------------------------------------------------------------------------- ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- -%macro SAD_XMM 2 -cglobal pixel_sad_%1x%2, 4,4,8 +%macro SAD 2 +cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize) pxor m0, m0 -%rep %2/2 - SAD_INC_2x%1P_XMM -%endrep +%if %2 == 4 + SAD_INC_2ROW %1 + SAD_INC_2ROW %1 +%else + mov r4d, %2/2 +.loop: + SAD_INC_2ROW %1 + dec r4d + jg .loop +%endif HADDW m0, m1 - movd eax, m0 + movd eax, xm0 RET %endmacro INIT_XMM sse2 -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 -SAD_XMM 8, 4 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +SAD 8, 4 INIT_XMM sse2, aligned -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 INIT_XMM ssse3 -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 -SAD_XMM 8, 4 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +SAD 8, 4 INIT_XMM ssse3, aligned -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +INIT_YMM avx2 +SAD 16, 16 +SAD 16, 8 ;============================================================================= ; SAD x3/x4 @@ -237,14 +255,14 @@ HADDW m2, m5 %endif %if UNIX64 - movd [r5+0], m0 - movd [r5+4], m1 - movd [r5+8], m2 + movd [r5+0], xm0 + movd [r5+4], xm1 + movd [r5+8], xm2 %else mov r0, r5mp - movd [r0+0], m0 - movd [r0+4], m1 - movd [r0+8], m2 + movd [r0+0], xm0 + movd [r0+4], xm1 + movd [r0+8], xm2 %endif RET %endmacro @@ -333,10 +351,10 @@ HADDW m3, m7 %endif mov r0, r6mp - movd [r0+ 0], m0 - movd [r0+ 4], m1 - movd [r0+ 8], m2 - movd [r0+12], m3 + movd [r0+ 0], xm0 + movd [r0+ 4], xm1 + movd [r0+ 8], xm2 + movd [r0+12], xm3 RET %endmacro @@ -400,8 +418,39 @@ INIT_XMM xop PIXEL_VSAD +INIT_YMM avx2 +cglobal pixel_vsad, 3,3 + mova m0, [r0] + mova m1, [r0+2*r1] + lea r0, [r0+4*r1] + psubw m0, m1 + pabsw m0, m0 + sub r2d, 2 + je .end +.loop: + mova m2, [r0] + mova m3, [r0+2*r1] + lea r0, [r0+4*r1] + psubw m1, m2 + psubw m2, m3 + pabsw m1, m1 + pabsw m2, m2 + paddw m0, m1 + paddw m0, m2 + mova m1, m3 + sub r2d, 2 + jg .loop +.end: +%if BIT_DEPTH == 9 + HADDW m0, m1 +%else + HADDUW m0, m1 +%endif + movd eax, xm0
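The restructuring above folds the per-size SAD_XMM macros into one SAD macro: the height-4 case is fully unrolled (which is what frees a register in the 5-(%2&4/4) register count), everything else loops two rows per iteration, and an AVX2 instantiation covers the 16-wide sizes. In C terms the kernel is simply:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute differences over a width x height block of
     * 16-bit pixels; strides are in pixels, as in x264's
     * high-bit-depth build. */
    static int pixel_sad_ref( const uint16_t *pix1, intptr_t stride1,
                              const uint16_t *pix2, intptr_t stride2,
                              int width, int height )
    {
        int sad = 0;
        for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
            for( int x = 0; x < width; x++ )
                sad += abs( pix1[x] - pix2[x] );
        return sad;
    }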
x264-snapshot-20130224-2245.tar.bz2/common/x86/trellis-64.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/trellis-64.asm
Changed
@@ -96,6 +96,15 @@ %endif %endmacro +%macro LOAD_DUP 2 ; dst, src +%if cpuflag(ssse3) + movddup %1, %2 +%else + movd %1, %2 + punpcklqdq %1, %1 +%endif +%endmacro + ;----------------------------------------------------------------------------- ; int trellis_cabac_4x4_psy( ; const int *unquant_mf, const uint8_t *zigzag, int lambda2, @@ -186,12 +195,11 @@ mov dword levelgt1_ctxm, 9 %endif %if psy - movd m6, psy_trellism + LOAD_DUP m6, psy_trellism %define psy_trellis m6 %elif dc - movd m6, [unquant_mfq] + LOAD_DUP m6, [unquant_mfq] paddd m6, m6 - punpcklqdq m6, m6 %define unquant_mf m6 %endif %ifdef PIC @@ -333,13 +341,12 @@ movd m0, abs_leveld mov r6, orig_coefsm %if HIGH_BIT_DEPTH - movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF] + LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF] %else - movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] - psrad m1, 16 + LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] + psrad m1, 16 ; sign_coef %endif punpcklqdq m0, m0 ; quant_coef - punpcklqdq m1, m1 ; sign_coef %if cpuflag(ssse3) pabsd m0, m0 pabsd m2, m1 ; abs_coef @@ -403,11 +410,10 @@ %else %ifdef PIC mov r10, unquant_mfm - movd m3, [r10 + zigzagiq*4] + LOAD_DUP m3, [r10 + zigzagiq*4] %else - movd m3, [unquant_mfq + zigzagiq*4] + LOAD_DUP m3, [unquant_mfq + zigzagiq*4] %endif - punpcklqdq m3, m3 pmuludq m0, m3 %endif paddd m0, [pq_128] @@ -420,8 +426,7 @@ %if dc psllq m0, 8 %else - movd m5, [dct_weight2_tab + zigzagiq*4 GLOBAL] - punpcklqdq m5, m5 + LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL] pmuludq m0, m5 %endif @@ -434,12 +439,11 @@ ; ssd1[k] -= psy_weight * psy_value; mov r6, fenc_dctm %if HIGH_BIT_DEPTH - movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF] + LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF] %else - movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] + LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] psrad m3, 16 ; orig_coef %endif - punpcklqdq m3, m3 %if cpuflag(ssse3) psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef) %else @@ -453,9 +457,8 @@ ABSD m3, m4 SWAP 4, 3 %endif - movd m1, [dct_weight1_tab + zigzagiq*4 GLOBAL] + LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL] pmuludq m1, psy_trellis - punpcklqdq m1, m1 pmuludq m4, m1 psubq m0, m4 %if %1
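LOAD_DUP factors out a recurring pattern in this file: place one 32-bit value in dwords 0 and 2 of an xmm register, the only lanes pmuludq reads. Under cpuflag(ssse3) a single movddup does it (it loads 8 bytes, so the wanted dword lands in lanes 0 and 2 with a don't-care neighbor in lanes 1 and 3); older targets need movd plus punpcklqdq. An intrinsics sketch of the same idea; the runtime flag is illustrative, since the real macro selects at assembly time:

    #include <stdint.h>
    #include <emmintrin.h>   /* SSE2 */
    #include <pmmintrin.h>   /* SSE3 movddup */

    static __m128i load_dup( const int32_t *p, int have_sse3 )
    {
        if( have_sse3 )      /* assumes 8 readable bytes at p, as the asm does */
            return _mm_castpd_si128( _mm_loaddup_pd( (const double*)p ) );
        __m128i v = _mm_cvtsi32_si128( *p );  /* [v 0 0 0] */
        return _mm_unpacklo_epi64( v, v );    /* [v 0 v 0] */
    }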
x264-snapshot-20130224-2245.tar.bz2/common/x86/util.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/util.h
Changed
@@ -121,42 +121,132 @@ return amvd; } +#define x264_predictor_clip x264_predictor_clip_mmx2 +static int ALWAYS_INLINE x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) +{ + static const uint32_t pd_32 = 0x20; + intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; + + asm( + "movq (%2), %%mm5 \n" + "movd %6, %%mm3 \n" + "psllw $2, %%mm5 \n" // Convert to subpel + "pshufw $0xEE, %%mm5, %%mm6 \n" + "dec %k3 \n" + "jz 2f \n" // if( i_mvc == 1 ) {do the last iteration} + "punpckldq %%mm3, %%mm3 \n" + "punpckldq %%mm5, %%mm5 \n" + "movd %7, %%mm4 \n" + "lea (%0,%3,4), %3 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "add $8, %0 \n" + "movq %%mm3, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm1 \n" // mv == pmv + "pcmpeqd %%mm0, %%mm2 \n" // mv == 0 + "por %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1 + "pmovmskb %%mm2, %k2 \n" // (mv == pmv || mv == 0) * 0xf + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "pand %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32 + "psrlq %%mm2, %%mm0 \n" // drop mv0 if it's skipped + "movq %%mm0, (%5,%4,4) \n" + "and $24, %k2 \n" + "add $2, %4 \n" + "add $8, %k2 \n" + "shr $4, %k2 \n" // (4-val)>>1 + "sub %2, %4 \n" // +1 for each valid motion vector + "cmp %3, %0 \n" + "jl 1b \n" + "jg 3f \n" // if( i == i_mvc - 1 ) {do the last iteration} + + /* Do the last iteration */ + "2: \n" + "movd (%0), %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm3 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm3, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "movd %%mm0, (%5,%4,4) \n" + "inc %4 \n" + "and $1, %k2 \n" + "sub %2, %4 \n" // output += !(mv == pmv || mv == 0) + "3: \n" + :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i) + :"r"(dst), "g"(pmv), "m"(pd_32), "m"(M64( mvc )) + ); + return i; +} + +/* Same as the above, except we do (mv + 2) >> 2 on the input. 
*/ #define x264_predictor_roundclip x264_predictor_roundclip_mmx2 -static void ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max ) +static int ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) { - uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min ); - uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max ); static const uint64_t pw_2 = 0x0002000200020002ULL; - intptr_t i = i_mvc; + static const uint32_t pd_32 = 0x20; + intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; + asm( - "movd %2, %%mm5 \n" - "movd %3, %%mm6 \n" - "movq %4, %%mm7 \n" - "punpckldq %%mm5, %%mm5 \n" - "punpckldq %%mm6, %%mm6 \n" - "test $1, %0 \n" - "jz 1f \n" - "movd -4(%6,%0,4), %%mm0 \n" - "paddw %%mm7, %%mm0 \n" - "psraw $2, %%mm0 \n" - "pmaxsw %%mm5, %%mm0 \n" - "pminsw %%mm6, %%mm0 \n" - "movd %%mm0, -4(%5,%0,4) \n" - "dec %0 \n" - "jz 2f \n" - "1: \n" - "movq -8(%6,%0,4), %%mm0 \n" - "paddw %%mm7, %%mm0 \n" - "psraw $2, %%mm0 \n" - "pmaxsw %%mm5, %%mm0 \n" - "pminsw %%mm6, %%mm0 \n" - "movq %%mm0, -8(%5,%0,4) \n" - "sub $2, %0 \n" - "jnz 1b \n" - "2: \n" - :"+r"(i), "=m"(M64( dst )) - :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(dst), "r"(mvc), "m"(M64( mvc )) + "movq (%2), %%mm5 \n" + "movq %6, %%mm7 \n" + "movd %7, %%mm3 \n" + "pshufw $0xEE, %%mm5, %%mm6 \n" + "dec %k3 \n" + "jz 2f \n" + "punpckldq %%mm3, %%mm3 \n" + "punpckldq %%mm5, %%mm5 \n" + "movd %8, %%mm4 \n" + "lea (%0,%3,4), %3 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "add $8, %0 \n" + "paddw %%mm7, %%mm0 \n" + "psraw $2, %%mm0 \n" + "movq %%mm3, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm1 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm1, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "pand %%mm4, %%mm2 \n" + "psrlq %%mm2, %%mm0 \n" + "movq %%mm0, (%5,%4,4) \n" + "and $24, %k2 \n" + "add $2, %4 \n" + "add $8, %k2 \n" + "shr $4, %k2 \n" + "sub %2, %4 \n" + "cmp %3, %0 \n" + "jl 1b \n" + "jg 3f \n" + + /* Do the last iteration */ + "2: \n" + "movd (%0), %%mm0 \n" + "paddw %%mm7, %%mm0 \n" + "psraw $2, %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm3 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm3, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "movd %%mm0, (%5,%4,4) \n" + "inc %4 \n" + "and $1, %k2 \n" + "sub %2, %4 \n" + "3: \n" + :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i) + :"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(M64( mvc )) ); + return i; } #endif
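The rewritten helpers change contract: both now return how many candidates they kept, take the fullpel limits as a packed int16_t[2][2], and drop candidates equal to the predicted MV or to zero while clamping; the roundclip variant additionally rounds each candidate to fullpel with (mv+2)>>2 before comparing and clamping. A hedged C model of predictor_clip's observable behaviour (the SIMD code may scribble one extra clamped entry past the returned count, which callers never read):

    #include <stdint.h>

    static int predictor_clip_ref( int16_t (*dst)[2], int16_t (*mvc)[2],
                                   int i_mvc, int16_t mv_limit[2][2],
                                   uint32_t pmv )
    {
        int count = 0;
        for( int i = 0; i < i_mvc; i++ )
        {
            uint32_t mv = (uint32_t)(uint16_t)mvc[i][1] << 16 | (uint16_t)mvc[i][0];
            if( mv == pmv || mv == 0 )          /* redundant candidate */
                continue;
            for( int c = 0; c < 2; c++ )        /* clamp to subpel range */
            {
                int v  = mvc[i][c];
                int lo = mv_limit[0][c] * 4;    /* the psllw $2 in the asm */
                int hi = mv_limit[1][c] * 4;
                dst[count][c] = v < lo ? lo : v > hi ? hi : v;
            }
            count++;
        }
        return count;
    }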
x264-snapshot-20130224-2245.tar.bz2/common/x86/x86inc.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/x86inc.asm
Changed
@@ -6,7 +6,7 @@ ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Anton Mitrofanov <BugMaster@narod.ru> ;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above @@ -34,8 +34,12 @@ ; as this feature might be useful for others as well. Send patches or ideas ; to x264-devel@videolan.org . -%ifndef program_name - %define program_name x264 +%ifndef private_prefix + %define private_prefix x264 +%endif + +%ifndef public_prefix + %define public_prefix private_prefix %endif %define WIN64 0 @@ -56,29 +60,12 @@ %define mangle(x) x %endif -; Name of the .rodata section. -; Kludge: Something on OS X fails to align .rodata even given an align attribute, -; so use a different read-only section. %macro SECTION_RODATA 0-1 16 - %ifidn __OUTPUT_FORMAT__,macho64 - SECTION .text align=%1 - %elifidn __OUTPUT_FORMAT__,macho - SECTION .text align=%1 - fakegot: - %elifidn __OUTPUT_FORMAT__,aout - section .text - %else - SECTION .rodata align=%1 - %endif + SECTION .rodata align=%1 %endmacro -; aout does not support align= %macro SECTION_TEXT 0-1 16 - %ifidn __OUTPUT_FORMAT__,aout - SECTION .text - %else - SECTION .text align=%1 - %endif + SECTION .text align=%1 %endmacro %if WIN64 @@ -323,14 +310,18 @@ %if stack_size < 0 %assign stack_size -stack_size %endif - %if mmsize != 8 - %assign xmm_regs_used %2 + %assign stack_size_padded stack_size + %if WIN64 + %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 + %endif + %endif %endif %if mmsize <= 16 && HAVE_ALIGNED_STACK - %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) - %if xmm_regs_used > 6 - %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16 - %endif + %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) SUB rsp, stack_size_padded %else %assign %%reg_num (regs_used - 1) @@ -340,14 +331,6 @@ ; stack in a single instruction (i.e. mov rsp, rstk or mov ; rsp, [rsp+stack_size_padded]) mov rstk, rsp - %assign stack_size_padded stack_size - %if xmm_regs_used > 6 - %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16 - %if mmsize == 32 && xmm_regs_used & 1 - ; re-align to 32 bytes - %assign stack_size_padded (stack_size_padded + 16) - %endif - %endif %if %1 < 0 ; need to store rsp on stack sub rsp, gprsize+stack_size_padded and rsp, ~(%%stack_alignment-1) @@ -359,9 +342,7 @@ %xdefine rstkm rstk %endif %endif - %if xmm_regs_used > 6 - WIN64_PUSH_XMM - %endif + WIN64_PUSH_XMM %endif %endif %endmacro @@ -422,40 +403,55 @@ %endmacro %macro WIN64_PUSH_XMM 0 - %assign %%i xmm_regs_used - %rep (xmm_regs_used-6) - %assign %%i %%i-1 - movdqa [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i - %endrep + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
+ %if xmm_regs_used > 6 + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + movaps [rstk + stack_offset + 24], xmm7 + %endif + %if xmm_regs_used > 8 + %assign %%i 8 + %rep xmm_regs_used-8 + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif %endmacro %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 6 - SUB rsp, (xmm_regs_used-6)*16+16 - WIN64_PUSH_XMM + %if xmm_regs_used > 8 + %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 + SUB rsp, stack_size_padded %endif + WIN64_PUSH_XMM %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 1 - %if xmm_regs_used > 6 + %assign %%pad_size 0 + %if xmm_regs_used > 8 %assign %%i xmm_regs_used - %rep (xmm_regs_used-6) + %rep xmm_regs_used-8 %assign %%i %%i-1 - movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)] + movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] %endrep - %if stack_size_padded == 0 - add %1, (xmm_regs_used-6)*16+16 - %endif %endif %if stack_size_padded > 0 %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) mov rsp, rstkm %else add %1, stack_size_padded + %assign %%pad_size stack_size_padded %endif %endif + %if xmm_regs_used > 7 + movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %endif %endmacro %macro WIN64_RESTORE_XMM 1 @@ -643,38 +639,48 @@ ; Applies any symbol mangling needed for C linkage, and sets up a define such that ; subsequent uses of the function name automatically refer to the mangled version. ; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). %macro cglobal 1-2+ "" ; name, [PROLOGUE args] - ; the "" is a workaround for nasm, which fails if SUFFIX is empty - ; and we call cglobal_internal with just %1 %+ SUFFIX (without %2) - cglobal_internal %1 %+ SUFFIX, %2 + cglobal_internal 1, %1 %+ SUFFIX, %2 %endmacro -%macro cglobal_internal 1-2+ - %ifndef cglobaled_%1 - %xdefine %1 mangle(program_name %+ _ %+ %1) - %xdefine %1.skip_prologue %1 %+ .skip_prologue - CAT_XDEFINE cglobaled_, %1, 1 +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix
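Context for the WIN64_PUSH_XMM rewrite: the Win64 ABI reserves a 32-byte shadow space directly above the return address for the callee's free use. Relative to the return address at rstk + stack_offset, offsets +8 and +24 fall inside that space and happen to be 16-byte aligned, so xmm6 and xmm7 can be saved with movaps at zero allocation cost; only xmm8 and up still need explicit stack, and the separate "+ 32" in ALLOC_STACK reserves shadow space for this function's own callees. Frame sketch, high addresses first:

    rstk + stack_offset + 24   xmm7 save slot  \  caller-provided
    rstk + stack_offset + 8    xmm6 save slot  /  32-byte shadow space
    rstk + stack_offset + 0    return address
    rsp  + stack_size + 32...  xmm8..xmmN-1 saves (explicitly allocated)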
x264-snapshot-20130224-2245.tar.bz2/common/x86/x86util.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/x86util.asm
Changed
@@ -30,10 +30,14 @@ %assign SIZEOF_PIXEL 1 %assign SIZEOF_DCTCOEF 2 %define pixel byte +%define vpbroadcastdct vpbroadcastw +%define vpbroadcastpix vpbroadcastb %if HIGH_BIT_DEPTH %assign SIZEOF_PIXEL 2 %assign SIZEOF_DCTCOEF 4 %define pixel word + %define vpbroadcastdct vpbroadcastd + %define vpbroadcastpix vpbroadcastw %endif %assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE @@ -52,7 +56,10 @@ %macro SBUTTERFLY 4 -%if avx_enabled && mmsize == 16 +%ifidn %1, dqqq + vperm2i128 m%4, m%2, m%3, q0301 ; punpckh + vinserti128 m%2, m%2, xm%3, 1 ; punpckl +%elif avx_enabled && mmsize >= 16 punpckh%1 m%4, m%2, m%3 punpckl%1 m%2, m%3 %else @@ -214,15 +221,20 @@ %endif %endmacro -%macro ABSD 2 +%macro ABSD 2-3 %if cpuflag(ssse3) pabsd %1, %2 %else - pxor %1, %1 - pcmpgtd %1, %2 - pxor %2, %1 - psubd %2, %1 - SWAP %1, %2 + %define %%s %2 +%if %0 == 3 + mova %3, %2 + %define %%s %3 +%endif + pxor %1, %1 + pcmpgtd %1, %%s + pxor %%s, %1 + psubd %%s, %1 + SWAP %1, %%s %endif %endmacro @@ -255,9 +267,13 @@ %endmacro %imacro SPLATW 2-3 0 - PSHUFLW %1, %2, (%3)*q1111 +%if cpuflag(avx2) && %3 == 0 + vpbroadcastw %1, %2 +%else + PSHUFLW %1, %2, (%3)*q1111 %if mmsize == 16 - punpcklqdq %1, %1 + punpcklqdq %1, %1 +%endif %endif %endmacro @@ -275,16 +291,24 @@ %endmacro %macro HADDD 2 ; sum junk -%if mmsize == 16 +%if sizeof%1 == 32 +%define %2 xmm%2 + vextracti128 %2, %1, 1 +%define %1 xmm%1 + paddd %1, %2 +%endif +%if mmsize >= 16 movhlps %2, %1 paddd %1, %2 %endif PSHUFLW %2, %1, q0032 paddd %1, %2 +%undef %1 +%undef %2 %endmacro %macro HADDW 2 ; reg, tmp -%if cpuflag(xop) && mmsize == 16 +%if cpuflag(xop) && sizeof%1 == 16 vphaddwq %1, %1 movhlps %2, %1 paddd %1, %2 @@ -294,22 +318,41 @@ %endif %endmacro -%macro HADDUW 2 -%if cpuflag(xop) && mmsize == 16 - vphadduwq %1, %1 - movhlps %2, %1 - paddd %1, %2 +%macro HADDUWD 2 +%if cpuflag(xop) && sizeof%1 == 16 + vphadduwd %1, %1 %else psrld %2, %1, 16 pslld %1, 16 psrld %1, 16 paddd %1, %2 - HADDD %1, %2 +%endif +%endmacro + +%macro HADDUW 2 +%if cpuflag(xop) && sizeof%1 == 16 + vphadduwq %1, %1 + movhlps %2, %1 + paddd %1, %2 +%else + HADDUWD %1, %2 + HADDD %1, %2 %endif %endmacro %macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp -%if cpuflag(ssse3) +; AVX2 version uses a precalculated extra input that +; can be re-used across calls +%if sizeof%1==32 + ; %3 = abcdefgh ijklmnop (lower address) + ; %2 = ABCDEFGH IJKLMNOP (higher address) +; vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH +%if %4 < 16 + palignr %1, %5, %3, %4 ; %1 = bcdefghi jklmnopA +%else + palignr %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO +%endif +%elif cpuflag(ssse3) %if %0==5 palignr %1, %2, %3, %4 %else @@ -475,7 +518,7 @@ %endif %elifidn %1, q shufps m%5, m%3, m%4, q3131 - shufps m%3, m%4, q2020 + shufps m%3, m%3, m%4, q2020 SWAP %4, %5 %endif %endmacro @@ -498,22 +541,24 @@ ; %5(%6): tmpregs %if %1!=0 ; have to reorder stuff for horizontal op %ifidn %2, sumsub - %define ORDER ord - ; sumsub needs order because a-b != b-a unless a=b + %define ORDER ord + ; sumsub needs order because a-b != b-a unless a=b %else - %define ORDER unord - ; if we just max, order doesn't matter (allows pblendw+or in sse4) + %define ORDER unord + ; if we just max, order doesn't matter (allows pblendw+or in sse4) %endif %if %1==1 - TRANS d, ORDER, %3, %4, %5, %6 + TRANS d, ORDER, %3, %4, %5, %6 %elif %1==2 - %if mmsize==8 - SBUTTERFLY dq, %3, %4, %5 - %else - TRANS q, ORDER, %3, %4, %5, %6 - %endif + %if mmsize==8 + SBUTTERFLY dq, %3, %4, %5 + %else + TRANS q, ORDER, %3, %4, %5, %6 + %endif %elif %1==4 - 
SBUTTERFLY qdq, %3, %4, %5 + SBUTTERFLY qdq, %3, %4, %5 + %elif %1==8 + SBUTTERFLY dqqq, %3, %4, %5 %endif %endif %ifidn %2, sumsub @@ -675,11 +720,18 @@ %endmacro -%macro LOAD_DIFF 5 +%macro LOAD_DIFF 5-6 1 %if HIGH_BIT_DEPTH +%if %6 ; %5 aligned? mova %1, %4
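Several macros here (SBUTTERFLY with the new dqqq mode, HADDD/HADDW/HADDUW, SPLATW, PALIGNR) learn to operate on ymm registers; the recurring trick for horizontal reductions is to fold the upper 128-bit lane onto the lower one first, then reduce within the xmm exactly as before. An AVX2 intrinsics sketch of the widened HADDD (illustrative, not the macro itself):

    #include <stdint.h>
    #include <immintrin.h>

    static int32_t haddd_ref( __m256i v )
    {
        __m128i x = _mm_add_epi32( _mm256_castsi256_si128( v ),
                                   _mm256_extracti128_si256( v, 1 ) );
        x = _mm_add_epi32( x, _mm_unpackhi_epi64( x, x ) );  /* fold high qword */
        x = _mm_add_epi32( x, _mm_shuffle_epi32( x, 1 ) );   /* fold dword 1    */
        return _mm_cvtsi128_si32( x );                       /* sum of 8 dwords */
    }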
x264-snapshot-20130224-2245.tar.bz2/configure -> x264-snapshot-20130723-2245.tar.bz2/configure
Changed
@@ -25,6 +25,7 @@ --system-libx264 use system libx264 instead of internal --enable-shared build shared library --enable-static build static library + --disable-opencl disable OpenCL features --disable-gpl disable GPL-only features --disable-thread disable multithreaded encoding --enable-win32thread use win32threads (windows only) @@ -46,7 +47,7 @@ --sysroot=SYSROOT root of cross-build tree External library support: - --disable-avs disable avisynth support (windows only) + --disable-avs disable avisynth support --disable-swscale disable swscale support --disable-lavf disable libavformat support --disable-ffms disable ffmpegsource support @@ -80,6 +81,9 @@ [[ "$arg" = -falign-loops* ]] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Wshadow ] && arg= + [[ "$arg" = -mpreferred-stack-boundary* ]] && arg= + [[ "$arg" = -l* ]] && arg= + [[ "$arg" = -L* ]] && arg= if [ $compiler = ICL ]; then [ "$arg" = -Wall ] && arg=-W0 [ "$arg" = -g ] && arg=-Z7 @@ -133,7 +137,7 @@ [ -n "$1" ] && echo "#include <$1>" > conftest.c echo "int main () { $3 return 0; }" >> conftest.c if [ $compiler = ICL ]; then - cc_cmd="$CC conftest.c $CFLAGS $2 -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" + cc_cmd="$CC conftest.c $(intel_cflags $CFLAGS $2) -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" else cc_cmd="$CC conftest.c $CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest" fi @@ -273,6 +277,7 @@ bit_depth="8" chroma_format="all" compiler="GNU" +opencl="yes" CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)" LDFLAGS="$LDFLAGS" @@ -285,7 +290,7 @@ EXE="" # list of all preprocessor HAVE values we can define -CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT" +CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL" # parse options @@ -381,6 +386,9 @@ --host=*) host="$optarg" ;; + --disable-opencl) + opencl="no" + ;; --cross-prefix=*) cross_prefix="$optarg" ;; @@ -521,6 +529,13 @@ fi HAVE_GETOPT_LONG=0 ;; + *qnx*) + SYS="QNX" + define HAVE_MALLOC_H + libm="-lm" + HAVE_GETOPT_LONG=0 + CFLAGS="$CFLAGS -I\$(SRCPATH)/extras" + ;; *) die "Unknown system $host, edit the configure" ;; @@ -564,6 +579,7 @@ elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win32 -DPREFIX" LDFLAGS="$LDFLAGS -Wl,--large-address-aware" + [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-i386 $RCFLAGS" else ASFLAGS="$ASFLAGS -f elf" @@ -583,6 +599,7 @@ ASFLAGS="$ASFLAGS -f win32 -m amd64" # only the GNU toolchain is inconsistent in prefixing function names with _ [ $compiler = GNU ] && cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX" + [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-x86-64 $RCFLAGS" else ASFLAGS="$ASFLAGS -f elf -m amd64" @@ -703,6 +720,10 @@ exit 1 fi define HAVE_MMX + if cc_check '' -mpreferred-stack-boundary=5 ; then + CFLAGS="$CFLAGS -mpreferred-stack-boundary=5" + define HAVE_32B_STACK_ALIGNMENT + fi fi if [ $asm = auto -a $ARCH = ARM ] ; then @@ -770,6 +791,9 @@ thread="win32" fi ;; + QNX) + cc_check pthread.h -lc && thread="posix" && libpthread="-lc" + ;; *) cc_check pthread.h -lpthread && thread="posix" && libpthread="-lpthread" ;; @@ -917,8 +941,16 @@ avs="no" # 
cygwin can use avisynth if it can use LoadLibrary if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibrary(0);") ; then - avs="yes" + avs="avisynth" + define HAVE_AVS + define USE_AVXSYNTH 0 + elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then + # AvxSynth currently only supports Linux and OSX + avs="avxsynth" define HAVE_AVS + define USE_AVXSYNTH 1 + AVS_LIBS="-ldl" + LDFLAGSCLI="$AVS_LIBS $LDFLAGSCLI" fi fi @@ -978,6 +1010,7 @@ if [ "$bit_depth" -gt "8" ]; then define HIGH_BIT_DEPTH ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1" + opencl="no" else ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=0" fi @@ -992,6 +1025,30 @@ [ $interlaced = yes ] && define HAVE_INTERLACED && x264_interlaced=1 || x264_interlaced=0 +libdl="" +if [ "$opencl" = "yes" ]; then + opencl="no" + log_check "for perl" + output=$(perl -v) + if [ "$output" = "" ]; then + log_fail + echo 'OpenCL support requires perl to compile.' + echo 'use --disable-opencl to compile without OpenCL.' + exit 1 + fi + log_ok + # cygwin can use opencl if it can use LoadLibrary + if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibrary(0);") ; then + opencl="yes" + define HAVE_OPENCL + elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then + opencl="yes" + define HAVE_OPENCL + libdl="-ldl" + fi + LDFLAGS="$LDFLAGS $libdl" +fi + #define undefined vars as 0 for var in $CONFIG_HAVE; do grep -q "HAVE_$var 1" config.h || define HAVE_$var 0 @@ -1083,6 +1140,7 @@ PROF_GEN_LD=$PROF_GEN_LD PROF_USE_CC=$PROF_USE_CC PROF_USE_LD=$PROF_USE_LD +HAVE_OPENCL=$opencl EOF if [ $compiler = ICL ]; then @@ -1162,7 +1220,7 @@ Description: H.264 (MPEG4 AVC) encoder library Version: $(grep POINTVER < x264_config.h | sed -e 's/.* "//; s/".*//') Libs: -L$libdir -lx264 -Libs.private: $libpthread $libm +Libs.private: $libpthread $libm $libdl Cflags: -I$includedir EOF @@ -1186,6 +1244,7 @@ gpac: $gpac gpl: $gpl thread: $thread +opencl: $opencl filters: $filters debug: $debug gprof: $gprof
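Summary of the configure changes, for packagers: OpenCL-accelerated lookahead becomes an auto-detected feature. It requires perl at build time, is forced off for high-bit-depth builds, links -ldl on Linux and OS X, shows up in the pkg-config Libs.private line, and can be skipped explicitly with ./configure --disable-opencl. Independently, --disable-avs loses its "windows only" restriction because AvxSynth now covers Linux and OS X, a QNX target is added, GNU ld on Windows gains --nxcompat and --dynamicbase (DEP and ASLR), and GCC builds on x86 try -mpreferred-stack-boundary=5 to obtain the 32-byte stack alignment AVX2 code wants.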
x264-snapshot-20130224-2245.tar.bz2/doc/regression_test.txt -> x264-snapshot-20130723-2245.tar.bz2/doc/regression_test.txt
Changed
@@ -4,7 +4,7 @@ inherently caused by compression. # Install and compile x264 : -svn co svn://svn.videolan.org/x264/trunk x264 +git clone git://git.videolan.org/x264.git x264 cd x264 ./configure make
x264-snapshot-20130224-2245.tar.bz2/encoder/analyse.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/analyse.c
Changed
@@ -467,8 +467,8 @@ if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col ) h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv ); } - h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; - h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; + h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; + h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) ) { int mb_y = h->mb.i_mb_y >> SLICE_MBAFF; @@ -516,8 +516,8 @@ h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range ); h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] ); h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 ); - h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; - h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; + h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; + h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; } } if( PARAM_INTERLACED ) @@ -527,8 +527,8 @@ h->mb.mv_max[1] = h->mb.mv_maxy_row[i]; h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i]; h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i]; - h->mb.mv_min_fpel[1] = h->mb.mv_miny_fpel_row[i]; - h->mb.mv_max_fpel[1] = h->mb.mv_maxy_fpel_row[i]; + h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i]; + h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i]; } #undef CLIP_FMV @@ -888,7 +888,7 @@ { if( !h->mb.b_lossless && predict_mode[5] >= 0 ) { - int satd[9]; + ALIGNED_ARRAY_16( int32_t, satd,[9] ); h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd ); int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; satd[i_pred_mode] -= 3 * lambda; @@ -1006,7 +1006,7 @@ { if( !h->mb.b_lossless && predict_mode[5] >= 0 ) { - int satd[9]; + ALIGNED_ARRAY_16( int32_t, satd,[9] ); h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd ); int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; satd[i_pred_mode] -= 3 * lambda; @@ -1706,7 +1706,7 @@ static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size, int chroma ) { - ALIGNED_ARRAY_16( pixel, pix1,[16*16] ); + ALIGNED_ARRAY_N( pixel, pix1,[16*16] ); pixel *pix2 = pix1+8; int i_stride = h->mb.pic.i_stride[1]; int chroma_h_shift = chroma <= CHROMA_422; @@ -1890,8 +1890,8 @@ static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel ) { - ALIGNED_ARRAY_16( pixel, pix, [4],[16*16] ); - ALIGNED_ARRAY_16( pixel, bi, [2],[16*16] ); + ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] ); + ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] ); int i_chroma_cost = 0; int chromapix = h->luma2chroma_pixel[i_pixel]; @@ -1984,8 +1984,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) { - ALIGNED_ARRAY_16( pixel, pix0,[16*16] ); - ALIGNED_ARRAY_16( pixel, pix1,[16*16] ); + ALIGNED_ARRAY_N( pixel, pix0,[16*16] ); + ALIGNED_ARRAY_N( pixel, pix1,[16*16] ); pixel *src0, *src1; intptr_t stride0 = 16, stride1 = 16; int i_ref, i_mvc; @@ -2454,7 +2454,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd ) { - ALIGNED_ARRAY_16( pixel, pix,[2],[16*8] ); + ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] ); ALIGNED_4( int16_t mvc[3][2] ); h->mb.i_partition = D_16x8; @@ -2836,12 +2836,28 @@ int plane_count = CHROMA444 && h->mb.b_chroma_me ? 
3 : 1; int i_cost8 = 0, i_cost4 = 0; - for( int p = 0; p < plane_count; p++ ) + /* Not all platforms have a merged SATD function */ + if( h->pixf.sa8d_satd[PIXEL_16x16] ) { - i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, - h->mb.pic.p_fdec[p], FDEC_STRIDE ); - i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, - h->mb.pic.p_fdec[p], FDEC_STRIDE ); + uint64_t cost = 0; + for( int p = 0; p < plane_count; p++ ) + { + cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, + h->mb.pic.p_fdec[p], FDEC_STRIDE ); + + } + i_cost8 = (uint32_t)cost; + i_cost4 = (uint32_t)(cost >> 32); + } + else + { + for( int p = 0; p < plane_count; p++ ) + { + i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, + h->mb.pic.p_fdec[p], FDEC_STRIDE ); + i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, + h->mb.pic.p_fdec[p], FDEC_STRIDE ); + } } h->mb.b_transform_8x8 = i_cost8 < i_cost4; @@ -3002,8 +3018,8 @@ h->mb.i_qp = x264_ratecontrol_mb_qp( h ); /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB, * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */ - if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ) - h->mb.i_qp = h->mb.i_last_qp; + if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 ) + h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp; if( h->param.analyse.b_mb_info ) h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
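The merged cost call in the hunk above packs two metrics into one 64-bit return value: the low 32 bits accumulate the SA8D cost and the high 32 bits the SATD cost, so a single call per plane feeds both i_cost8 and i_cost4. A minimal standalone sketch of that packing convention, with a stand-in metric function rather than x264's real one:

#include <assert.h>
#include <stdint.h>

/* Stand-in for a merged cost function: SATD in the high half, SA8D in
 * the low half. Summing packed results across planes stays correct as
 * long as neither running 32-bit half can overflow into the other. */
static uint64_t sa8d_satd_16x16( uint32_t sa8d_cost, uint32_t satd_cost )
{
    return (uint64_t)satd_cost << 32 | sa8d_cost;
}

int main(void)
{
    uint64_t cost = 0;
    cost += sa8d_satd_16x16( 1000, 1400 );      /* plane 0 */
    cost += sa8d_satd_16x16(  900, 1100 );      /* plane 1 */
    uint32_t i_cost8 = (uint32_t)cost;          /* 1900: summed SA8D */
    uint32_t i_cost4 = (uint32_t)(cost >> 32);  /* 2500: summed SATD */
    assert( i_cost8 == 1900 && i_cost4 == 2500 );
    return 0;
}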
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/analyse.h -> x264-snapshot-20130723-2245.tar.bz2/encoder/analyse.h
Changed
@@ -34,7 +34,7 @@ void x264_macroblock_analyse( x264_t *h ); void x264_slicetype_decide( x264_t *h ); -void x264_slicetype_analyse( x264_t *h, int keyframe ); +void x264_slicetype_analyse( x264_t *h, int intra_minigop ); int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/cabac.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/cabac.c
Changed
@@ -152,8 +152,10 @@ int i_dqp = h->mb.i_qp - h->mb.i_last_qp; int ctx; - /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */ - if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] ) + /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely + * flat background area. Don't do this if it would raise the quantizer, since that could + * cause unexpected deblocking artifacts. */ + if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] && h->mb.i_qp > h->mb.i_last_qp ) { #if !RDO_SKIP_BS h->mb.i_qp = h->mb.i_last_qp; @@ -161,9 +163,7 @@ i_dqp = 0; } - /* Since, per the above, empty-CBP I16x16 blocks never have delta quants, - * we don't have to check for them. */ - ctx = h->mb.i_last_dqp && h->mb.cbp[h->mb.i_mb_prev_xy]; + ctx = h->mb.i_last_dqp && (h->mb.type[h->mb.i_mb_prev_xy] == I_16x16 || (h->mb.cbp[h->mb.i_mb_prev_xy]&0x3f)); if( i_dqp != 0 ) { @@ -644,26 +644,17 @@ } } -static const uint16_t significant_coeff_flag_offset[2][14] = -{ - { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 }, - { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 } -}; -static const uint16_t last_coeff_flag_offset[2][14] = -{ - { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748 }, - { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757 } -}; -static const uint16_t coeff_abs_level_m1_offset[14] = -{ - 227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766 -}; -#if RDO_SKIP_BS -extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][63]; +#if !RDO_SKIP_BS +extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][64]; extern const uint8_t x264_last_coeff_flag_offset_8x8[63]; extern const uint8_t x264_coeff_flag_offset_chroma_422_dc[7]; +extern const uint16_t x264_significant_coeff_flag_offset[2][16]; +extern const uint16_t x264_last_coeff_flag_offset[2][16]; +extern const uint16_t x264_coeff_abs_level_m1_offset[16]; +extern const uint8_t x264_count_cat_m1[14]; #else -const uint8_t x264_significant_coeff_flag_offset_8x8[2][63] = +/* Padded to [64] for easier addressing */ +const uint8_t x264_significant_coeff_flag_offset_8x8[2][64] = {{ 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, @@ -683,6 +674,21 @@ 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 }; const uint8_t x264_coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */ +const uint16_t x264_significant_coeff_flag_offset[2][16] = +{ + { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718, 0, 0 }, + { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733, 0, 0 } +}; +const uint16_t x264_last_coeff_flag_offset[2][16] = +{ + { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748, 0, 0 }, + { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757, 0, 0 } +}; +const uint16_t x264_coeff_abs_level_m1_offset[16] = +{ + 227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766 +}; +const uint8_t x264_count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63}; #endif // node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). 
@@ -694,20 +700,20 @@ /* 4:2:2 chroma dc uses a slightly different state machine for some reason, also note that * 4:2:0 chroma dc doesn't use the last state so it has identical output with both arrays. */ static const uint8_t coeff_abs_levelgt1_ctx_chroma_dc[8] = { 5, 5, 5, 5, 6, 7, 8, 8 }; + static const uint8_t coeff_abs_level_transition[2][8] = { /* update node ctx after coding a level=1 */ { 1, 2, 3, 3, 4, 5, 6, 7 }, /* update node ctx after coding a level>1 */ { 4, 4, 4, 4, 5, 6, 7, 7 } }; -static const uint8_t count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63}; #if !RDO_SKIP_BS static ALWAYS_INLINE void x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int chroma422dc ) { - int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat]; + int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat]; int coeff_idx = -1, node_ctx = 0; int last = h->quantf.coeff_last[ctx_block_cat]( l ); const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; @@ -747,7 +753,7 @@ } else { - int count_m1 = count_cat_m1[ctx_block_cat]; + int count_m1 = x264_count_cat_m1[ctx_block_cat]; if( count_m1 == 63 ) { const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; @@ -787,10 +793,20 @@ x264_cabac_encode_bypass( cb, coeff_sign ); } while( --coeff_idx >= 0 ); } -static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) + +void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 ); } + +static void ALWAYS_INLINE x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_c( h, cb, ctx_block_cat, l ); +#endif +} static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { /* Template a version specifically for chroma 4:2:2 DC in order to avoid @@ -806,16 +822,16 @@ static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc ) { const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; - int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat]; + int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat]; int last = h->quantf.coeff_last[ctx_block_cat]( l ); int coeff_abs = abs(l[last]); int ctx = coeff_abs_level1_ctx[0] + ctx_level; int node_ctx; const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; - if( last != (b_8x8 ? 63 : chroma422dc ? 7 : count_cat_m1[ctx_block_cat]) ) + if( last != (b_8x8 ? 63 : chroma422dc ? 
7 : x264_count_cat_m1[ctx_block_cat]) ) { x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] : chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 ); @@ -888,17 +904,35 @@ } } -static void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 1, 0 ); } -static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 ); + x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0, 0 ); } -static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) + +static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0, 0 ); +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l ); +#endif +} +static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l ); +#endif +} + +static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 ); } #endif @@ -1051,25 +1085,23 @@
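This cabac.c hunk does two things: the context-offset tables are exported with x264_ prefixes and padded to [2][64] and [16] ("for easier addressing", per the in-diff comment), and x264_cabac_block_residual becomes a thin dispatcher that calls a hand-optimized x86-64 kernel through h->bsf when available, falling back to the portable C routine otherwise. A sketch of that dispatch pattern under hypothetical names (the fast kernel below is a plain C stand-in, not real assembly):

#include <stdint.h>
#include <stdio.h>

typedef void (*residual_fn)( const int16_t *l, int ctx_block_cat );

static void block_residual_c( const int16_t *l, int ctx_block_cat )
{
    (void)l;
    printf( "portable C path, cat %d\n", ctx_block_cat );
}

/* Stand-in for the asm kernel a real build would select via CPU detection. */
static void block_residual_fast( const int16_t *l, int ctx_block_cat )
{
    (void)l;
    printf( "optimized path, cat %d\n", ctx_block_cat );
}

typedef struct { residual_fn cabac_block_residual; } bsf_t;

static void init_bitstream_functions( bsf_t *bsf, int have_fast_path )
{
    bsf->cabac_block_residual = have_fast_path ? block_residual_fast
                                               : block_residual_c;
}

int main(void)
{
    bsf_t bsf;
    int16_t coeffs[16] = { 3, -1, 0 };
    init_bitstream_functions( &bsf, 1 );    /* e.g. ARCH_X86_64 && HAVE_MMX */
    bsf.cabac_block_residual( coeffs, 5 );  /* one indirect call per block */
    return 0;
}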
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/cavlc.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/cavlc.c
Changed
@@ -128,13 +128,13 @@ unsigned int i_sign; /* level and run and total */ - /* set these to 2 to allow branchless i_trailing calculation */ - runlevel.level[1] = 2; - runlevel.level[2] = 2; i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel ); x264_prefetch( &x264_run_before[runlevel.mask] ); i_total_zero = runlevel.last + 1 - i_total; + /* branchless i_trailing calculation */ + runlevel.level[i_total+0] = 2; + runlevel.level[i_total+1] = 2; i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1 | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2) | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4); @@ -213,11 +213,14 @@ bs_t *s = &h->out.bs; int i_dqp = h->mb.i_qp - h->mb.i_last_qp; - /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */ + /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely + * flat background area. Don't do this if it would raise the quantizer, since that could + * cause unexpected deblocking artifacts. */ if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) && !h->mb.cache.non_zero_count[x264_scan8[LUMA_DC]] && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] - && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] ) + && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] + && h->mb.i_qp > h->mb.i_last_qp ) { #if !RDO_SKIP_BS h->mb.i_qp = h->mb.i_last_qp; @@ -268,20 +271,33 @@ } } -static inline void x264_cavlc_macroblock_luma_residual( x264_t *h, int i8start, int i8end ) +static ALWAYS_INLINE void x264_cavlc_macroblock_luma_residual( x264_t *h, int plane_count ) { if( h->mb.b_transform_8x8 ) { /* shuffle 8x8 dct coeffs into 4x4 lists */ - for( int i8 = i8start; i8 <= i8end; i8++ ) - if( h->mb.cache.non_zero_count[x264_scan8[i8*4]] ) - h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] ); + for( int p = 0; p < plane_count; p++ ) + for( int i8 = 0; i8 < 4; i8++ ) + if( h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] ) + h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[p*16+i8*4], h->dct.luma8x8[p*4+i8], + &h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] ); } - for( int i8 = i8start; i8 <= i8end; i8++ ) - if( h->mb.i_cbp_luma & (1 << (i8&3)) ) + for( int p = 0; p < plane_count; p++ ) + FOREACH_BIT( i8, 0, h->mb.i_cbp_luma ) for( int i4 = 0; i4 < 4; i4++ ) - x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4] ); + x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] ); +} + +static ALWAYS_INLINE void x264_cavlc_partition_luma_residual( x264_t *h, int i8, int p ) +{ + if( h->mb.b_transform_8x8 && h->mb.cache.non_zero_count[x264_scan8[i8*4]] ) + h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4+p*16], h->dct.luma8x8[i8+p*4], + &h->mb.cache.non_zero_count[x264_scan8[i8*4+p*16]] ); + + if( h->mb.i_cbp_luma & (1 << i8) ) + for( int i4 = 0; i4 < 4; i4++ ) + x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] ); } static void x264_cavlc_mb_header_i( x264_t *h, int i_mb_type, int i_mb_i_offset, int chroma ) @@ -552,7 +568,7 @@ else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma ) { x264_cavlc_qp_delta( h ); - x264_cavlc_macroblock_luma_residual( h, 0, plane_count*4-1 ); + x264_cavlc_macroblock_luma_residual( h, plane_count ); } if( h->mb.i_cbp_chroma ) { @@ -612,7 +628,7 @@ for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- ) { 
for( int p = 0; p < plane_count; p++ ) - x264_cavlc_macroblock_luma_residual( h, p*4+i8, p*4+i8 ); + x264_cavlc_partition_luma_residual( h, i8, p ); if( h->mb.i_cbp_chroma ) { if( CHROMA_FORMAT == CHROMA_422 ) @@ -665,7 +681,7 @@ h->out.bs.i_bits_encoded = x264_cavlc_intra4x4_pred_size( h, 4*i8, i_mode ); bs_write_ue( &h->out.bs, cbp_to_golomb[!CHROMA444][1][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] ); for( int p = 0; p < plane_count; p++ ) - x264_cavlc_macroblock_luma_residual( h, p*4+i8, p*4+i8 ); + x264_cavlc_partition_luma_residual( h, i8, p ); return h->out.bs.i_bits_encoded; }
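The cavlc.c fix moves the sentinel stores after coeff_level_run and bases them at i_total: writing runlevel.level[i_total] = runlevel.level[i_total+1] = 2 guarantees the three probes below always read defined values, with absent coefficients looking like magnitude > 1 and therefore not counted as trailing ones. The probe itself is a branchless |x| > 1 test: ((x+1) | (1-x)) has its sign bit set exactly when x < -1 or x > 1. A standalone check of that identity (it assumes the arithmetic right shift x264 already relies on):

#include <assert.h>

/* Branchless |x| > 1: x+1 is negative iff x < -1, and 1-x is negative
 * iff x > 1, so the OR's sign bit flags either case; a shift plus a
 * mask turns it into a 0/1 flag with no branch. */
static int abs_gt1( int x )
{
    return (((x + 1) | (1 - x)) >> 31) & 1;
}

int main(void)
{
    assert( !abs_gt1(-1) && !abs_gt1(0) && !abs_gt1(1) );
    assert(  abs_gt1(-2) &&  abs_gt1(2) && abs_gt1(37) );
    return 0;
}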
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/encoder.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/encoder.c
Changed
@@ -353,34 +353,49 @@ /* If we are within a reasonable distance of the end of the memory allocated for the bitstream, */ /* reallocate, adding an arbitrary amount of space. */ -static int x264_bitstream_check_buffer( x264_t *h ) +static int x264_bitstream_check_buffer_internal( x264_t *h, int size, int b_cabac, int i_nal ) { - uint8_t *bs_bak = h->out.p_bitstream; - int max_row_size = (2500 << SLICE_MBAFF) * h->mb.i_mb_width; - if( (h->param.b_cabac && (h->cabac.p_end - h->cabac.p < max_row_size)) || - (h->out.bs.p_end - h->out.bs.p < max_row_size) ) + if( (b_cabac && (h->cabac.p_end - h->cabac.p < size)) || + (h->out.bs.p_end - h->out.bs.p < size) ) { - h->out.i_bitstream += max_row_size; - CHECKED_MALLOC( h->out.p_bitstream, h->out.i_bitstream ); - h->mc.memcpy_aligned( h->out.p_bitstream, bs_bak, (h->out.i_bitstream - max_row_size) & ~15 ); - intptr_t delta = h->out.p_bitstream - bs_bak; + int buf_size = h->out.i_bitstream + size; + uint8_t *buf = x264_malloc( buf_size ); + if( !buf ) + return -1; + int aligned_size = h->out.i_bitstream & ~15; + h->mc.memcpy_aligned( buf, h->out.p_bitstream, aligned_size ); + memcpy( buf + aligned_size, h->out.p_bitstream + aligned_size, h->out.i_bitstream - aligned_size ); + + intptr_t delta = buf - h->out.p_bitstream; h->out.bs.p_start += delta; h->out.bs.p += delta; - h->out.bs.p_end = h->out.p_bitstream + h->out.i_bitstream; + h->out.bs.p_end = buf + buf_size; h->cabac.p_start += delta; h->cabac.p += delta; - h->cabac.p_end = h->out.p_bitstream + h->out.i_bitstream; + h->cabac.p_end = buf + buf_size; - for( int i = 0; i <= h->out.i_nal; i++ ) + for( int i = 0; i <= i_nal; i++ ) h->out.nal[i].p_payload += delta; - x264_free( bs_bak ); + + x264_free( h->out.p_bitstream ); + h->out.p_bitstream = buf; + h->out.i_bitstream = buf_size; } return 0; -fail: - x264_free( bs_bak ); - return -1; +} + +static int x264_bitstream_check_buffer( x264_t *h ) +{ + int max_row_size = (2500 << SLICE_MBAFF) * h->mb.i_mb_width; + return x264_bitstream_check_buffer_internal( h, max_row_size, h->param.b_cabac, h->out.i_nal ); +} + +static int x264_bitstream_check_buffer_filler( x264_t *h, int filler ) +{ + filler += 32; // add padding for safety + return x264_bitstream_check_buffer_internal( h, filler, 0, -1 ); } #if HAVE_THREAD @@ -417,17 +432,33 @@ static int x264_validate_parameters( x264_t *h, int b_open ) { #if HAVE_MMX -#ifdef __SSE__ - if( b_open && !(x264_cpu_detect() & X264_CPU_SSE) ) + if( b_open ) { - x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n"); + int cpuflags = x264_cpu_detect(); + int fail = 0; +#ifdef __SSE__ + if( !(cpuflags & X264_CPU_SSE) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm\n"); + fail = 1; + } #else - if( b_open && !(x264_cpu_detect() & X264_CPU_MMX2) ) - { - x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n"); + if( !(cpuflags & X264_CPU_MMX2) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm\n"); + fail = 1; + } #endif - x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n"); - return -1; + if( !fail && !(cpuflags & X264_CPU_CMOV) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n"); + fail = 1; + } + if( fail ) + { + x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n"); + return -1; + 
} } #endif @@ -503,8 +534,6 @@ if( h->param.i_threads == X264_THREADS_AUTO ) h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2; - if( h->param.i_lookahead_threads == X264_THREADS_AUTO ) - h->param.i_lookahead_threads = h->param.i_threads / (h->param.b_sliced_threads?1:6); int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 ); if( h->param.i_threads > 1 ) { @@ -518,7 +547,6 @@ h->param.i_threads = X264_MIN( h->param.i_threads, max_sliced_threads ); } h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX ); - h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) ); if( h->param.i_threads == 1 ) { h->param.b_sliced_threads = 0; @@ -528,6 +556,28 @@ if( h->i_thread_frames > 1 ) h->param.nalu_process = NULL; + if( h->param.b_opencl ) + { +#if !HAVE_OPENCL + x264_log( h, X264_LOG_WARNING, "OpenCL: not compiled with OpenCL support, disabling\n" ); + h->param.b_opencl = 0; +#elif BIT_DEPTH > 8 + x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth, disabling opencl\n" ); + h->param.b_opencl = 0; +#else + if( h->param.i_width < 32 || h->param.i_height < 32 ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: frame size is too small, disabling opencl\n" ); + h->param.b_opencl = 0; + } +#endif + if( h->param.opencl_device_id && h->param.i_opencl_device ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: device id and device skip count configured; dropping skip\n" ); + h->param.i_opencl_device = 0; + } + } + h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE ); if( h->param.i_keyint_max == 1 ) { @@ -646,7 +696,7 @@ h->param.rc.i_rc_method == X264_RC_ABR ) { x264_log( h, X264_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR\n" ); - h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate; + h->param.rc.i_bitrate = h->param.rc.i_vbv_max_bitrate; } } else if( h->param.rc.i_vbv_max_bitrate ) @@ -657,6 +707,22 @@ h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 ); h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 ); + h->param.i_slice_min_mbs = X264_MAX( h->param.i_slice_min_mbs, 0 ); + if( h->param.i_slice_max_mbs ) + h->param.i_slice_min_mbs = X264_MIN( h->param.i_slice_min_mbs, h->param.i_slice_max_mbs/2 ); + else if( !h->param.i_slice_max_size ) + h->param.i_slice_min_mbs = 0; + if( PARAM_INTERLACED && h->param.i_slice_min_mbs ) + { + x264_log( h, X264_LOG_WARNING, "interlace + slice-min-mbs is not implemented\n" ); + h->param.i_slice_min_mbs = 0; + } + int mb_width = (h->param.i_width+15)/16; + if( h->param.i_slice_min_mbs > mb_width ) + { + x264_log( h, X264_LOG_WARNING, "slice-min-mbs > row mb size (%d) not implemented\n", mb_width ); + h->param.i_slice_min_mbs = mb_width; + } int max_slices = (h->param.i_height+((16<<PARAM_INTERLACED)-1))/(16<<PARAM_INTERLACED); if( h->param.b_sliced_threads ) @@ -667,6 +733,8 @@ if( h->param.i_slice_max_mbs || h->param.i_slice_max_size ) h->param.i_slice_count = 0; } + if( h->param.i_slice_count_max > 0 ) + h->param.i_slice_count_max = X264_MAX( h->param.i_slice_count, h->param.i_slice_count_max ); if( h->param.b_bluray_compat ) { @@ -895,6 +963,35 @@ h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART ); + if( h->param.i_lookahead_threads == X264_THREADS_AUTO )
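The rewritten buffer check grows the bitstream into a fresh allocation, copies the old contents (an aligned bulk copy plus a plain memcpy for the unaligned tail), and rebases every live cursor and NAL payload pointer by the old-to-new distance; on allocation failure it now returns -1 without freeing the old buffer. A minimal sketch of the grow-and-rebase scheme; struct and field names are illustrative, and the pointer-difference rebase relies on the same flat-memory behavior the hunk does:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef struct
{
    uint8_t *buf, *p, *end;  /* base, write cursor, one past the end */
    int      size;
} stream_t;

static int stream_grow( stream_t *s, int extra )
{
    int new_size = s->size + extra;
    uint8_t *new_buf = malloc( new_size );
    if( !new_buf )
        return -1;                      /* old buffer remains valid */
    memcpy( new_buf, s->buf, s->size );
    intptr_t delta = new_buf - s->buf;  /* rebase cursors into the new block */
    s->p  += delta;
    s->end = new_buf + new_size;
    free( s->buf );
    s->buf  = new_buf;
    s->size = new_size;
    return 0;
}

int main(void)
{
    stream_t s = { malloc( 64 ), NULL, NULL, 64 };
    s.p   = s.buf + 40;                 /* pretend 40 bytes already written */
    s.end = s.buf + 64;
    if( stream_grow( &s, 1024 ) )
        return 1;
    return !(s.p - s.buf == 40 && s.end - s.buf == 1088);
}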
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/lookahead.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/lookahead.c
Changed
@@ -70,18 +70,19 @@ x264_stack_align( x264_slicetype_decide, h ); x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] ); + int shift_frames = h->lookahead->next.list[0]->i_bframes + 1; x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex ); while( h->lookahead->ofbuf.i_size == h->lookahead->ofbuf.i_max_size ) x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_empty, &h->lookahead->ofbuf.mutex ); x264_pthread_mutex_lock( &h->lookahead->next.mutex ); - x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 ); + x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames ); x264_pthread_mutex_unlock( &h->lookahead->next.mutex ); /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) - x264_stack_align( x264_slicetype_analyse, h, 1 ); + x264_stack_align( x264_slicetype_analyse, h, shift_frames ); x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); } @@ -236,11 +237,12 @@ x264_stack_align( x264_slicetype_decide, h ); x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] ); - x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 ); + int shift_frames = h->lookahead->next.list[0]->i_bframes + 1; + x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames ); /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) - x264_stack_align( x264_slicetype_analyse, h, 1 ); + x264_stack_align( x264_slicetype_analyse, h, shift_frames ); x264_lookahead_encoder_shift( h ); }
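Both lookahead paths now capture shift_frames from the head frame before x264_lookahead_shift removes it, and pass that count to x264_slicetype_analyse instead of the old constant 1, matching the keyframe to intra_minigop rename in analyse.h above. A toy queue showing why the value must be read before the shift (types here are illustrative):

#include <assert.h>

typedef struct { int i_bframes; } frame_t;
typedef struct { frame_t *list[8]; int size; } queue_t;

static void queue_shift( queue_t *q, int n )
{
    for( int i = n; i < q->size; i++ )
        q->list[i-n] = q->list[i];
    q->size -= n;
}

int main(void)
{
    frame_t a = { 2 }, b = { 0 };
    queue_t next = { { &a, &b, &b, &b }, 4 };
    /* Read the head frame's miniGOP length first; after the shift,
     * next.list[0] is a different frame and a re-read would be stale. */
    int shift_frames = next.list[0]->i_bframes + 1;
    queue_shift( &next, shift_frames );
    assert( shift_frames == 3 && next.list[0] == &b );
    return 0;
}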
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/macroblock.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/macroblock.c
Changed
@@ -128,8 +128,8 @@ pixel *p_src = h->mb.pic.p_fenc[p]; pixel *p_dst = h->mb.pic.p_fdec[p]; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] ); - ALIGNED_ARRAY_16( dctcoef, dct_dc4x4,[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] ); int nz, block_cbp = 0; int decimate_score = h->mb.b_dct_decimate ? 0 : 9; @@ -157,28 +157,51 @@ return; } + M32( &h->mb.cache.non_zero_count[x264_scan8[ 0+p*16]] ) = 0; + M32( &h->mb.cache.non_zero_count[x264_scan8[ 2+p*16]] ) = 0; + M32( &h->mb.cache.non_zero_count[x264_scan8[ 8+p*16]] ) = 0; + M32( &h->mb.cache.non_zero_count[x264_scan8[10+p*16]] ) = 0; + h->dctf.sub16x16_dct( dct4x4, p_src, p_dst ); - for( int i = 0; i < 16; i++ ) + if( h->mb.b_noise_reduction ) + for( int idx = 0; idx < 16; idx++ ) + h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 ); + + for( int idx = 0; idx < 16; idx++ ) { - /* copy dc coeff */ - if( h->mb.b_noise_reduction ) - h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[0], h->nr_offset[0], 16 ); - dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0]; - dct4x4[i][0] = 0; + dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0]; + dct4x4[idx][0] = 0; + } - /* quant/scan/dequant */ - if( h->mb.b_trellis ) - nz = x264_quant_4x4_trellis( h, dct4x4[i], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, i ); - else - nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); - h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz; - if( nz ) + if( h->mb.b_trellis ) + { + for( int idx = 0; idx < 16; idx++ ) + if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) ) + { + block_cbp = 0xf; + h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp ); + if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] ); + h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1; + } + } + else + { + for( int i8x8 = 0; i8x8 < 4; i8x8++ ) { - h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+i], dct4x4[i] ); - h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[i_quant_cat], i_qp ); - if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+i] ); - block_cbp = 0xf; + nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); + if( nz ) + { + block_cbp = 0xf; + FOREACH_BIT( idx, i8x8*4, nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp ); + if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] ); + h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1; + } + } } } @@ -245,6 +268,18 @@ h->mb.i_cbp_chroma = 0; h->nr_count[2] += h->mb.b_noise_reduction * 4; + M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0; + if( chroma422 ) + { + M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0; + } + /* Early termination: check variance of chroma residual before encoding. * Don't bother trying early termination at low QPs. 
* Values are experimentally derived. */ @@ -259,17 +294,6 @@ score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] ); if( score < thresh*4 ) { - M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0; - if( chroma422 ) - { - M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0; - } h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0; h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0; @@ -326,10 +350,10 @@ { pixel *p_src = h->mb.pic.p_fenc[1+ch]; pixel *p_dst = h->mb.pic.p_fdec[1+ch]; - int i_decimate_score = 0; + int i_decimate_score = b_decimate ? 0 : 7; int nz_ac = 0; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] ); if( h->mb.b_lossless ) { @@ -361,20 +385,40 @@ dct2x2dc( dct_dc, dct4x4 ); /* calculate dct coeffs */ - for( int i = 0; i < (chroma422?8:4); i++ ) + for( int i8x8 = 0; i8x8 < (chroma422?2:1); i8x8++ ) { if( h->mb.b_trellis ) - nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ); + { + for( int i4x4 = 0; i4x4 < 4; i4x4++ ) + { + if( x264_quant_4x4_trellis( h, dct4x4[i8x8*4+i4x4], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ) ) + { + int idx = 16+ch*16+i8x8*8+i4x4; + h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] ); + h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp ); + if( i_decimate_score < 7 ) + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = 1; + nz_ac = 1; + } + } + } else - nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] ); - h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz; - if( nz ) { - nz_ac = 1; - h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] ); - h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp ); - if( b_decimate ) - i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] ); + nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4IC+b_inter][i_qp], + h->quant4_bias[CQM_4IC+b_inter][i_qp] ); + nz_ac |= nz; + + FOREACH_BIT( i4x4, 0, nz ) + { + int idx = 16+ch*16+i8x8*8+i4x4; + + h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] ); + h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp ); + if( i_decimate_score < 7 ) + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = 1; + } } } @@ -390,7 +434,7 @@ h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc; - if( (b_decimate && i_decimate_score < 7) || !nz_ac ) + if( i_decimate_score < 7 || !nz_ac ) { /* Decimate the block */ M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0; @@ -646,11 +690,8 @@ { h->mb.b_transform_8x8 = 0; - for( int p = 0; p < plane_count; p++ ) - { + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) x264_mb_encode_i16x16( h, p, i_qp ); - i_qp = h->mb.i_chroma_qp; - }
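The non-trellis luma and chroma paths above now quantize four 4x4 blocks per call through h->quantf.quant_4x4x4, which reports survivors as a 4-bit nonzero mask; zigzag scan, dequant, decimate scoring and the NNZ cache update then run only for set bits, iterated with FOREACH_BIT (defined in macroblock.h below). A stand-in demonstrating the mask convention; the divide-by-qscale quantizer is illustrative only:

#include <assert.h>

/* Quantize four 4x4 blocks at once; bit i of the return value is set
 * when block i kept at least one nonzero coefficient. */
static int quant_4x4x4( short dct[4][16], int qscale )
{
    int nz_mask = 0;
    for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 16; j++ )
        {
            dct[i][j] /= qscale;
            if( dct[i][j] )
                nz_mask |= 1 << i;
        }
    return nz_mask;
}

int main(void)
{
    short dct[4][16] = {{0}};
    dct[0][3] = 40;              /* survives quantization: 40/8 = 5 */
    dct[2][7] = 1;               /* quantized to zero */
    int nz = quant_4x4x4( dct, 8 );
    assert( nz == 0x1 );         /* only block 0 still needs coding */
    return 0;
}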
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/macroblock.h -> x264-snapshot-20130723-2245.tar.bz2/encoder/macroblock.h
Changed
@@ -104,12 +104,16 @@ M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+10]] ) = 0;\ } while(0) +/* A special for loop that iterates branchlessly over each set + * bit in a 4-bit input. */ +#define FOREACH_BIT(idx,start,mask) for( int idx = start, msk = mask, skip; msk && (skip = x264_ctz_4bit(msk), idx += skip, msk >>= skip+1, 1); idx++ ) + static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode, int b_predict ) { int nz; pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]]; pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]]; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] ); if( b_predict ) { @@ -147,7 +151,7 @@ int nz; pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE]; pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE]; - ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] ); + ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] ); ALIGNED_ARRAY_32( pixel, edge_buf,[36] ); if( b_predict )
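FOREACH_BIT keeps all of its state in the for-header: each iteration counts the trailing zeros of the remaining mask, advances idx by that amount to the next set bit, then strips that bit by shifting skip+1 positions. A self-contained rendering with a table-driven stand-in for x264_ctz_4bit:

#include <stdio.h>

/* Trailing-zero count for a 4-bit value (tab[0] = 4 is never used here,
 * since the loop condition requires a nonzero mask). */
static int ctz_4bit( int x )
{
    static const char tab[16] = { 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 };
    return tab[x & 0xf];
}

#define FOREACH_BIT(idx,start,mask) \
    for( int idx = start, msk = mask, skip; \
         msk && (skip = ctz_4bit(msk), idx += skip, msk >>= skip+1, 1); idx++ )

int main(void)
{
    /* mask 0b1010 with start 4: visits idx 5, then idx 7 */
    FOREACH_BIT( i8, 4, 0xA )
        printf( "%d\n", i8 );
    return 0;
}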
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/me.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/me.c
Changed
@@ -61,21 +61,22 @@ (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2]) #define COST_MV( mx, my )\ +do\ {\ int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\ &p_fref_w[(my)*stride+(mx)], stride )\ + BITS_MVD(mx,my);\ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\ -} +} while(0) -#define COST_MV_HPEL( mx, my ) \ -{ \ - intptr_t stride2 = 16; \ - pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \ - int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \ - + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ - COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \ -} +#define COST_MV_HPEL( mx, my, cost )\ +do\ +{\ + intptr_t stride2 = 16;\ + pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] );\ + cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 )\ + + p_cost_mvx[ mx ] + p_cost_mvy[ my ];\ +} while(0) #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\ {\ @@ -174,6 +175,10 @@ }\ } +#define FPEL(mv) (((mv)+2)>>2) /* Convert subpel MV to fullpel with rounding... */ +#define SPEL(mv) ((mv)<<2) /* ... and the reverse. */ +#define SPELx2(mv) (SPEL(mv)&0xFFFCFFFC) /* for two packed MVs */ + void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh ) { const int bw = x264_pixel_size[m->i_pixel].w; @@ -181,97 +186,136 @@ const int i_pixel = m->i_pixel; const int stride = m->i_stride[0]; int i_me_range = h->param.analyse.i_me_range; - int bmx, bmy, bcost; - int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX; + int bmx, bmy, bcost = COST_MAX; + int bpred_cost = COST_MAX; int omx, omy, pmx, pmy; pixel *p_fenc = m->p_fenc[0]; pixel *p_fref_w = m->p_fref_w; - ALIGNED_ARRAY_16( pixel, pix,[16*16] ); - - int costs[16]; - - int mv_x_min = h->mb.mv_min_fpel[0]; - int mv_y_min = h->mb.mv_min_fpel[1]; - int mv_x_max = h->mb.mv_max_fpel[0]; - int mv_y_max = h->mb.mv_max_fpel[1]; - int mv_x_min_qpel = mv_x_min << 2; - int mv_y_min_qpel = mv_y_min << 2; - int mv_x_max_qpel = mv_x_max << 2; - int mv_y_max_qpel = mv_y_max << 2; + ALIGNED_ARRAY_N( pixel, pix,[16*16] ); + ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] ); + + ALIGNED_ARRAY_16( int, costs,[16] ); + + int mv_x_min = h->mb.mv_limit_fpel[0][0]; + int mv_y_min = h->mb.mv_limit_fpel[0][1]; + int mv_x_max = h->mb.mv_limit_fpel[1][0]; + int mv_y_max = h->mb.mv_limit_fpel[1][1]; /* Special version of pack to allow shortcuts in CHECK_MVRANGE */ #define pack16to32_mask2(mx,my) ((mx<<16)|(my&0x7FFF)) uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min ); uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000; + uint32_t pmv, bpred_mv = 0; #define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000)) const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; - uint32_t pmv; - bmx = x264_clip3( m->mvp[0], mv_x_min_qpel, mv_x_max_qpel ); - bmy = x264_clip3( m->mvp[1], mv_y_min_qpel, mv_y_max_qpel ); - pmx = ( bmx + 2 ) >> 2; - pmy = ( bmy + 2 ) >> 2; - bcost = COST_MAX; - - /* try extra predictors if provided */ + /* Try extra predictors if provided. If subme >= 3, check subpel predictors, + * otherwise round them to fullpel. 
*/ if( h->mb.i_subpel_refine >= 3 ) { - pmv = pack16to32_mask(bmx,bmy); - if( i_mvc ) - COST_MV_HPEL( bmx, bmy ); - for( int i = 0; i < i_mvc; i++ ) + /* Calculate and check the MVP first */ + int bpred_mx = x264_clip3( m->mvp[0], SPEL(mv_x_min), SPEL(mv_x_max) ); + int bpred_my = x264_clip3( m->mvp[1], SPEL(mv_y_min), SPEL(mv_y_max) ); + pmv = pack16to32_mask( bpred_mx, bpred_my ); + pmx = FPEL( bpred_mx ); + pmy = FPEL( bpred_my ); + + COST_MV_HPEL( bpred_mx, bpred_my, bpred_cost ); + int pmv_cost = bpred_cost; + + if( i_mvc > 0 ) { - if( M32( mvc[i] ) && (pmv != M32( mvc[i] )) ) + /* Clip MV candidates and eliminate those equal to zero and pmv. */ + int valid_mvcs = x264_predictor_clip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv ); + if( valid_mvcs > 0 ) { - int mx = x264_clip3( mvc[i][0], mv_x_min_qpel, mv_x_max_qpel ); - int my = x264_clip3( mvc[i][1], mv_y_min_qpel, mv_y_max_qpel ); - COST_MV_HPEL( mx, my ); + int i = 1, cost; + /* We stuff pmv here to branchlessly pick between pmv and the various + * MV candidates. [0] gets skipped in order to maintain alignment for + * x264_predictor_clip. */ + M32( mvc_temp[1] ) = pmv; + bpred_cost <<= 4; + do + { + int mx = mvc_temp[i+1][0]; + int my = mvc_temp[i+1][1]; + COST_MV_HPEL( mx, my, cost ); + COPY1_IF_LT( bpred_cost, (cost << 4) + i ); + } while( ++i <= valid_mvcs ); + bpred_mx = mvc_temp[(bpred_cost&15)+1][0]; + bpred_my = mvc_temp[(bpred_cost&15)+1][1]; + bpred_cost >>= 4; } } - bmx = ( bpred_mx + 2 ) >> 2; - bmy = ( bpred_my + 2 ) >> 2; - COST_MV( bmx, bmy ); + + /* Round the best predictor back to fullpel and get the cost, since this is where + * we'll be starting the fullpel motion search. */ + bmx = FPEL( bpred_mx ); + bmy = FPEL( bpred_my ); + bpred_mv = pack16to32_mask(bpred_mx, bpred_my); + if( bpred_mv&0x00030003 ) /* Only test if the tested predictor is actually subpel... */ + COST_MV( bmx, bmy ); + else /* Otherwise just copy the cost (we already know it) */ + bcost = bpred_cost; + + /* Test the zero vector if it hasn't been tested yet. */ + if( pmv ) + { + if( bmx|bmy ) COST_MV( 0, 0 ); + } + /* If a subpel mv candidate was better than the zero vector, the previous + * fullpel check won't have gotten it even if the pmv was zero. So handle + * that possibility here. */ + else + { + COPY3_IF_LT( bcost, pmv_cost, bmx, 0, bmy, 0 ); + } } else { - /* check the MVP */ - bmx = pmx; - bmy = pmy; + /* Calculate and check the fullpel MVP first */ + bmx = pmx = x264_clip3( FPEL(m->mvp[0]), mv_x_min, mv_x_max ); + bmy = pmy = x264_clip3( FPEL(m->mvp[1]), mv_y_min, mv_y_max ); + pmv = pack16to32_mask( bmx, bmy ); + /* Because we are rounding the predicted motion vector to fullpel, there will be * an extra MV cost in 15 out of 16 cases. However, when the predicted MV is * chosen as the best predictor, it is often the case that the subpel search will - * result in a vector at or next to the predicted motion vector. Therefore, it is - * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly - * biasing against use of the predicted motion vector. */ + * result in a vector at or next to the predicted motion vector. Therefore, we omit + * the cost of the MV from the rounded MVP to avoid unfairly biasing against use of + * the predicted motion vector. + * + * Disclaimer: this is a post-hoc rationalization for why this hack works. 
*/ bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride ); - pmv = pack16to32_mask( bmx, bmy ); + if( i_mvc > 0 ) { - ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16],[2] ); - x264_predictor_roundclip( mvc_fpel+2, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max ); - M32( mvc_fpel[1] ) = pmv; - bcost <<= 4; - for( int i = 1; i <= i_mvc; i++ ) + /* Like in subme>=3, except we also round the candidates to fullpel. */ + int valid_mvcs = x264_predictor_roundclip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv );
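The subpel-predictor loop above packs each candidate's index into the low 4 bits of its shifted cost (bpred_cost <<= 4, then COPY1_IF_LT against (cost << 4) + i), so a single scalar minimum tracks both the best cost and which candidate produced it, and the winning MV is recovered from bpred_cost & 15. A worked example of the trick; it holds while costs stay below 1<<27, and ties resolve to the earlier candidate:

#include <assert.h>

#define COPY1_IF_LT(x,y) if( (y) < (x) ) (x) = (y);

int main(void)
{
    int costs[4] = { 900, 850, 870, 910 };
    int best = (costs[0] << 4) + 0;      /* cost in high bits, index in low */
    for( int i = 1; i < 4; i++ )
        COPY1_IF_LT( best, (costs[i] << 4) + i );
    assert( (best & 15) == 1 );          /* candidate 1 won */
    assert( (best >> 4) == 850 );        /* at its original cost */
    return 0;
}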
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/ratecontrol.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/ratecontrol.c
Changed
@@ -1118,7 +1118,8 @@ total_qp_aq += qp_aq; p = next; } - h->pps->i_pic_init_qp = SPEC_QP( (int)(total_qp_aq / rc->num_entries + 0.5) ); + if( !h->param.b_stitchable ) + h->pps->i_pic_init_qp = SPEC_QP( (int)(total_qp_aq / rc->num_entries + 0.5) ); x264_free( stats_buf ); @@ -1667,7 +1668,8 @@ rc->qpm = x264_clip3f( (prev_row_qp + rc->qpm)*0.5f, prev_row_qp + 1.0f, qp_max ); rc->qpa_rc = rc->qpa_rc_prev; rc->qpa_aq = rc->qpa_aq_prev; - h->fdec->i_row_bits[y] = h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; + h->fdec->i_row_bits[y] = 0; + h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; return -1; } } @@ -1683,7 +1685,8 @@ rc->qpm = qp_max; rc->qpa_rc = rc->qpa_rc_prev; rc->qpa_aq = rc->qpa_aq_prev; - h->fdec->i_row_bits[y] = h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; + h->fdec->i_row_bits[y] = 0; + h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; return -1; } } @@ -2591,14 +2594,16 @@ if( h->i_frame == 0 ) for( int i = 0; i < h->param.i_threads; i++ ) { - x264_ratecontrol_t *t = h->thread[i]->rc; - memcpy( t->row_preds, rc->row_preds, sizeof(rc->row_preds) ); + x264_t *t = h->thread[i]; + if( t != h ) + memcpy( t->rc->row_preds, rc->row_preds, sizeof(rc->row_preds) ); } for( int i = 0; i < h->param.i_threads; i++ ) { x264_t *t = h->thread[i]; - memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) ); + if( t != h ) + memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) ); t->rc->row_pred = &t->rc->row_preds[h->sh.i_type]; /* Calculate the planned slice size. */ if( rc->b_vbv && rc->frame_size_planned )
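The second thread loop above copies only a prefix of the rate-control state: memcpy with offsetof(x264_ratecontrol_t, row_pred) replicates every field declared before row_pred while leaving each thread's own row predictors in place, and the added t != h guard skips the self-copy (memcpy with identical source and destination is undefined behavior). A sketch of the prefix-copy pattern with an illustrative struct:

#include <assert.h>
#include <stddef.h>
#include <string.h>

typedef struct
{
    double qpm, qpa;      /* shared state: copied */
    int    frame_size;    /* shared state: copied */
    int   *row_pred;      /* per-thread from here on: preserved */
    int    row_preds[3];
} rc_t;

int main(void)
{
    rc_t main_rc   = { 24.0, 23.5, 40000, NULL, { 1, 2, 3 } };
    rc_t thread_rc = { 0 };
    thread_rc.row_pred = &thread_rc.row_preds[1];
    int *saved = thread_rc.row_pred;

    /* Copy everything up to, but not including, row_pred. */
    memcpy( &thread_rc, &main_rc, offsetof(rc_t, row_pred) );

    assert( thread_rc.frame_size == 40000 && thread_rc.row_pred == saved );
    return 0;
}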
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/rdo.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/rdo.c
Changed
@@ -634,13 +634,13 @@ const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac, int b_chroma, int dc, int num_coefs, int idx ) { - ALIGNED_ARRAY_16( dctcoef, orig_coefs, [64] ); - ALIGNED_ARRAY_16( dctcoef, quant_coefs, [64] ); + ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] ); + ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] ); const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab; const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; const int b_interlaced = MB_INTERLACED; - uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; - uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; + uint8_t *cabac_state_sig = &h->cabac.state[ x264_significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; + uint8_t *cabac_state_last = &h->cabac.state[ x264_last_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; int levelgt1_ctx = b_chroma && dc ? 8 : 9; if( dc ) @@ -683,7 +683,7 @@ } int last_nnz = h->quantf.coeff_last[ctx_block_cat]( quant_coefs+b_ac )+b_ac; - uint8_t *cabac_state = &h->cabac.state[ coeff_abs_level_m1_offset[ctx_block_cat] ]; + uint8_t *cabac_state = &h->cabac.state[ x264_coeff_abs_level_m1_offset[ctx_block_cat] ]; /* shortcut for dc-only blocks. * this doesn't affect the output, but saves some unnecessary computation. */ @@ -1161,5 +1161,6 @@ h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz; nzaccum |= nz; } + STORE_8x8_NNZ( 0, idx, 0 ); return nzaccum; }
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/set.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/set.c
Changed
@@ -208,9 +208,9 @@ ( csp >= X264_CSP_BGR ? 1 : 0 ) ); sps->vui.b_color_description_present = 0; - sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 8 ? param->vui.i_colorprim : 2 ); - sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 10 ? param->vui.i_transfer : 2 ); - sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 8 ? param->vui.i_colmatrix : + sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 9 ? param->vui.i_colorprim : 2 ); + sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 15 ? param->vui.i_transfer : 2 ); + sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 10 ? param->vui.i_colmatrix : ( csp >= X264_CSP_BGR ? 0 : 2 ) ); if( sps->vui.i_colorprim != 2 || sps->vui.i_transfer != 2 || @@ -430,7 +430,7 @@ pps->b_weighted_pred = param->analyse.i_weighted_pred > 0; pps->b_weighted_bipred = param->analyse.b_weighted_bipred ? 2 : 0; - pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR ? 26 + QP_BD_OFFSET : SPEC_QP( param->rc.i_qp_constant ); + pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR || param->b_stitchable ? 26 + QP_BD_OFFSET : SPEC_QP( param->rc.i_qp_constant ); pps->i_pic_init_qs = 26 + QP_BD_OFFSET; pps->i_chroma_qp_index_offset = param->analyse.i_chroma_qp_offset;
View file
x264-snapshot-20130723-2245.tar.bz2/encoder/slicetype-cl.c
Added
@@ -0,0 +1,780 @@ +/***************************************************************************** + * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead) + ***************************************************************************** + * Copyright (C) 2012-2013 x264 project + * + * Authors: Steve Borho <sborho@multicorewareinc.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macroblock.h" +#include "me.h" + +#if HAVE_OPENCL +#ifdef _WIN32 +#include <windows.h> +#endif + +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ); + +/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined + * in the OpenCL headers shipped with NVIDIA drivers. We need to be + * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */ +#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E + +#define OCLCHECK( method, ... 
)\ +do\ +{\ + if( h->opencl.b_fatal_error )\ + return -1;\ + status = ocl->method( __VA_ARGS__ );\ + if( status != CL_SUCCESS ) {\ + h->param.b_opencl = 0;\ + h->opencl.b_fatal_error = 1;\ + x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\ + return -1;\ + }\ +} while( 0 ) + +void x264_opencl_flush( x264_t *h ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + + ocl->clFinish( h->opencl.queue ); + + /* Finish copies from the GPU by copying from the page-locked buffer to + * their final destination */ + for( int i = 0; i < h->opencl.num_copies; i++ ) + memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes ); + h->opencl.num_copies = 0; + h->opencl.pl_occupancy = 0; +} + +static void *x264_opencl_alloc_locked( x264_t *h, int bytes ) +{ + if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE ) + x264_opencl_flush( h ); + assert( bytes < PAGE_LOCKED_BUF_SIZE ); + char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy; + h->opencl.pl_occupancy += bytes; + return ptr; +} + +int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda ) +{ + if( fenc->b_intra_calculated ) + return 0; + fenc->b_intra_calculated = 1; + + x264_opencl_function_t *ocl = h->opencl.ocl; + int luma_length = fenc->i_stride[0] * fenc->i_lines[0]; + +#define CREATEBUF( out, flags, size )\ + out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\ + if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; } +#define CREATEIMAGE( out, flags, pf, width, height )\ + out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\ + if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; } + + int mb_count = h->mb.i_mb_count; + cl_int status; + + if( !h->opencl.lowres_mv_costs ) + { + /* Allocate shared memory buffers */ + int width = h->mb.i_mb_width * 8 * sizeof(pixel); + int height = h->mb.i_mb_height * 8 * sizeof(pixel); + + cl_image_format pixel_format; + pixel_format.image_channel_order = CL_R; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT32; + CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height ); + + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + { + pixel_format.image_channel_order = CL_RGBA; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT8; + CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height ); + width >>= 1; + height >>= 1; + } + + CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); + CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); + CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); + CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); + CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); + CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); + CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) ); + CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) ); + CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) ); + CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) ); + CREATEBUF( h->opencl.luma_16x16_image[0], 
CL_MEM_READ_ONLY, luma_length ); + CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length ); + } + + if( !fenc->opencl.intra_cost ) + { + /* Allocate per-frame buffers */ + int width = h->mb.i_mb_width * 8 * sizeof(pixel); + int height = h->mb.i_mb_height * 8 * sizeof(pixel); + + cl_image_format pixel_format; + pixel_format.image_channel_order = CL_R; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT32; + CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height ); + + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + { + pixel_format.image_channel_order = CL_RGBA; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT8; + CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height ); + width >>= 1; + height >>= 1; + } + CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) ); + CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) ); + CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) ); + CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) ); + CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) ); + CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) ); + } +#undef CREATEBUF +#undef CREATEIMAGE + + /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */ + + char *locked = x264_opencl_alloc_locked( h, luma_length ); + memcpy( locked, fenc->plane[0], luma_length ); + OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL ); + + size_t gdim[2]; + if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor ) + { + int size = h->mb.i_mb_count * sizeof(int16_t); + locked = x264_opencl_alloc_locked( h, size ); + memcpy( locked, fenc->i_inv_qscale_factor, size ); + OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL ); + } + else + { + /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */ + cl_uint arg = 0; + int16_t value = 256; + OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor ); + OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value ); + gdim[0] = h->mb.i_mb_count; + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL ); + } + + int stride = fenc->i_stride[0]; + cl_uint arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride ); + gdim[0] = 8 * h->mb.i_mb_width; + gdim[1] = 8 * h->mb.i_mb_height; + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL ); + + for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ ) + { + /* Workaround for AMD Southern Island:
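x264_opencl_alloc_locked above is a bump allocator over a single page-locked staging buffer: requests carve consecutive slices from the front, and a request that would overflow calls x264_opencl_flush, which finishes the queued GPU-to-host copies and resets the pool. A host-only model of that behavior, with PAGE_LOCKED_BUF_SIZE and the flush hook reduced to stand-ins:

#include <assert.h>
#include <string.h>

#define POOL_SIZE 4096

typedef struct
{
    char pool[POOL_SIZE];
    int  occupancy;
    int  flushes;
} locked_pool_t;

static void pool_flush( locked_pool_t *p )
{
    /* x264 completes the queued device->host copies here before reuse */
    p->occupancy = 0;
    p->flushes++;
}

static void *pool_alloc( locked_pool_t *p, int bytes )
{
    if( p->occupancy + bytes >= POOL_SIZE )
        pool_flush( p );
    assert( bytes < POOL_SIZE );
    char *ptr = p->pool + p->occupancy;
    p->occupancy += bytes;
    return ptr;
}

int main(void)
{
    locked_pool_t pool = { .occupancy = 0 };
    for( int i = 0; i < 5; i++ )
        memset( pool_alloc( &pool, 1024 ), 0, 1024 );
    assert( pool.flushes == 1 );  /* the 4th request forced a flush */
    return 0;
}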
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/slicetype.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/slicetype.c
Changed
@@ -36,6 +36,18 @@ x264_frame_t **frames, int p0, int p1, int b, int b_intra_penalty ); +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ); + +#if HAVE_OPENCL +int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda ); +int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w ); +int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor ); +int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b ); +void x264_opencl_flush( x264_t *h ); +void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda ); +void x264_opencl_slicetype_end( x264_t *h ); +#endif + static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a ) { a->i_qp = X264_LOOKAHEAD_QP; @@ -60,7 +72,7 @@ w->i_offset = offset; w->i_denom = 7; w->i_scale = weight_nonh264; - while( w->i_denom > 0 && (w->i_scale > 127 || !(w->i_scale & 1)) ) + while( w->i_denom > 0 && (w->i_scale > 127) ) { w->i_denom--; w->i_scale >>= 1; @@ -276,7 +288,7 @@ return cost; } -static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ) +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ) { int i_delta_index = fenc->i_frame - ref->i_frame - 1; /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ @@ -286,21 +298,40 @@ SET_WEIGHT( weights[1], 0, 1, 0, 0 ); SET_WEIGHT( weights[2], 0, 1, 0, 0 ); int chroma_initted = 0; + float guess_scale[3]; + float fenc_mean[3]; + float ref_mean[3]; + for( int plane = 0; plane <= 2*!b_lookahead; plane++ ) + { + float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; + float ref_var = ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; + guess_scale[plane] = sqrtf( fenc_var / ref_var ); + fenc_mean[plane] = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); + ref_mean[plane] = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); + } + + int chroma_denom = 7; + if( !b_lookahead ) + { + /* make sure both our scale factors fit */ + while( chroma_denom > 0 ) + { + float thresh = 127.f / (1<<chroma_denom); + if( guess_scale[1] < thresh && guess_scale[2] < thresh ) + break; + chroma_denom--; + } + } + /* Don't check chroma in lookahead, or if there wasn't a luma weight. 
*/ for( int plane = 0; plane <= 2 && !( plane && ( !weights[0].weightfn || b_lookahead ) ); plane++ ) { - int cur_offset, start_offset, end_offset; int minoff, minscale, mindenom; unsigned int minscore, origscore; int found; - float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; - float ref_var = ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; - float guess_scale = sqrtf( fenc_var / ref_var ); - float fenc_mean = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); - float ref_mean = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); //early termination - if( fabsf( ref_mean - fenc_mean ) < 0.5f && fabsf( 1.f - guess_scale ) < epsilon ) + if( fabsf( ref_mean[plane] - fenc_mean[plane] ) < 0.5f && fabsf( 1.f - guess_scale[plane] ) < epsilon ) { SET_WEIGHT( weights[plane], 0, 1, 0, 0 ); continue; @@ -308,8 +339,8 @@ if( plane ) { - weights[plane].i_denom = 6; - weights[plane].i_scale = x264_clip3( round( guess_scale * 64 ), 0, 255 ); + weights[plane].i_denom = chroma_denom; + weights[plane].i_scale = x264_clip3( round( guess_scale[plane] * (1<<chroma_denom) ), 0, 255 ); if( weights[plane].i_scale > 127 ) { weights[1].weightfn = weights[2].weightfn = NULL; @@ -317,7 +348,7 @@ } } else - x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[plane] ); + x264_weight_get_h264( round( guess_scale[plane] * 128 ), 0, &weights[plane] ); found = 0; mindenom = weights[plane].i_denom; @@ -357,33 +388,65 @@ if( !minscore ) continue; - // This gives a slight improvement due to rounding errors but only tests one offset in lookahead. - // Currently only searches within +/- 1 of the best offset found so far. - // TODO: Try other offsets/multipliers/combinations thereof? - cur_offset = fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f * b_lookahead; - start_offset = x264_clip3( cur_offset - !b_lookahead, -128, 127 ); - end_offset = x264_clip3( cur_offset + !b_lookahead, -128, 127 ); - for( int i_off = start_offset; i_off <= end_offset; i_off++ ) + /* Picked somewhat arbitrarily */ + static const uint8_t weight_check_distance[][2] = + { + {0,0},{0,0},{0,1},{0,1}, + {0,1},{0,1},{0,1},{1,1}, + {1,1},{2,1},{2,1},{4,2} + }; + int scale_dist = b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][0]; + int offset_dist = b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][1]; + + int start_scale = x264_clip3( minscale - scale_dist, 0, 127 ); + int end_scale = x264_clip3( minscale + scale_dist, 0, 127 ); + for( int i_scale = start_scale; i_scale <= end_scale; i_scale++ ) { - SET_WEIGHT( weights[plane], 1, minscale, mindenom, i_off ); - unsigned int s; - if( plane ) + int cur_scale = i_scale; + int cur_offset = fenc_mean[plane] - ref_mean[plane] * cur_scale / (1 << mindenom) + 0.5f * b_lookahead; + if( cur_offset < - 128 || cur_offset > 127 ) { - if( CHROMA444 ) - s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane ); - else - s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] ); + /* Rescale considering the constraints on cur_offset. We do it in this order + * because scale has a much wider range than offset (because of denom), so + * it should almost never need to be clamped. 
*/ + cur_offset = x264_clip3( cur_offset, -128, 127 ); + cur_scale = (1 << mindenom) * (fenc_mean[plane] - cur_offset) / ref_mean[plane] + 0.5f; + cur_scale = x264_clip3( cur_scale, 0, 127 ); } - else - s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] ); - COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 ); + int start_offset = x264_clip3( cur_offset - offset_dist, -128, 127 ); + int end_offset = x264_clip3( cur_offset + offset_dist, -128, 127 ); + for( int i_off = start_offset; i_off <= end_offset; i_off++ ) + { + SET_WEIGHT( weights[plane], 1, cur_scale, mindenom, i_off ); + unsigned int s; + if( plane ) + { + if( CHROMA444 ) + s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane ); + else + s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] ); + } + else + s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] ); + COPY4_IF_LT( minscore, s, minscale, cur_scale, minoff, i_off, found, 1 ); - // Don't check any more offsets if the previous one had a lower cost than the current one - if( minoff == start_offset && i_off != start_offset ) - break; + // Don't check any more offsets if the previous one had a lower cost than the current one + if( minoff == start_offset && i_off != start_offset ) + break; + } } x264_emms(); + /* Use a smaller denominator if possible */ + if( !plane ) + { + while( mindenom > 0 && !(minscale&1) ) + { + mindenom--; + minscale >>= 1; + } + } + /* FIXME: More analysis can be done here on SAD vs. SATD termination. */ /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */ if( !found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f ) @@ -398,18 +461,29 @@ fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore; } - //FIXME, what is the correct way to deal with this? - if( weights[1].weightfn && weights[2].weightfn && weights[1].i_denom != weights[2].i_denom ) + /* Optimize and unify denominator */ + if( weights[1].weightfn || weights[2].weightfn ) {
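The slicetype.c hunk above generalizes the chroma weight denominator: instead of the fixed i_denom of 6, it searches downward from 7 for the largest denominator at which both chroma scale guesses still fit the 7-bit scale field, and later shrinks the luma denominator while the scale stays even. A minimal standalone sketch of that fitting loop, with illustrative names and inputs (compile with -lm):

#include <math.h>
#include <stdio.h>

static int fit_chroma_denom( float guess_u, float guess_v )
{
    int denom = 7;
    while( denom > 0 )
    {
        float thresh = 127.f / (1 << denom); /* largest scale that still fits */
        if( guess_u < thresh && guess_v < thresh )
            break;
        denom--;                             /* halve precision, double range */
    }
    return denom;
}

int main( void )
{
    float u = 1.02f, v = 2.9f;
    int denom = fit_chroma_denom( u, v );
    /* quantize the guess as the patch does; must come out <= 127 */
    int scale_u = (int)round( u * (1 << denom) );
    printf( "denom=%d scale_u=%d\n", denom, scale_u );
    return 0;
}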
View file
x264-snapshot-20130723-2245.tar.bz2/extras/avxsynth_c.h
Added
@@ -0,0 +1,727 @@ +// Avisynth C Interface Version 0.20 +// Copyright 2003 Kevin Atkinson + +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit +// http://www.gnu.org/copyleft/gpl.html . +// +// As a special exception, I give you permission to link to the +// Avisynth C interface with independent modules that communicate with +// the Avisynth C interface solely through the interfaces defined in +// avisynth_c.h, regardless of the license terms of these independent +// modules, and to copy and distribute the resulting combined work +// under terms of your choice, provided that every copy of the +// combined work is accompanied by a complete copy of the source code +// of the Avisynth C interface and Avisynth itself (with the version +// used to produce the combined work), being distributed under the +// terms of the GNU General Public License plus this exception. An +// independent module is a module which is not derived from or based +// on Avisynth C Interface, such as 3rd-party filters, import and +// export plugins, or graphical user interfaces. + +#ifndef __AVXSYNTH_C__ +#define __AVXSYNTH_C__ + +#include "windowsPorts/windows2linux.h" +#include <stdarg.h> + +#ifdef __cplusplus +# define EXTERN_C extern "C" +#else +# define EXTERN_C +#endif + +#define AVSC_USE_STDCALL 1 + +#ifndef AVSC_USE_STDCALL +# define AVSC_CC __cdecl +#else +# define AVSC_CC __stdcall +#endif + +#define AVSC_INLINE static __inline + +#ifdef AVISYNTH_C_EXPORTS +# define AVSC_EXPORT EXTERN_C +# define AVSC_API(ret, name) EXTERN_C __declspec(dllexport) ret AVSC_CC name +#else +# define AVSC_EXPORT EXTERN_C __declspec(dllexport) +# ifndef AVSC_NO_DECLSPEC +# define AVSC_API(ret, name) EXTERN_C __declspec(dllimport) ret AVSC_CC name +# else +# define AVSC_API(ret, name) typedef ret (AVSC_CC *name##_func) +# endif +#endif + +#ifdef __GNUC__ +typedef long long int INT64; +#else +typedef __int64 INT64; +#endif + + +///////////////////////////////////////////////////////////////////// +// +// Constants +// + +#ifndef __AVXSYNTH_H__ +enum { AVISYNTH_INTERFACE_VERSION = 3 }; +#endif + +enum {AVS_SAMPLE_INT8 = 1<<0, + AVS_SAMPLE_INT16 = 1<<1, + AVS_SAMPLE_INT24 = 1<<2, + AVS_SAMPLE_INT32 = 1<<3, + AVS_SAMPLE_FLOAT = 1<<4}; + +enum {AVS_PLANAR_Y=1<<0, + AVS_PLANAR_U=1<<1, + AVS_PLANAR_V=1<<2, + AVS_PLANAR_ALIGNED=1<<3, + AVS_PLANAR_Y_ALIGNED=AVS_PLANAR_Y|AVS_PLANAR_ALIGNED, + AVS_PLANAR_U_ALIGNED=AVS_PLANAR_U|AVS_PLANAR_ALIGNED, + AVS_PLANAR_V_ALIGNED=AVS_PLANAR_V|AVS_PLANAR_ALIGNED}; + + // Colorspace properties. 
+enum {AVS_CS_BGR = 1<<28, + AVS_CS_YUV = 1<<29, + AVS_CS_INTERLEAVED = 1<<30, + AVS_CS_PLANAR = 1<<31}; + + // Specific colorformats +enum { + AVS_CS_UNKNOWN = 0, + AVS_CS_BGR24 = 1<<0 | AVS_CS_BGR | AVS_CS_INTERLEAVED, + AVS_CS_BGR32 = 1<<1 | AVS_CS_BGR | AVS_CS_INTERLEAVED, + AVS_CS_YUY2 = 1<<2 | AVS_CS_YUV | AVS_CS_INTERLEAVED, + AVS_CS_YV12 = 1<<3 | AVS_CS_YUV | AVS_CS_PLANAR, // y-v-u, planar + AVS_CS_I420 = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR, // y-u-v, planar + AVS_CS_IYUV = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR // same as above +}; + +enum { + AVS_IT_BFF = 1<<0, + AVS_IT_TFF = 1<<1, + AVS_IT_FIELDBASED = 1<<2}; + +enum { + AVS_FILTER_TYPE=1, + AVS_FILTER_INPUT_COLORSPACE=2, + AVS_FILTER_OUTPUT_TYPE=9, + AVS_FILTER_NAME=4, + AVS_FILTER_AUTHOR=5, + AVS_FILTER_VERSION=6, + AVS_FILTER_ARGS=7, + AVS_FILTER_ARGS_INFO=8, + AVS_FILTER_ARGS_DESCRIPTION=10, + AVS_FILTER_DESCRIPTION=11}; + +enum { //SUBTYPES + AVS_FILTER_TYPE_AUDIO=1, + AVS_FILTER_TYPE_VIDEO=2, + AVS_FILTER_OUTPUT_TYPE_SAME=3, + AVS_FILTER_OUTPUT_TYPE_DIFFERENT=4}; + +enum { + AVS_CACHE_NOTHING=0, + AVS_CACHE_RANGE=1, + AVS_CACHE_ALL=2, + AVS_CACHE_AUDIO=3, + AVS_CACHE_AUDIO_NONE=4, + AVS_CACHE_AUDIO_AUTO=5 +}; + +#define AVS_FRAME_ALIGN 16 + +typedef struct AVS_Clip AVS_Clip; +typedef struct AVS_ScriptEnvironment AVS_ScriptEnvironment; + +///////////////////////////////////////////////////////////////////// +// +// AVS_VideoInfo +// + +// AVS_VideoInfo is layed out identicly to VideoInfo +typedef struct AVS_VideoInfo { + int width, height; // width=0 means no video + unsigned fps_numerator, fps_denominator; + int num_frames; + + int pixel_type; + + int audio_samples_per_second; // 0 means no audio + int sample_type; + INT64 num_audio_samples; + int nchannels; + + // Imagetype properties + + int image_type; +} AVS_VideoInfo; + +// useful functions of the above +AVSC_INLINE int avs_has_video(const AVS_VideoInfo * p) + { return (p->width!=0); } + +AVSC_INLINE int avs_has_audio(const AVS_VideoInfo * p) + { return (p->audio_samples_per_second!=0); } + +AVSC_INLINE int avs_is_rgb(const AVS_VideoInfo * p) + { return !!(p->pixel_type&AVS_CS_BGR); } + +AVSC_INLINE int avs_is_rgb24(const AVS_VideoInfo * p) + { return (p->pixel_type&AVS_CS_BGR24)==AVS_CS_BGR24; } // Clear out additional properties + +AVSC_INLINE int avs_is_rgb32(const AVS_VideoInfo * p) + { return (p->pixel_type & AVS_CS_BGR32) == AVS_CS_BGR32 ; } + +AVSC_INLINE int avs_is_yuv(const AVS_VideoInfo * p) + { return !!(p->pixel_type&AVS_CS_YUV ); } + +AVSC_INLINE int avs_is_yuy2(const AVS_VideoInfo * p) + { return (p->pixel_type & AVS_CS_YUY2) == AVS_CS_YUY2; } + +AVSC_INLINE int avs_is_yv12(const AVS_VideoInfo * p) + { return ((p->pixel_type & AVS_CS_YV12) == AVS_CS_YV12)||((p->pixel_type & AVS_CS_I420) == AVS_CS_I420); } + +AVSC_INLINE int avs_is_color_space(const AVS_VideoInfo * p, int c_space)
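For orientation, the bit-packed colorspace tests in the excerpt above combine one format bit with shared property bits, and avs_is_yv12() deliberately accepts I420 as well, since the two layouts differ only in U/V plane order. A standalone sketch of that test (constants copied from the header, written as unsigned so 1<<31 stays well-defined):

#include <stdio.h>

#define AVS_CS_YUV    (1u << 29)
#define AVS_CS_PLANAR (1u << 31)
#define AVS_CS_YV12   (1u << 3 | AVS_CS_YUV | AVS_CS_PLANAR) /* y-v-u */
#define AVS_CS_I420   (1u << 4 | AVS_CS_YUV | AVS_CS_PLANAR) /* y-u-v */

/* mirrors the header's avs_is_yv12(): match either plane order */
static int is_yv12( unsigned pixel_type )
{
    return (pixel_type & AVS_CS_YV12) == AVS_CS_YV12 ||
           (pixel_type & AVS_CS_I420) == AVS_CS_I420;
}

int main( void )
{
    printf( "yv12:%d i420:%d plain-yuv:%d\n",
            is_yv12( AVS_CS_YV12 ), is_yv12( AVS_CS_I420 ), is_yv12( AVS_CS_YUV ) );
    return 0;
}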
View file
x264-snapshot-20130723-2245.tar.bz2/extras/cl.h
Added
@@ -0,0 +1,1209 @@ +/******************************************************************************* + * Copyright (c) 2008 - 2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#include "cl_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +typedef cl_bitfield cl_command_queue_properties; +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +typedef cl_bitfield cl_mem_migration_flags; +typedef cl_uint cl_image_info; +typedef cl_uint cl_buffer_create_type; +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +typedef cl_uint cl_program_binary_type; +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +typedef cl_uint cl_kernel_work_group_info; +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; + + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; + cl_mem buffer; +} cl_image_desc; + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define 
CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#define CL_INVALID_PROPERTY -64 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 + +/* OpenCL Version */ +#define CL_VERSION_1_0 1 +#define CL_VERSION_1_1 1 +#define CL_VERSION_1_2 1 + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
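A consumer-side sketch, not part of the header: mapping a few of the cl_int error codes defined above to printable names, the way a caller might report OpenCL failures (codes copied from the excerpt; the helper name is invented):

#include <stdio.h>

#define CL_SUCCESS                  0
#define CL_DEVICE_NOT_FOUND        -1
#define CL_BUILD_PROGRAM_FAILURE  -11
#define CL_INVALID_VALUE          -30

static const char *cl_strerror( int err )
{
    switch( err )
    {
        case CL_SUCCESS:               return "CL_SUCCESS";
        case CL_DEVICE_NOT_FOUND:      return "CL_DEVICE_NOT_FOUND";
        case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE";
        case CL_INVALID_VALUE:         return "CL_INVALID_VALUE";
        default:                       return "unrecognized OpenCL error";
    }
}

int main( void )
{
    printf( "%s\n", cl_strerror( -11 ) );
    return 0;
}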
View file
x264-snapshot-20130723-2245.tar.bz2/extras/cl_platform.h
Added
@@ -0,0 +1,1268 @@ +/********************************************************************************** + * Copyright (c) 2008-2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include <AvailabilityMacros.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #ifndef UNAVAILABLE_ATTRIBUTE + #define UNAVAILABLE_ATTRIBUTE + #endif + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #else + #define CL_API_SUFFIX__VERSION_1_0 UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE + #endif + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + #else + #define CL_API_SUFFIX__VERSION_1_1 UNAVAILABLE_ATTRIBUTE + #define GCL_API_SUFFIX__VERSION_1_1 UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATE CL_EXT_SUFFIX__VERSION_1_0 + #endif + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define 
CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + #else + #define CL_API_SUFFIX__VERSION_1_2 UNAVAILABLE_ATTRIBUTE + #define GCL_API_SUFFIX__VERSION_1_2 UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX__VERSION_1_1 + #endif +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_API_SUFFIX__VERSION_1_2 + #define CL_EXT_SUFFIX__VERSION_1_2 + + #ifdef __GNUC__ + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif + #elif _WIN32 + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) + #endif + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif +#endif + +#if (defined (_WIN32) && defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define 
CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561
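The typedefs above pin the OpenCL scalar types to exact widths on MSVC; the non-MSVC branch of the header (truncated here) is assumed to do the same via C99 types. A C11 sanity sketch of the sizes the spec requires, under that assumption:

#include <stdint.h>

typedef int8_t   cl_char;
typedef int32_t  cl_int;
typedef uint64_t cl_ulong;
typedef float    cl_float;
typedef double   cl_double;

/* widths OpenCL mandates regardless of the host ABI */
_Static_assert( sizeof(cl_char)   == 1, "cl_char must be 8-bit" );
_Static_assert( sizeof(cl_int)    == 4, "cl_int must be 32-bit" );
_Static_assert( sizeof(cl_ulong)  == 8, "cl_ulong must be 64-bit" );
_Static_assert( sizeof(cl_float)  == 4, "cl_float must be 32-bit" );
_Static_assert( sizeof(cl_double) == 8, "cl_double must be 64-bit" );

int main( void ) { return 0; }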
View file
x264-snapshot-20130723-2245.tar.bz2/extras/windowsPorts
Added
+(directory)
View file
x264-snapshot-20130723-2245.tar.bz2/extras/windowsPorts/basicDataTypeConversions.h
Added
@@ -0,0 +1,85 @@ +#ifndef __DATA_TYPE_CONVERSIONS_H__ +#define __DATA_TYPE_CONVERSIONS_H__ + +#include <stdint.h> +#include <wchar.h> + +#ifdef __cplusplus +namespace avxsynth { +#endif // __cplusplus + +typedef int64_t __int64; +typedef int32_t __int32; +#ifdef __cplusplus +typedef bool BOOL; +#else +typedef uint32_t BOOL; +#endif // __cplusplus +typedef void* HMODULE; +typedef void* LPVOID; +typedef void* PVOID; +typedef PVOID HANDLE; +typedef HANDLE HWND; +typedef HANDLE HINSTANCE; +typedef void* HDC; +typedef void* HBITMAP; +typedef void* HICON; +typedef void* HFONT; +typedef void* HGDIOBJ; +typedef void* HBRUSH; +typedef void* HMMIO; +typedef void* HACMSTREAM; +typedef void* HACMDRIVER; +typedef void* HIC; +typedef void* HACMOBJ; +typedef HACMSTREAM* LPHACMSTREAM; +typedef void* HACMDRIVERID; +typedef void* LPHACMDRIVER; +typedef unsigned char BYTE; +typedef BYTE* LPBYTE; +typedef char TCHAR; +typedef TCHAR* LPTSTR; +typedef const TCHAR* LPCTSTR; +typedef char* LPSTR; +typedef LPSTR LPOLESTR; +typedef const char* LPCSTR; +typedef LPCSTR LPCOLESTR; +typedef wchar_t WCHAR; +typedef unsigned short WORD; +typedef unsigned int UINT; +typedef UINT MMRESULT; +typedef uint32_t DWORD; +typedef DWORD COLORREF; +typedef DWORD FOURCC; +typedef DWORD HRESULT; +typedef DWORD* LPDWORD; +typedef DWORD* DWORD_PTR; +typedef int32_t LONG; +typedef int32_t* LONG_PTR; +typedef LONG_PTR LRESULT; +typedef uint32_t ULONG; +typedef uint32_t* ULONG_PTR; +//typedef __int64_t intptr_t; +typedef uint64_t _fsize_t; + + +// +// Structures +// + +typedef struct _GUID { + DWORD Data1; + WORD Data2; + WORD Data3; + BYTE Data4[8]; +} GUID; + +typedef GUID REFIID; +typedef GUID CLSID; +typedef CLSID* LPCLSID; +typedef GUID IID; + +#ifdef __cplusplus +}; // namespace avxsynth +#endif // __cplusplus +#endif // __DATA_TYPE_CONVERSIONS_H__
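The shim above works because every Windows alias is bound to an exact-width C99 type, so shared structures keep their byte-for-byte layout on Linux. A C11 sketch checking the GUID case from the header (4 + 2 + 2 + 8 bytes, no padding):

#include <stdint.h>

typedef uint32_t       DWORD;
typedef unsigned short WORD;
typedef unsigned char  BYTE;

typedef struct _GUID {
    DWORD Data1;
    WORD  Data2;
    WORD  Data3;
    BYTE  Data4[8];
} GUID;

_Static_assert( sizeof(GUID) == 16, "GUID must stay 16 bytes" );

int main( void ) { return 0; }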
View file
x264-snapshot-20130723-2245.tar.bz2/extras/windowsPorts/windows2linux.h
Added
@@ -0,0 +1,77 @@ +#ifndef __WINDOWS2LINUX_H__ +#define __WINDOWS2LINUX_H__ + +/* + * LINUX SPECIFIC DEFINITIONS +*/ +// +// Data types conversions +// +#include <stdlib.h> +#include <string.h> +#include "basicDataTypeConversions.h" + +#ifdef __cplusplus +namespace avxsynth { +#endif // __cplusplus +// +// purposefully define the following MSFT definitions +// to mean nothing (as they do not mean anything on Linux) +// +#define __stdcall +#define __cdecl +#define noreturn +#define __declspec(x) +#define STDAPI extern "C" HRESULT +#define STDMETHODIMP HRESULT __stdcall +#define STDMETHODIMP_(x) x __stdcall + +#define STDMETHOD(x) virtual HRESULT x +#define STDMETHOD_(a, x) virtual a x + +#ifndef TRUE +#define TRUE true +#endif + +#ifndef FALSE +#define FALSE false +#endif + +#define S_OK (0x00000000) +#define S_FALSE (0x00000001) +#define E_NOINTERFACE (0X80004002) +#define E_POINTER (0x80004003) +#define E_FAIL (0x80004005) +#define E_OUTOFMEMORY (0x8007000E) + +#define INVALID_HANDLE_VALUE ((HANDLE)((LONG_PTR)-1)) +#define FAILED(hr) ((hr) & 0x80000000) +#define SUCCEEDED(hr) (!FAILED(hr)) + + +// +// Functions +// +#define MAKEDWORD(a,b,c,d) ((a << 24) | (b << 16) | (c << 8) | (d)) +#define MAKEWORD(a,b) ((a << 8) | (b)) + +#define lstrlen strlen +#define lstrcpy strcpy +#define lstrcmpi strcasecmp +#define _stricmp strcasecmp +#define InterlockedIncrement(x) __sync_fetch_and_add((x), 1) +#define InterlockedDecrement(x) __sync_fetch_and_sub((x), 1) +// Windows uses (new, old) ordering but GCC has (old, new) +#define InterlockedCompareExchange(x,y,z) __sync_val_compare_and_swap(x,z,y) + +#define UInt32x32To64(a, b) ( (uint64_t) ( ((uint64_t)((uint32_t)(a))) * ((uint32_t)(b)) ) ) +#define Int64ShrlMod32(a, b) ( (uint64_t) ( (uint64_t)(a) >> (b) ) ) +#define Int32x32To64(a, b) ((__int64)(((__int64)((long)(a))) * ((long)(b)))) + +#define MulDiv(nNumber, nNumerator, nDenominator) (int32_t) (((int64_t) (nNumber) * (int64_t) (nNumerator) + (int64_t) ((nDenominator)/2)) / (int64_t) (nDenominator)) + +#ifdef __cplusplus +}; // namespace avxsynth +#endif // __cplusplus + +#endif // __WINDOWS2LINUX_H__
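Among the macros above, MulDiv() deserves a note: it widens to 64 bits before multiplying and adds den/2 before dividing, so the result is rounded to nearest rather than truncated. A quick standalone check of that behaviour:

#include <stdint.h>
#include <stdio.h>

#define MulDiv(n, num, den) \
    (int32_t)(((int64_t)(n) * (int64_t)(num) + (int64_t)((den)/2)) / (int64_t)(den))

int main( void )
{
    /* 7*10/4 = 17.5, rounds up to 18; plain integer division would give 17 */
    printf( "%d vs %d\n", MulDiv( 7, 10, 4 ), (7 * 10) / 4 );
    return 0;
}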
View file
x264-snapshot-20130224-2245.tar.bz2/input/avs.c -> x264-snapshot-20130723-2245.tar.bz2/input/avs.c
Changed
@@ -24,12 +24,30 @@ *****************************************************************************/ #include "input.h" +#if USE_AVXSYNTH +#include <dlfcn.h> +#if SYS_MACOSX +#define avs_open dlopen( "libavxsynth.dylib", RTLD_NOW ) +#else +#define avs_open dlopen( "libavxsynth.so", RTLD_NOW ) +#endif +#define avs_close dlclose +#define avs_address dlsym +#else #include <windows.h> +#define avs_open LoadLibrary( "avisynth" ) +#define avs_close FreeLibrary +#define avs_address GetProcAddress +#endif #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "avs", __VA_ARGS__ ) #define AVSC_NO_DECLSPEC #undef EXTERN_C +#if USE_AVXSYNTH +#include "extras/avxsynth_c.h" +#else #include "extras/avisynth_c.h" +#endif #define AVSC_DECLARE_FUNC(name) name##_func name /* AVS uses a versioned interface to control backwards compatibility */ @@ -40,12 +58,20 @@ #include <libavutil/pixfmt.h> #endif +/* AvxSynth doesn't have yv24, yv16, yv411, or y8, so disable them. */ +#if USE_AVXSYNTH +#define avs_is_yv24( vi ) 0 +#define avs_is_yv16( vi ) 0 +#define avs_is_yv411( vi ) 0 +#define avs_is_y8( vi ) 0 +#endif + /* maximum size of the sequence of filters to try on non script files */ #define AVS_MAX_SEQUENCE 5 #define LOAD_AVS_FUNC(name, continue_on_fail)\ {\ - h->func.name = (void*)GetProcAddress( h->library, #name );\ + h->func.name = (void*)avs_address( h->library, #name );\ if( !continue_on_fail && !h->func.name )\ goto fail;\ } @@ -76,7 +102,7 @@ /* load the library and functions we require from it */ static int x264_avs_load_library( avs_hnd_t *h ) { - h->library = LoadLibrary( "avisynth" ); + h->library = avs_open; if( !h->library ) return -1; LOAD_AVS_FUNC( avs_clip_get_error, 0 ); @@ -93,7 +119,7 @@ LOAD_AVS_FUNC( avs_take_clip, 0 ); return 0; fail: - FreeLibrary( h->library ); + avs_close( h->library ); return -1; } @@ -101,6 +127,9 @@ static void avs_build_filter_sequence( char *filename_ext, const char *filter[AVS_MAX_SEQUENCE+1] ) { int i = 0; +#if USE_AVXSYNTH + const char *all_purpose[] = { "FFVideoSource", 0 }; +#else const char *all_purpose[] = { "FFmpegSource2", "DSS2", "DirectShowSource", 0 }; if( !strcasecmp( filename_ext, "avi" ) ) filter[i++] = "AVISource"; @@ -108,6 +137,7 @@ filter[i++] = "MPEG2Source"; if( !strcasecmp( filename_ext, "dga" ) ) filter[i++] = "AVCSource"; +#endif for( int j = 0; all_purpose[j] && i < AVS_MAX_SEQUENCE; j++ ) filter[i++] = all_purpose[j]; } @@ -123,6 +153,13 @@ static float get_avs_version( avs_hnd_t *h ) { +/* AvxSynth has its version defined starting at 4.0, even though it's based on + AviSynth 2.5.8. This is troublesome for get_avs_version and working around + the new colorspaces in 2.6. So if AvxSynth is detected, explicitly define + the version as 2.58. 
*/ +#if USE_AVXSYNTH + return 2.58f; +#else FAIL_IF_ERROR( !h->func.avs_function_exists( h->env, "VersionNumber" ), "VersionNumber does not exist\n" ) AVS_Value ver = h->func.avs_invoke( h->env, "VersionNumber", avs_new_value_array( NULL, 0 ), NULL ); FAIL_IF_ERROR( avs_is_error( ver ), "unable to determine avisynth version: %s\n", avs_as_error( ver ) ) @@ -130,6 +167,7 @@ float ret = avs_as_float( ver ); h->func.avs_release_value( ver ); return ret; +#endif } static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt ) @@ -219,11 +257,11 @@ } #if !HAVE_SWSCALE /* if swscale is not available, convert the CSP if necessary */ + FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444), + "avisynth >= 2.6 is required for i422/i444 output\n" ) if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I422 && !avs_is_yv16( vi )) || (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) || (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) ) { - FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444), - "avisynth >= 2.6 is required for i422/i444 output\n" ) const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" : opt->output_csp == X264_CSP_I422 ? "YV16" : @@ -270,6 +308,7 @@ opt->input_range = opt->output_range; } #endif + h->func.avs_release_value( res ); info->width = vi->width; @@ -357,7 +396,7 @@ h->func.avs_release_clip( h->clip ); if( h->func.avs_delete_script_environment ) h->func.avs_delete_script_environment( h->env ); - FreeLibrary( h->library ); + avs_close( h->library ); free( h ); return 0; }
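The change above swaps LoadLibrary/GetProcAddress for dlopen/dlsym when USE_AVXSYNTH is set. A minimal sketch of that loading pattern on Linux, checking only that a symbol resolves (the symbol name is taken from the diff; link with -ldl):

#include <dlfcn.h>
#include <stdio.h>

int main( void )
{
    void *lib = dlopen( "libavxsynth.so", RTLD_NOW ); /* avs_open on Linux */
    if( !lib )
    {
        fprintf( stderr, "dlopen failed: %s\n", dlerror() );
        return 1;
    }
    void *fn = dlsym( lib, "avs_invoke" );            /* avs_address(...) */
    printf( "avs_invoke %s\n", fn ? "resolved" : "missing" );
    dlclose( lib );                                   /* avs_close */
    return 0;
}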
View file
x264-snapshot-20130224-2245.tar.bz2/input/lavf.c -> x264-snapshot-20130723-2245.tar.bz2/input/lavf.c
Changed
@@ -183,8 +183,8 @@ h->stream_id = i; h->next_frame = 0; AVCodecContext *c = h->lavf->streams[i]->codec; - info->fps_num = h->lavf->streams[i]->r_frame_rate.num; - info->fps_den = h->lavf->streams[i]->r_frame_rate.den; + info->fps_num = h->lavf->streams[i]->avg_frame_rate.num; + info->fps_den = h->lavf->streams[i]->avg_frame_rate.den; info->timebase_num = h->lavf->streams[i]->time_base.num; info->timebase_den = h->lavf->streams[i]->time_base.den; /* lavf is thread unsafe as calling av_read_frame invalidates previously read AVPackets */
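This hunk moves the frame-rate source from r_frame_rate to avg_frame_rate. A minimal libavformat probe showing where that rational lives, assuming a reasonably modern FFmpeg (stream 0 used for brevity):

#include <stdio.h>
#include <libavformat/avformat.h>

int main( int argc, char **argv )
{
    if( argc < 2 )
        return 1;
    AVFormatContext *fmt = NULL;
    if( avformat_open_input( &fmt, argv[1], NULL, NULL ) < 0 )
        return 1;
    if( avformat_find_stream_info( fmt, NULL ) >= 0 && fmt->nb_streams > 0 )
    {
        AVRational fps = fmt->streams[0]->avg_frame_rate; /* what the patch reads */
        printf( "fps = %d/%d\n", fps.num, fps.den );
    }
    avformat_close_input( &fmt );
    return 0;
}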
View file
x264-snapshot-20130224-2245.tar.bz2/input/y4m.c -> x264-snapshot-20130723-2245.tar.bz2/input/y4m.c
Changed
@@ -46,7 +46,6 @@ static int parse_csp_and_depth( char *csp_name, int *bit_depth ) { int csp = X264_CSP_MAX; - *bit_depth = 8; /* Set colorspace from known variants */ if( !strncmp( "420", csp_name, 3 ) ) @@ -57,8 +56,8 @@ csp = X264_CSP_I444; /* Set high bit depth from known extensions */ - if( !strncmp( "p", csp_name + 3, 1 ) ) - *bit_depth = strtol( csp_name + 4, NULL, 10 ); + if( sscanf( csp_name, "%*d%*[pP]%d", bit_depth ) != 1 ) + *bit_depth = 8; return csp; }
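The rewritten parser above lets sscanf() skip the leading colorspace digits and a 'p'/'P' separator before reading the bit depth, falling back to 8 when no depth suffix is present. A standalone check of that format string:

#include <stdio.h>

static int parse_depth( const char *csp_name )
{
    int bit_depth;
    /* %*d skips the colorspace digits, %*[pP] the separator */
    if( sscanf( csp_name, "%*d%*[pP]%d", &bit_depth ) != 1 )
        bit_depth = 8;
    return bit_depth;
}

int main( void )
{
    printf( "420p10 -> %d, 422P16 -> %d, 444 -> %d\n",
            parse_depth( "420p10" ), parse_depth( "422P16" ), parse_depth( "444" ) );
    return 0;
}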
View file
x264-snapshot-20130224-2245.tar.bz2/tools/checkasm-a.asm -> x264-snapshot-20130723-2245.tar.bz2/tools/checkasm-a.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2008-2013 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -88,8 +88,7 @@ ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ) ;----------------------------------------------------------------------------- INIT_XMM -cglobal checkasm_call, 2,15,16 - SUB rsp, max_args*8+16 +cglobal checkasm_call, 2,15,16,max_args*8+8 mov r6, r0 mov [rsp+max_args*8], r1 @@ -158,7 +157,6 @@ mov dword [r1], 0 mov rax, r9 .ok: - ADD rsp, max_args*8+16 RET %else @@ -207,8 +205,12 @@ ; int x264_stack_pagealign( int (*func)(), int align ) ;----------------------------------------------------------------------------- cglobal stack_pagealign, 2,2 + movsxdifnidn r1, r1d push rbp mov rbp, rsp +%if WIN64 + sub rsp, 32 ; shadow space +%endif and rsp, ~0xfff sub rsp, r1 call r0
View file
x264-snapshot-20130224-2245.tar.bz2/tools/checkasm.c -> x264-snapshot-20130723-2245.tar.bz2/tools/checkasm.c
Changed
@@ -61,7 +61,7 @@ { void *pointer; // just for detecting duplicates uint32_t cpu; - uint32_t cycles; + uint64_t cycles; uint32_t den; } bench_t; @@ -137,12 +137,12 @@ static void print_bench(void) { - uint16_t nops[10000] = {0}; + uint16_t nops[10000]; int nfuncs, nop_time=0; for( int i = 0; i < 10000; i++ ) { - int t = read_time(); + uint32_t t = read_time(); nops[i] = read_time() - t; } qsort( nops, 10000, sizeof(uint16_t), cmp_nop ); @@ -164,6 +164,7 @@ if( k < j ) continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, +#if HAVE_MMX b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" : b->cpu&X264_CPU_AVX2 ? "avx2" : b->cpu&X264_CPU_FMA3 ? "fma3" : @@ -176,21 +177,30 @@ /* print sse2slow only if there's also a sse2fast version of the same func */ b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" : b->cpu&X264_CPU_SSE2 ? "sse2" : + b->cpu&X264_CPU_SSE ? "sse" : b->cpu&X264_CPU_MMX ? "mmx" : +#elif ARCH_PPC b->cpu&X264_CPU_ALTIVEC ? "altivec" : +#elif ARCH_ARM b->cpu&X264_CPU_NEON ? "neon" : - b->cpu&X264_CPU_ARMV6 ? "armv6" : "c", + b->cpu&X264_CPU_ARMV6 ? "armv6" : +#endif + "c", +#if HAVE_MMX b->cpu&X264_CPU_CACHELINE_32 ? "_c32" : + b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" : b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : - b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" : + b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" : b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : b->cpu&X264_CPU_LZCNT ? "_lzcnt" : b->cpu&X264_CPU_BMI2 ? "_bmi2" : - b->cpu&X264_CPU_TBM ? "_tbm" : b->cpu&X264_CPU_BMI1 ? "_bmi1" : - b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" : - b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "", + b->cpu&X264_CPU_SLOW_ATOM ? "_atom" : +#elif ARCH_ARM + b->cpu&X264_CPU_FAST_NEON_MRC ? 
"_fast_mrc" : +#endif + "", ((int64_t)10*b->cycles/b->den - nop_time)/4 ); } } @@ -231,7 +241,7 @@ #define call_bench(func,cpu,...)\ if( do_bench && !strncmp(func_name, bench_pattern, bench_pattern_len) )\ {\ - uint32_t tsum = 0;\ + uint64_t tsum = 0;\ int tcount = 0;\ call_a1(func, __VA_ARGS__);\ for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\ @@ -242,7 +252,7 @@ func(__VA_ARGS__);\ func(__VA_ARGS__);\ t = read_time() - t;\ - if( t*tcount <= tsum*4 && ti > 0 )\ + if( (uint64_t)t*tcount <= tsum*4 && ti > 0 )\ {\ tsum += t;\ tcount++;\ @@ -299,7 +309,7 @@ #define TEST_PIXEL( name, align ) \ ok = 1, used_asm = 0; \ - for( int i = 0; i < 8; i++ ) \ + for( int i = 0; i < ARRAY_ELEMS(pixel_c.name); i++ ) \ { \ int res_c, res_asm; \ if( pixel_asm.name[i] != pixel_ref.name[i] ) \ @@ -337,11 +347,49 @@ TEST_PIXEL( satd, 0 ); TEST_PIXEL( sa8d, 1 ); + ok = 1, used_asm = 0; + if( pixel_asm.sa8d_satd[PIXEL_16x16] != pixel_ref.sa8d_satd[PIXEL_16x16] ) + { + set_func_name( "sa8d_satd_%s", pixel_names[PIXEL_16x16] ); + used_asm = 1; + for( int j = 0; j < 64; j++ ) + { + uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 ); + uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 ); + uint64_t res_a = call_a( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 ); + uint32_t cost8_a = res_a; + uint32_t cost4_a = res_a >> 32; + if( cost8_a != cost8_c || cost4_a != cost4_c ) + { + ok = 0; + fprintf( stderr, "sa8d_satd [%d]: (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16, + cost8_c, cost4_c, cost8_a, cost4_a ); + break; + } + } + for( int j = 0; j < 0x1000 && ok; j += 256 ) \ + { + uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); + uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); + uint64_t res_a = pixel_asm.sa8d_satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); + uint32_t cost8_a = res_a; + uint32_t cost4_a = res_a >> 32; + if( cost8_a != cost8_c || cost4_a != cost4_c ) + { + ok = 0; + fprintf( stderr, "sa8d_satd [%d]: overflow (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16, + cost8_c, cost4_c, cost8_a, cost4_a ); + } + } + } + report( "pixel sa8d_satd :" ); + #define TEST_PIXEL_X( N ) \ ok = 1; used_asm = 0; \ for( int i = 0; i < 7; i++ ) \ { \ - int res_c[4]={0}, res_asm[4]={0}; \ + ALIGNED_16( int res_c[4] ) = {0}; \ + ALIGNED_16( int res_asm[4] ) = {0}; \ if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \ { \ set_func_name( "sad_x%d_%s", N, pixel_names[i] ); \ @@ -494,7 +542,8 @@ #define TEST_INTRA_X3( name, i8x8, ... ) \ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ { \ - int res_c[3], res_asm[3]; \ + ALIGNED_16( int res_c[3] ); \ + ALIGNED_16( int res_asm[3] ); \ set_func_name( #name ); \ used_asm = 1; \ call_c( pixel_c.name, pbuf1+48, i8x8 ? 
edge : pbuf3+48, res_c ); \ @@ -696,8 +745,8 @@ { ALIGNED_16( uint16_t sums[72] ); ALIGNED_16( int dc[4] ); - ALIGNED_16( int16_t mvs_a[32] ); - ALIGNED_16( int16_t mvs_c[32] ); + ALIGNED_16( int16_t mvs_a[48] ); + ALIGNED_16( int16_t mvs_c[48] ); int mvn_a, mvn_c; int thresh = rand() & 0x3fff; set_func_name( "esa_ads" ); @@ -732,10 +781,10 @@ x264_dct_function_t dct_asm; x264_quant_function_t qf; int ret = 0, ok, used_asm, interlace = 0; - ALIGNED_16( dctcoef dct1[16][16] ); - ALIGNED_16( dctcoef dct2[16][16] ); - ALIGNED_16( dctcoef dct4[16][16] ); - ALIGNED_16( dctcoef dct8[4][64] ); + ALIGNED_ARRAY_N( dctcoef, dct1, [16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct2, [16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4, [16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct8, [4],[64] ); ALIGNED_16( dctcoef dctdc[2][8] ); x264_t h_buf; x264_t *h = &h_buf; @@ -1030,7 +1079,7 @@ call_a( zigzag_asm[interlace].name, t2, dct, buf4 ); \ if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( buf3, buf4, 10 ) ) \ { \ - ok = 0; \ + ok = 0; printf("%d: %d %d %d %d\n%d %d %d %d\n\n",memcmp( t1, t2, size*sizeof(dctcoef) ),buf3[0], buf3[1], buf3[8], buf3[9], buf4[0], buf4[1], buf4[8], buf4[9]);break;\ } \ } \ } @@ -1040,13 +1089,13 @@ x264_zigzag_init( cpu_new, &zigzag_asm[0], &zigzag_asm[1] ); ok = 1; used_asm = 0; - TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0], 64 ); + TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct8[0], 64 ); report( "zigzag_interleave :" ); for( interlace = 0; interlace <= 1; interlace++ )
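The checkasm changes above widen the cycle accumulators from uint32_t to uint64_t so long benchmark runs cannot wrap, while keeping the 4x outlier rejection. A stripped-down sketch of that accumulate-and-reject loop, substituting the compiler's __rdtsc() for x264's read_time() (x86 only, gcc/clang):

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

static void workload( void )
{
    static volatile int sink;
    for( int i = 0; i < 1000; i++ )
        sink += i;
}

int main( void )
{
    uint64_t tsum = 0; /* was 32-bit before the patch and could overflow */
    int tcount = 0;
    for( int ti = 0; ti < 2000; ti++ )
    {
        uint32_t t = (uint32_t)__rdtsc();
        workload();
        t = (uint32_t)__rdtsc() - t;
        /* accept the sample unless it is a >4x outlier vs. the mean so far */
        if( ti > 0 && (uint64_t)t * tcount <= tsum * 4 )
        {
            tsum += t;
            tcount++;
        }
    }
    if( tcount )
        printf( "avg cycles: %llu\n", (unsigned long long)( tsum / tcount ) );
    return 0;
}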
View file
x264-snapshot-20130723-2245.tar.bz2/tools/cltostr.pl
Added
@@ -0,0 +1,65 @@ +# Perl script used for compiling OpenCL src into x264 binary +# +# Copyright (C) 2013 x264 project +# Authors: Steve Borho <sborho@multicorewareinc.com> + +use Digest::MD5 qw(md5_hex); + +# xxd takes a VAR, which will be the variable name +# and BYTES, a string of bytes to beencoded. +sub xxd +{ + my %args = @_; + my $var = $args{VAR}; + my $bytes = $args{BYTES}; + my @hexbytes; + my @bytes = split //, $$bytes; + foreach $b (@bytes) + { + push @hexbytes, sprintf("0x%02X", ord($b)); + } + + # Format 'em nice and pretty-like. + print 'static const char ' . $var . '[] = {' . "\n"; + my $count = 0; + foreach my $h (@hexbytes) + { + print "$h, "; + $count++; + if ($count == 16) + { + print "\n"; + $count = 0; + } + } + print "\n0x00 };\n\n"; + + return; +} + +if (@ARGV < 1) +{ + printf "%s: VARNAME ", $0 . "\n"; + exit(-1); +} + + +my @lines; +while(<STDIN>) +{ + s/^\s+//; # trim leading whitespace + if (/^\/\//) + { + next; # skip the line if it starts with '//' + } + push @lines, $_; +} + +my $lines = join '', @lines; +xxd(VAR => @ARGV[0], BYTES => \$lines); + +my $hash = md5_hex($lines); +@hash = ( $hash =~ m/../g ); + + +xxd(VAR => @ARGV[0] . "_hash", BYTES => \$hash);
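cltostr.pl above turns the OpenCL source read from stdin into two C arrays: the kernel text as a NUL-terminated byte string (with '//' comment lines stripped) and its MD5 digest as a 32-character hex string. A hypothetical illustration of the generated shape and one plausible use, with contents abbreviated (the variable name comes from the script's VARNAME argument):

#include <stdio.h>

static const char x264_opencl_source[] = {
    0x2F, 0x2A, 0x20, 0x6B, 0x65, 0x72, 0x6E, 0x65, /* "/* kerne"... */
    0x00 };

static const char x264_opencl_source_hash[] = {
    0x30, 0x39, 0x38, 0x66, /* "098f"... 32 hex chars in the real output */
    0x00 };

int main( void )
{
    /* e.g. stamp the hash into a cached .clbin so a stale kernel cache can
       be detected when the embedded source changes */
    printf( "%zu bytes of embedded CL source, hash starts \"%s\"\n",
            sizeof(x264_opencl_source) - 1, x264_opencl_source_hash );
    return 0;
}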
View file
x264-snapshot-20130224-2245.tar.bz2/x264.c -> x264-snapshot-20130723-2245.tar.bz2/x264.c
Changed
@@ -225,7 +225,7 @@ va_end( arg ); } -static void print_version_info() +static void print_version_info( void ) { #ifdef X264_POINTVER printf( "x264 "X264_POINTVER"\n" ); @@ -596,8 +596,11 @@ H2( " --slices <integer> Number of slices per frame; forces rectangular\n" " slices and is overridden by other slicing options\n" ); else H1( " --slices <integer> Number of slices per frame\n" ); + H2( " --slices-max <integer> Absolute maximum slices per frame; overrides\n" + " slice-max-size/slice-max-mbs when necessary\n" ); H2( " --slice-max-size <integer> Limit the size of each slice in bytes\n"); - H2( " --slice-max-mbs <integer> Limit the size of each slice in macroblocks\n"); + H2( " --slice-max-mbs <integer> Limit the size of each slice in macroblocks (max)\n"); + H2( " --slice-min-mbs <integer> Limit the size of each slice in macroblocks (min)\n"); H0( " --tff Enable interlaced mode (top field first)\n" ); H0( " --bff Enable interlaced mode (bottom field first)\n" ); H2( " --constrained-intra Enable constrained intra prediction.\n" ); @@ -743,16 +746,18 @@ H2( " --range <string> Specify color range [\"%s\"]\n" " - %s\n", range_names[0], stringify_names( buf, range_names ) ); H2( " --colorprim <string> Specify color primaries [\"%s\"]\n" - " - undef, bt709, bt470m, bt470bg\n" - " smpte170m, smpte240m, film\n", + " - undef, bt709, bt470m, bt470bg, smpte170m,\n" + " smpte240m, film, bt2020\n", strtable_lookup( x264_colorprim_names, defaults->vui.i_colorprim ) ); H2( " --transfer <string> Specify transfer characteristics [\"%s\"]\n" - " - undef, bt709, bt470m, bt470bg, linear,\n" - " log100, log316, smpte170m, smpte240m\n", + " - undef, bt709, bt470m, bt470bg, smpte170m,\n" + " smpte240m, linear, log100, log316,\n" + " iec61966-2-4, bt1361e, iec61966-2-1,\n" + " bt2020-10, bt2020-12\n", strtable_lookup( x264_transfer_names, defaults->vui.i_transfer ) ); H2( " --colormatrix <string> Specify color matrix setting [\"%s\"]\n" - " - undef, bt709, fcc, bt470bg\n" - " smpte170m, smpte240m, GBR, YCgCo\n", + " - undef, bt709, fcc, bt470bg, smpte170m,\n" + " smpte240m, GBR, YCgCo, bt2020nc, bt2020c\n", strtable_lookup( x264_colmatrix_names, defaults->vui.i_colmatrix ) ); H2( " --chromaloc <integer> Specify chroma sample location (0 to 5) [%d]\n", defaults->vui.i_chroma_loc ); @@ -787,6 +792,8 @@ H0( " --frames <integer> Maximum number of frames to encode\n" ); H0( " --level <string> Specify level (as defined by Annex A)\n" ); H1( " --bluray-compat Enable compatibility hacks for Blu-ray support\n" ); + H1( " --stitchable Don't optimize headers based on video content\n" + " Ensures ability to recombine a segmented encode\n" ); H1( "\n" ); H1( " -v, --verbose Print stats for each frame\n" ); H1( " --no-progress Don't show the progress indicator while encoding\n" ); @@ -806,6 +813,9 @@ " as opposed to letting them select different algorithms\n" ); H2( " --asm <integer> Override CPU detection\n" ); H2( " --no-asm Disable all CPU optimizations\n" ); + H2( " --opencl Enable use of OpenCL\n" ); + H2( " --opencl-clbin <string> Specify path of compiled OpenCL kernel cache\n" ); + H2( " --opencl-device <integer> Specify OpenCL device ordinal\n" ); H2( " --visualize Show MB types overlayed on the encoded video\n" ); H2( " --dump-yuv <string> Save reconstructed frames\n" ); H2( " --sps-id <integer> Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id ); @@ -910,6 +920,9 @@ { "ref", required_argument, NULL, 'r' }, { "asm", required_argument, NULL, 0 }, { "no-asm", no_argument, NULL, 0 }, + { "opencl", no_argument, 
NULL, 1 }, + { "opencl-clbin",required_argument, NULL, 0 }, + { "opencl-device",required_argument, NULL, 0 }, { "sar", required_argument, NULL, 0 }, { "fps", required_argument, NULL, OPT_FPS }, { "frames", required_argument, NULL, OPT_FRAMES }, @@ -971,7 +984,9 @@ { "no-sliced-threads", no_argument, NULL, 0 }, { "slice-max-size", required_argument, NULL, 0 }, { "slice-max-mbs", required_argument, NULL, 0 }, + { "slice-min-mbs", required_argument, NULL, 0 }, { "slices", required_argument, NULL, 0 }, + { "slices-max", required_argument, NULL, 0 }, { "thread-input", no_argument, NULL, OPT_THREAD_INPUT }, { "sync-lookahead", required_argument, NULL, 0 }, { "non-deterministic", no_argument, NULL, 0 }, @@ -1025,6 +1040,7 @@ { "dts-compress", no_argument, NULL, OPT_DTS_COMPRESSION }, { "output-csp", required_argument, NULL, OPT_OUTPUT_CSP }, { "input-range", required_argument, NULL, OPT_INPUT_RANGE }, + { "stitchable", no_argument, NULL, 0 }, {0, 0, 0, 0} };
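The new switches above (--opencl, --opencl-clbin, --opencl-device, --slices-max, --slice-min-mbs, --stitchable) map onto fields added to x264_param_t in the x264.h hunk below. A hedged sketch of setting a few of them through the API instead of the CLI (values are illustrative only and require a libx264 built from this snapshot or later):

#include <stdint.h>
#include <x264.h>

int main( void )
{
    x264_param_t param;
    if( x264_param_default_preset( &param, "medium", NULL ) < 0 )
        return 1;
    param.i_width         = 640; /* minimal valid geometry for the sketch */
    param.i_height        = 360;
    param.b_opencl        = 1;   /* --opencl */
    param.i_opencl_device = 0;   /* --opencl-device */
    param.i_slice_max_mbs = 100; /* --slice-max-mbs */
    param.i_slice_min_mbs = 4;   /* --slice-min-mbs */
    param.b_stitchable    = 1;   /* --stitchable */
    x264_t *h = x264_encoder_open( &param );
    if( h )
        x264_encoder_close( h );
    return 0;
}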
View file
x264-snapshot-20130224-2245.tar.bz2/x264.h -> x264-snapshot-20130723-2245.tar.bz2/x264.h
Changed
@@ -28,7 +28,7 @@ #ifndef X264_X264_H #define X264_X264_H -#if !defined(_STDINT_H) && !defined(_STDINT_H_) && \ +#if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) &&\ !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) # ifdef _MSC_VER # pragma message("You must include stdint.h or inttypes.h before x264.h") @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 129 +#define X264_BUILD 135 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -109,43 +109,53 @@ /**************************************************************************** * Encoder parameters ****************************************************************************/ -/* CPU flags - */ -#define X264_CPU_CACHELINE_32 0x0000001 /* avoid memory loads that span the border between two cachelines */ -#define X264_CPU_CACHELINE_64 0x0000002 /* 32/64 is the size of a cacheline in bytes */ -#define X264_CPU_ALTIVEC 0x0000004 -#define X264_CPU_MMX 0x0000008 -#define X264_CPU_MMX2 0x0000010 /* MMX2 aka MMXEXT aka ISSE */ +/* CPU flags */ + +/* x86 */ +#define X264_CPU_CMOV 0x0000001 +#define X264_CPU_MMX 0x0000002 +#define X264_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */ #define X264_CPU_MMXEXT X264_CPU_MMX2 -#define X264_CPU_SSE 0x0000020 -#define X264_CPU_SSE2 0x0000040 -#define X264_CPU_SSE2_IS_SLOW 0x0000080 /* avoid most SSE2 functions on Athlon64 */ -#define X264_CPU_SSE2_IS_FAST 0x0000100 /* a few functions are only faster on Core2 and Phenom */ -#define X264_CPU_SSE3 0x0000200 -#define X264_CPU_SSSE3 0x0000400 -#define X264_CPU_SHUFFLE_IS_FAST 0x0000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */ -#define X264_CPU_STACK_MOD4 0x0001000 /* if stack is only mod4 and not mod16 */ -#define X264_CPU_SSE4 0x0002000 /* SSE4.1 */ -#define X264_CPU_SSE42 0x0004000 /* SSE4.2 */ -#define X264_CPU_SSE_MISALIGN 0x0008000 /* Phenom support for misaligned SSE instruction arguments */ -#define X264_CPU_LZCNT 0x0010000 /* Phenom support for "leading zero count" instruction. */ -#define X264_CPU_ARMV6 0x0020000 -#define X264_CPU_NEON 0x0040000 /* ARM NEON */ -#define X264_CPU_FAST_NEON_MRC 0x0080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ -#define X264_CPU_SLOW_CTZ 0x0100000 /* BSR/BSF x86 instructions are really slow on some CPUs */ -#define X264_CPU_SLOW_ATOM 0x0200000 /* The Atom just sucks */ -#define X264_CPU_AVX 0x0400000 /* AVX support: requires OS support even if YMM registers - * aren't used. */ -#define X264_CPU_XOP 0x0800000 /* AMD XOP */ -#define X264_CPU_FMA4 0x1000000 /* AMD FMA4 */ -#define X264_CPU_AVX2 0x2000000 /* AVX2 */ -#define X264_CPU_FMA3 0x4000000 /* Intel FMA3 */ -#define X264_CPU_BMI1 0x8000000 /* BMI1 */ -#define X264_CPU_BMI2 0x10000000 /* BMI2 */ -#define X264_CPU_TBM 0x20000000 /* AMD TBM */ - -/* Analyse flags - */ +#define X264_CPU_SSE 0x0000008 +#define X264_CPU_SSE2 0x0000010 +#define X264_CPU_SSE3 0x0000020 +#define X264_CPU_SSSE3 0x0000040 +#define X264_CPU_SSE4 0x0000080 /* SSE4.1 */ +#define X264_CPU_SSE42 0x0000100 /* SSE4.2 */ +#define X264_CPU_SSE_MISALIGN 0x0000200 /* Phenom support for misaligned SSE instruction arguments */ +#define X264_CPU_LZCNT 0x0000400 /* Phenom support for "leading zero count" instruction. */ +#define X264_CPU_AVX 0x0000800 /* AVX support: requires OS support even if YMM registers aren't used. 
*/ +#define X264_CPU_XOP 0x0001000 /* AMD XOP */ +#define X264_CPU_FMA4 0x0002000 /* AMD FMA4 */ +#define X264_CPU_AVX2 0x0004000 /* AVX2 */ +#define X264_CPU_FMA3 0x0008000 /* Intel FMA3 */ +#define X264_CPU_BMI1 0x0010000 /* BMI1 */ +#define X264_CPU_BMI2 0x0020000 /* BMI2 */ +/* x86 modifiers */ +#define X264_CPU_CACHELINE_32 0x0040000 /* avoid memory loads that span the border between two cachelines */ +#define X264_CPU_CACHELINE_64 0x0080000 /* 32/64 is the size of a cacheline in bytes */ +#define X264_CPU_SSE2_IS_SLOW 0x0100000 /* avoid most SSE2 functions on Athlon64 */ +#define X264_CPU_SSE2_IS_FAST 0x0200000 /* a few functions are only faster on Core2 and Phenom */ +#define X264_CPU_SLOW_SHUFFLE 0x0400000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */ +#define X264_CPU_STACK_MOD4 0x0800000 /* if stack is only mod4 and not mod16 */ +#define X264_CPU_SLOW_CTZ 0x1000000 /* BSR/BSF x86 instructions are really slow on some CPUs */ +#define X264_CPU_SLOW_ATOM 0x2000000 /* The Atom is terrible: slow SSE unaligned loads, slow + * SIMD multiplies, slow SIMD variable shifts, slow pshufb, + * cacheline split penalties -- gather everything here that + * isn't shared by other CPUs to avoid making half a dozen + * new SLOW flags. */ +#define X264_CPU_SLOW_PSHUFB 0x4000000 /* such as on the Intel Atom */ +#define X264_CPU_SLOW_PALIGNR 0x8000000 /* such as on the AMD Bobcat */ + +/* PowerPC */ +#define X264_CPU_ALTIVEC 0x0000001 + +/* ARM */ +#define X264_CPU_ARMV6 0x0000001 +#define X264_CPU_NEON 0x0000002 /* ARM NEON */ +#define X264_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ + +/* Analyse flags */ #define X264_ANALYSE_I4x4 0x0001 /* Analyse i4x4 */ #define X264_ANALYSE_I8x8 0x0002 /* Analyse i8x8 (requires 8x8 transform) */ #define X264_ANALYSE_PSUB16x16 0x0010 /* Analyse p16x8, p8x16 and p8x8 */ @@ -188,9 +198,10 @@ static const char * const x264_overscan_names[] = { "undef", "show", "crop", 0 }; static const char * const x264_vidformat_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 }; static const char * const x264_fullrange_names[] = { "off", "on", 0 }; -static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", 0 }; -static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316", 0 }; -static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", 0 }; +static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", 0 }; +static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316", + "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", 0 }; +static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", "bt2020nc", "bt2020c", 0 }; static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 }; /* Colorspace type */ @@ -464,10 +475,23 @@ int b_fake_interlaced; + /* Don't optimize header parameters based on video content, e.g. ensure that splitting an input video, compressing + * each part, and stitching them back together will result in identical SPS/PPS. 
This is necessary for stitching + * with container formats that don't allow multiple SPS/PPS. */ + int b_stitchable; + + int b_opencl; /* use OpenCL when available */ + int i_opencl_device; /* specify count of GPU devices to skip, for CLI users */ + void *opencl_device_id; /* pass explicit cl_device_id as void*, for API users */ + char *psz_clbin_file; /* compiled OpenCL kernel cache file */ + /* Slicing parameters */ int i_slice_max_size; /* Max size per slice in bytes; includes estimated NAL overhead. */ int i_slice_max_mbs; /* Max number of MBs per slice; overrides i_slice_count. */ + int i_slice_min_mbs; /* Min number of MBs per slice */ int i_slice_count; /* Number of slices per frame: forces rectangular slices. */ + int i_slice_count_max; /* Absolute cap on slices per frame; stops applying slice-max-size + * and slice-max-mbs if this is reached. */ /* Optional callback for freeing this x264_param_t when it is done being used. * Only used when the x264_param_t sits in memory for an indefinite period of time, @@ -481,7 +505,7 @@ * is done encoding. * * This callback MUST do the following in order to work correctly: - * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 16. + * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 64. * 2) Call x264_nal_encode( h, dst, nal ), where dst is the output buffer. * After these steps, the content of nal is valid and can be used in the same way as if * the NAL unit were output by x264_encoder_encode. @@ -834,7 +858,13 @@ * due to delay, this may not be the next frame passed to encoder_encode. * if the change should apply to some particular frame, use x264_picture_t->param instead. * returns 0 on success, negative on parameter validation error. - * not all parameters can be changed; see the actual function for a detailed breakdown. */ + * not all parameters can be changed; see the actual function for a detailed breakdown. + * + * since not all parameters can be changed, moving from preset to preset may not always + * fully copy all relevant parameters, but should still work usably in practice. however, + * more so than for other presets, many of the speed shortcuts used in ultrafast cannot be + * switched out of; using reconfig to switch between ultrafast and other presets is not + * recommended without a more fine-grained breakdown of parameters to take this into account. */ int x264_encoder_reconfig( x264_t *, x264_param_t * ); /* x264_encoder_parameters: * copies the current internal set of parameters to the pointer provided
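One more header note: the documented worst-case output buffer for the low-latency NAL callback grows from i_payload*3/2 + 5 + 16 to i_payload*3/2 + 5 + 64 bytes. A fragment-style sketch of the sizing a nalu_process callback must now use before x264_nal_encode() (error signalling and the surrounding plumbing are omitted, and the callback name is made up; it would be wired up via x264_param_t's nalu_process pointer):

#include <stdint.h>
#include <stdlib.h>
#include <x264.h>

static void my_nalu_process( x264_t *h, x264_nal_t *nal, void *opaque )
{
    (void)opaque;
    /* start code + escaping can grow the payload by 3/2, plus 5 header
       bytes and now 64 bytes of slack per the updated contract */
    uint8_t *dst = malloc( nal->i_payload * 3 / 2 + 5 + 64 );
    if( !dst )
        return; /* a real callback must report failure out-of-band */
    x264_nal_encode( h, dst, nal );
    /* ...hand nal->p_payload / nal->i_payload to the transport here... */
    free( dst );
}

int main( void )
{
    (void)my_nalu_process; /* fragment only; not a complete encoder setup */
    return 0;
}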