Packman Build Service PMBS

libx264.changes Changed

libx264.spec Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/asm.S -> x264-snapshot-20130224-2245.tar.bz2/common/arm/asm.S Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/cpu-a.S -> x264-snapshot-20130224-2245.tar.bz2/common/arm/cpu-a.S Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/dct-a.S -> x264-snapshot-20130224-2245.tar.bz2/common/arm/dct-a.S Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/dct.h -> x264-snapshot-20130224-2245.tar.bz2/common/arm/dct.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/deblock-a.S -> x264-snapshot-20130224-2245.tar.bz2/common/arm/deblock-a.S Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/mc-a.S -> x264-snapshot-20130224-2245.tar.bz2/common/arm/mc-a.S Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/mc-c.c -> x264-snapshot-20130224-2245.tar.bz2/common/arm/mc-c.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/mc.h -> x264-snapshot-20130224-2245.tar.bz2/common/arm/mc.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/pixel-a.S -> x264-snapshot-20130224-2245.tar.bz2/common/arm/pixel-a.S Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * pixel.S: arm pixel metrics
  *****************************************************************************
- * Copyright (C) 2009-2012 x264 project
+ * Copyright (C) 2009-2013 x264 project
  *
  * Authors: David Conrad <lessen42@gmail.com>
  *
@@ -68,45 +68,45 @@
 
 
 .macro SAD_START_4 align:vararg
-    vld1.32     {d1[]}, [r2 \align], r3
+    vld1.32     {d1[]}, [r2\align], r3
     vld1.32     {d0[]}, [r0,:32], r1
     vabdl.u8    q8,  d0,  d1
 .endm
 
 .macro SAD_4 align:vararg
-    vld1.32     {d1[]}, [r2 \align], r3
+    vld1.32     {d1[]}, [r2\align], r3
     vld1.32     {d0[]}, [r0,:32], r1
     vabal.u8    q8,  d0,  d1
 .endm
 
 .macro SAD_START_8 align:vararg
-    vld1.64     {d1}, [r2 \align], r3
+    vld1.64     {d1}, [r2\align], r3
     vld1.64     {d0}, [r0,:64], r1
     vabdl.u8    q8,  d0,  d1
 .endm
 
 .macro SAD_8 align:vararg
-    vld1.64     {d1}, [r2 \align], r3
+    vld1.64     {d1}, [r2\align], r3
     vld1.64     {d0}, [r0,:64], r1
     vabal.u8    q8,  d0,  d1
 .endm
 
 .macro SAD_START_16 align:vararg
-    vld1.64     {d2-d3}, [r2 \align], r3
+    vld1.64     {d2-d3}, [r2\align], r3
     vld1.64     {d0-d1}, [r0,:128], r1
     vabdl.u8    q8,  d0,  d2
-    vld1.64     {d6-d7}, [r2 \align], r3
+    vld1.64     {d6-d7}, [r2\align], r3
     vabdl.u8    q9,  d1,  d3
     vld1.64     {d4-d5}, [r0,:128], r1
 .endm
 
 .macro SAD_16 align:vararg
     vabal.u8    q8,  d4,  d6
-    vld1.64     {d2-d3}, [r2 \align], r3
+    vld1.64     {d2-d3}, [r2\align], r3
     vabal.u8    q9,  d5,  d7
     vld1.64     {d0-d1}, [r0,:128], r1
     vabal.u8    q8,  d0,  d2
-    vld1.64     {d6-d7}, [r2 \align], r3
+    vld1.64     {d6-d7}, [r2\align], r3
     vabal.u8    q9,  d1,  d3
     vld1.64     {d4-d5}, [r0,:128], r1
 .endm

x264-snapshot-20120928-2245.tar.bz2/common/arm/pixel.h -> x264-snapshot-20130224-2245.tar.bz2/common/arm/pixel.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/predict-a.S -> x264-snapshot-20130224-2245.tar.bz2/common/arm/predict-a.S Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/predict-c.c -> x264-snapshot-20130224-2245.tar.bz2/common/arm/predict-c.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/predict.h -> x264-snapshot-20130224-2245.tar.bz2/common/arm/predict.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/quant-a.S -> x264-snapshot-20130224-2245.tar.bz2/common/arm/quant-a.S Changed

x264-snapshot-20120928-2245.tar.bz2/common/arm/quant.h -> x264-snapshot-20130224-2245.tar.bz2/common/arm/quant.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/bitstream.c -> x264-snapshot-20130224-2245.tar.bz2/common/bitstream.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/bitstream.h -> x264-snapshot-20130224-2245.tar.bz2/common/bitstream.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/cabac.c -> x264-snapshot-20130224-2245.tar.bz2/common/cabac.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/cabac.h -> x264-snapshot-20130224-2245.tar.bz2/common/cabac.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/common.c -> x264-snapshot-20130224-2245.tar.bz2/common/common.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/common.h -> x264-snapshot-20130224-2245.tar.bz2/common/common.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/cpu.c -> x264-snapshot-20130224-2245.tar.bz2/common/cpu.c Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * cpu.c: cpu detection
  *****************************************************************************
- * Copyright (C) 2003-2012 x264 project
+ * Copyright (C) 2003-2013 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
@@ -376,7 +376,10 @@
     // Note that there is potential for a race condition if another program or
     // x264 instance disables or reinits the counters while x264 is using them,
     // which may result in incorrect detection and the counters stuck enabled.
+    // right now Apple does not seem to support performance counters for this test
+#ifndef __MACH__
     flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
+#endif
     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
 #endif
     return flags;
@@ -399,7 +402,7 @@
 #elif SYS_WINDOWS
     return x264_pthread_num_processors_np();
 
-#elif SYS_CYGWIN
+#elif SYS_CYGWIN || SYS_SunOS
     return sysconf( _SC_NPROCESSORS_ONLN );
 
 #elif SYS_LINUX

x264-snapshot-20120928-2245.tar.bz2/common/cpu.h -> x264-snapshot-20130224-2245.tar.bz2/common/cpu.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/dct.c -> x264-snapshot-20130224-2245.tar.bz2/common/dct.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/dct.h -> x264-snapshot-20130224-2245.tar.bz2/common/dct.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/deblock.c -> x264-snapshot-20130224-2245.tar.bz2/common/deblock.c Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * deblock.c: deblocking
  *****************************************************************************
- * Copyright (C) 2003-2012 x264 project
+ * Copyright (C) 2003-2013 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
@@ -779,13 +779,13 @@
             pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
             pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2;
             pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2;
+            pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
+            pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
+            pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
+            pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
             if( !(cpu&X264_CPU_STACK_MOD4) )
             {
-                pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
-                pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
                 pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
-                pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
-                pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
                 pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
                 pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2;
 #if HIGH_BIT_DEPTH
@@ -801,13 +801,13 @@
             pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
             pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
             pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx;
+            pf->deblock_luma[1] = x264_deblock_v_luma_avx;
+            pf->deblock_luma[0] = x264_deblock_h_luma_avx;
+            pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
+            pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
             if( !(cpu&X264_CPU_STACK_MOD4) )
             {
-                pf->deblock_luma[1] = x264_deblock_v_luma_avx;
-                pf->deblock_luma[0] = x264_deblock_h_luma_avx;
                 pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
-                pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
-                pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
                 pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
                 pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx;
 #if HIGH_BIT_DEPTH

x264-snapshot-20120928-2245.tar.bz2/common/display-x11.c -> x264-snapshot-20130224-2245.tar.bz2/common/display-x11.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/display.h -> x264-snapshot-20130224-2245.tar.bz2/common/display.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/frame.c -> x264-snapshot-20130224-2245.tar.bz2/common/frame.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/frame.h -> x264-snapshot-20130224-2245.tar.bz2/common/frame.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/macroblock.c -> x264-snapshot-20130224-2245.tar.bz2/common/macroblock.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/macroblock.h -> x264-snapshot-20130224-2245.tar.bz2/common/macroblock.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/mc.c -> x264-snapshot-20130224-2245.tar.bz2/common/mc.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/mc.h -> x264-snapshot-20130224-2245.tar.bz2/common/mc.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/mvpred.c -> x264-snapshot-20130224-2245.tar.bz2/common/mvpred.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/osdep.c -> x264-snapshot-20130224-2245.tar.bz2/common/osdep.c Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * osdep.c: platform-specific code
  *****************************************************************************
- * Copyright (C) 2003-2012 x264 project
+ * Copyright (C) 2003-2013 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *          Laurent Aimar <fenrir@via.ecp.fr>
@@ -90,6 +90,7 @@
 }
 #endif
 
+#if HAVE_MMX
 #ifdef __INTEL_COMPILER
 /* Agner's patch to Intel's CPU dispatcher from pages 131-132 of
  * http://agner.org/optimize/optimizing_cpp.pdf (2011-01-30)
@@ -98,7 +99,7 @@
 // Global variable indicating cpu
 int __intel_cpu_indicator = 0;
 // CPU dispatcher function
-void __intel_cpu_indicator_init( void )
+void x264_intel_cpu_indicator_init( void )
 {
     unsigned int cpu = x264_cpu_detect();
     if( cpu&X264_CPU_AVX )
@@ -120,4 +121,16 @@
     else
         __intel_cpu_indicator = 1;
 }
+
+/* __intel_cpu_indicator_init appears to have a non-standard calling convention that
+ * assumes certain registers aren't preserved, so we'll route it through a function
+ * that backs up all the registers. */
+void __intel_cpu_indicator_init( void )
+{
+    x264_safe_intel_cpu_indicator_init();
+}
+#else
+void x264_intel_cpu_indicator_init( void )
+{}
+#endif
 #endif

x264-snapshot-20120928-2245.tar.bz2/common/osdep.h -> x264-snapshot-20130224-2245.tar.bz2/common/osdep.h Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * osdep.h: platform-specific code
  *****************************************************************************
- * Copyright (C) 2007-2012 x264 project
+ * Copyright (C) 2007-2013 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
@@ -108,8 +108,10 @@
     ALIGNED_16( type name sub1 __VA_ARGS__ )
 #endif
 
-#define ALIGNED_ARRAY_32( ... ) ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ )
-#define ALIGNED_ARRAY_64( ... ) ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ )
+#define EXPAND(x) x
+
+#define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
+#define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
 
 #define UNINIT(x) x=x
 
@@ -147,7 +149,7 @@
      return 0;
 }
 #define x264_pthread_join(t,s)       { long tmp; \
-                                       wait_for_thread(t,(s)?(long*)(*(s)):&tmp); }
+                                       wait_for_thread(t,(s)?(long*)(s):&tmp); }
 
 #elif HAVE_POSIXTHREAD
 #include <pthread.h>

x264-snapshot-20120928-2245.tar.bz2/common/pixel.c -> x264-snapshot-20130224-2245.tar.bz2/common/pixel.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/pixel.h -> x264-snapshot-20130224-2245.tar.bz2/common/pixel.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/ppc/dct.c -> x264-snapshot-20130224-2245.tar.bz2/common/ppc/dct.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/ppc/dct.h -> x264-snapshot-20130224-2245.tar.bz2/common/ppc/dct.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/ppc/deblock.c -> x264-snapshot-20130224-2245.tar.bz2/common/ppc/deblock.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/ppc/mc.c -> x264-snapshot-20130224-2245.tar.bz2/common/ppc/mc.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/ppc/mc.h -> x264-snapshot-20130224-2245.tar.bz2/common/ppc/mc.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/ppc/pixel.c -> x264-snapshot-20130224-2245.tar.bz2/common/ppc/pixel.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/ppc/pixel.h -> x264-snapshot-20130224-2245.tar.bz2/common/ppc/pixel.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/ppc/ppccommon.h -> x264-snapshot-20130224-2245.tar.bz2/common/ppc/ppccommon.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/ppc/predict.c -> x264-snapshot-20130224-2245.tar.bz2/common/ppc/predict.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/ppc/predict.h -> x264-snapshot-20130224-2245.tar.bz2/common/ppc/predict.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/ppc/quant.c -> x264-snapshot-20130224-2245.tar.bz2/common/ppc/quant.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/ppc/quant.h -> x264-snapshot-20130224-2245.tar.bz2/common/ppc/quant.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/predict.c -> x264-snapshot-20130224-2245.tar.bz2/common/predict.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/predict.h -> x264-snapshot-20130224-2245.tar.bz2/common/predict.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/quant.c -> x264-snapshot-20130224-2245.tar.bz2/common/quant.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/quant.h -> x264-snapshot-20130224-2245.tar.bz2/common/quant.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/rectangle.c -> x264-snapshot-20130224-2245.tar.bz2/common/rectangle.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/rectangle.h -> x264-snapshot-20130224-2245.tar.bz2/common/rectangle.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/set.c -> x264-snapshot-20130224-2245.tar.bz2/common/set.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/set.h -> x264-snapshot-20130224-2245.tar.bz2/common/set.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/sparc/pixel.asm -> x264-snapshot-20130224-2245.tar.bz2/common/sparc/pixel.asm Changed

x264-snapshot-20120928-2245.tar.bz2/common/sparc/pixel.h -> x264-snapshot-20130224-2245.tar.bz2/common/sparc/pixel.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/threadpool.c -> x264-snapshot-20130224-2245.tar.bz2/common/threadpool.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/threadpool.h -> x264-snapshot-20130224-2245.tar.bz2/common/threadpool.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/visualize.c -> x264-snapshot-20130224-2245.tar.bz2/common/visualize.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/visualize.h -> x264-snapshot-20130224-2245.tar.bz2/common/visualize.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/vlc.c -> x264-snapshot-20130224-2245.tar.bz2/common/vlc.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/win32thread.c -> x264-snapshot-20130224-2245.tar.bz2/common/win32thread.c Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * win32thread.c: windows threading
  *****************************************************************************
- * Copyright (C) 2010-2012 x264 project
+ * Copyright (C) 2010-2013 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *          Pegasys Inc. <http://www.pegasys-inc.com>
@@ -62,7 +62,7 @@
 static unsigned __stdcall x264_win32thread_worker( void *arg )
 {
     x264_pthread_t *h = arg;
-    h->ret = h->func( h->arg );
+    *h->p_ret = h->func( h->arg );
     return 0;
 }
 
@@ -71,6 +71,8 @@
 {
     thread->func   = start_routine;
     thread->arg    = arg;
+    thread->p_ret  = &thread->ret;
+    thread->ret    = NULL;
     thread->handle = (void*)_beginthreadex( NULL, 0, x264_win32thread_worker, thread, 0, NULL );
     return !thread->handle;
 }
@@ -81,7 +83,7 @@
     if( ret != WAIT_OBJECT_0 )
         return -1;
     if( value_ptr )
-        *value_ptr = thread.ret;
+        *value_ptr = *thread.p_ret;
     CloseHandle( thread.handle );
     return 0;
 }

x264-snapshot-20120928-2245.tar.bz2/common/win32thread.h -> x264-snapshot-20130224-2245.tar.bz2/common/win32thread.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/bitstream-a.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/bitstream-a.asm Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/cabac-a.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/cabac-a.asm Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/const-a.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/const-a.asm Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/cpu-a.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/cpu-a.asm Changed

@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* cpu-a.asm: x86 cpu utilities
 ;*****************************************************************************
-;* Copyright (C) 2003-2012 x264 project
+;* Copyright (C) 2003-2013 x264 project
 ;*
 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Loren Merritt <lorenm@u.washington.edu>
@@ -139,3 +139,50 @@
     ldmxcsr [rsp]
     add   rsp, 4
     ret
+
+cextern intel_cpu_indicator_init
+
+;-----------------------------------------------------------------------------
+; void safe_intel_cpu_indicator_init( void );
+;-----------------------------------------------------------------------------
+cglobal safe_intel_cpu_indicator_init
+    push r0
+    push r1
+    push r2
+    push r3
+    push r4
+    push r5
+    push r6
+%if ARCH_X86_64
+    push r7
+    push r8
+    push r9
+    push r10
+    push r11
+    push r12
+    push r13
+    push r14
+%endif
+    push rbp
+    mov  rbp, rsp
+    and  rsp, ~15
+    call intel_cpu_indicator_init
+    leave
+%if ARCH_X86_64
+    pop r14
+    pop r13
+    pop r12
+    pop r11
+    pop r10
+    pop r9
+    pop r8
+    pop r7
+%endif
+    pop r6
+    pop r5
+    pop r4
+    pop r3
+    pop r2
+    pop r1
+    pop r0
+    ret

x264-snapshot-20120928-2245.tar.bz2/common/x86/dct-32.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/dct-32.asm Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/dct-64.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/dct-64.asm Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/dct-a.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/dct-a.asm Changed

@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* dct-a.asm: x86 transform and zigzag
 ;*****************************************************************************
-;* Copyright (C) 2003-2012 x264 project
+;* Copyright (C) 2003-2013 x264 project
 ;*
 ;* Authors: Holger Lubitz <holger@lubitz.org>
 ;*          Loren Merritt <lorenm@u.washington.edu>
@@ -555,7 +555,7 @@
     add         r0, 4*FDEC_STRIDEB
     dec         r2
     jg .loop
-    REP_RET
+    RET
 %endmacro ; ADD_IDCT_DC
 
 INIT_XMM sse2
@@ -664,7 +664,7 @@
     add       r0, FDEC_STRIDE*4
     dec       r2
     jg .loop
-    REP_RET
+    RET
 
 INIT_XMM sse2
 cglobal add16x16_idct_dc, 2,2,8
@@ -1274,7 +1274,7 @@
     mova       m1, [r1+ 4*SIZEOF_DCTCOEF]       ; 07 06 05 04
     mova       m2, [r1+ 8*SIZEOF_DCTCOEF]       ; 11 10 09 08
     pshuf%1    m3, m0, q3333                    ; 03 03 03 03
-    movd       r2, m2                           ; 09 08
+    movd      r2d, m2                           ; 09 08
     pshuf%1    m2, m2, q0321                    ; 08 11 10 09
     punpckl%2  m3, m1                           ; 05 03 04 03
     pinsr%1    m0, r2d, 3                       ; 08 02 01 00

x264-snapshot-20120928-2245.tar.bz2/common/x86/dct.h -> x264-snapshot-20130224-2245.tar.bz2/common/x86/dct.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/deblock-a.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/deblock-a.asm Changed

@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* deblock-a.asm: x86 deblocking
 ;*****************************************************************************
-;* Copyright (C) 2005-2012 x264 project
+;* Copyright (C) 2005-2013 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Jason Garrett-Glaser <darkshikari@gmail.com>
@@ -171,7 +171,7 @@
     %define bm  [rsp+mmsize*4]
     SUB        rsp, pad
     add         r1, r1
-    LOAD_AB     m4, m5, r2, r3
+    LOAD_AB     m4, m5, r2d, r3d
     mov         r3, 32/mmsize
     mov         r2, r0
     sub         r0, r1
@@ -227,7 +227,7 @@
     %define bm  [rsp+mmsize*6]
     SUB        rsp, pad
     add         r1, r1
-    LOAD_AB     m4, m5, r2, r3
+    LOAD_AB     m4, m5, r2d, r3d
     mov         r3, r1
     mova        am, m4
     add         r3, r1
@@ -355,7 +355,7 @@
     %define mask1 m10
     %define mask2 m11
     add         r1, r1
-    LOAD_AB    m12, m13, r2, r3
+    LOAD_AB    m12, m13, r2d, r3d
     mov         r2, r0
     sub         r0, r1
     sub         r0, r1
@@ -378,11 +378,11 @@
     add         r4, 2
     dec         r3
     jg .loop
-    REP_RET
+    RET
 
 cglobal deblock_h_luma, 5,7,15
     add         r1, r1
-    LOAD_AB    m12, m13, r2, r3
+    LOAD_AB    m12, m13, r2d, r3d
     mov         r2, r1
     add         r2, r1
     add         r2, r1
@@ -416,7 +416,7 @@
     lea         r5, [r5+r1*8]
     dec         r6
     jg .loop
-    REP_RET
+    RET
 %endmacro
 
 INIT_XMM sse2
@@ -650,7 +650,7 @@
     add     r4, mmsize
     dec     r6
     jg .loop
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1205,20 +1205,18 @@
 ;-----------------------------------------------------------------------------
 ; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma, 5,5
+cglobal deblock_%1_luma, 5,5,8,2*%2
     lea     r4, [r1*3]
     dec     r2     ; alpha-1
     neg     r4
     dec     r3     ; beta-1
     add     r4, r0 ; pix-3*stride
-    %assign pad 2*%2+12-(stack_offset&15)
-    SUB     esp, pad
 
     mova    m0, [r4+r1]   ; p1
     mova    m1, [r4+2*r1] ; p0
     mova    m2, [r0]      ; q0
     mova    m3, [r0+r1]   ; q1
-    LOAD_MASK r2, r3
+    LOAD_MASK r2d, r3d
 
     mov     r3, r4mp
     movd    m4, [r3] ; tc0
@@ -1251,22 +1249,19 @@
     DEBLOCK_P0_Q0
     mova    [r4+2*r1], m1
     mova    [r0], m2
-    ADD     esp, pad
     RET
 
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
-cglobal deblock_h_luma, 0,5
+cglobal deblock_h_luma, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
     mov    r0, r0mp
     mov    r3, r1m
     lea    r4, [r3*3]
     sub    r0, 4
     lea    r1, [r0+r4]
-    %assign pad 0x78-(stack_offset&15)
-    SUB    esp, pad
-%define pix_tmp esp+12
+    %define pix_tmp esp+12*HAVE_ALIGNED_STACK
 
     ; transpose 6x16 -> tmp space
     TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
@@ -1308,7 +1303,6 @@
     movq   m3, [pix_tmp+0x48]
     TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)
 
-    ADD    esp, pad
     RET
 %endmacro ; DEBLOCK_LUMA
 
@@ -1439,7 +1433,7 @@
     %define mpb_0 m14
     %define mpb_1 m15
 %else
-    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
+    %define spill(x) [esp+16*x]
     %define p2 [r4+r1]
     %define q2 [r0+2*r1]
     %define t4 spill(0)
@@ -1454,10 +1448,7 @@
 ;-----------------------------------------------------------------------------
 ; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma_intra, 4,6,16
-%if ARCH_X86_64 == 0
-    sub     esp, 0x60
-%endif
+cglobal deblock_%1_luma_intra, 4,6,16,ARCH_X86_64*0x50-0x50
     lea     r4, [r1*4]
     lea     r5, [r1*3] ; 3*stride
     dec     r2d        ; alpha-1
@@ -1506,10 +1497,7 @@
     LUMA_INTRA_SWAP_PQ
     LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
 .end:
-%if ARCH_X86_64 == 0
-    add     esp, 0x60
-%endif
-    RET
+    REP_RET
 
 INIT_MMX cpuname
 %if ARCH_X86_64
@@ -1545,12 +1533,10 @@
     add   rsp, 0x88
     RET
 %else
-cglobal deblock_h_luma_intra, 2,4
+cglobal deblock_h_luma_intra, 2,4,8,0x80
     lea    r3,  [r1*3]
     sub    r0,  4
     lea    r2,  [r0+r3]
-%assign pad 0x8c-(stack_offset&15)
-    SUB    rsp, pad
     %define pix_tmp rsp
 
     ; transpose 8x16 -> tmp space
@@ -1581,7 +1567,6 @@
     lea    r0,  [r0+r1*8]
     lea    r2,  [r2+r1*8]
     TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
-    ADD    rsp, pad
     RET
 %endif ; ARCH_X86_64
 %endmacro ; DEBLOCK_LUMA_INTRA
@@ -1675,7 +1660,7 @@
 
 %macro DEBLOCK_CHROMA 0
 cglobal deblock_inter_body
-    LOAD_AB     m4, m5, r2, r3
+    LOAD_AB     m4, m5, r2d, r3d
     LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
     pxor        m4, m4
     LOAD_TC     m6, r4
@@ -1702,7 +1687,7 @@
     add         r4, mmsize/8
     dec         r6
     jg .loop
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@@ -1721,11 +1706,11 @@
     add         r4, mmsize/8
     dec         r5
     jg .loop
-    REP_RET
+    RET
 
 
 cglobal deblock_intra_body
-    LOAD_AB     m4, m5, r2, r3
+    LOAD_AB     m4, m5, r2d, r3d
     LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
     CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
     ret
@@ -1736,7 +1721,7 @@
 cglobal deblock_v_chroma_intra, 4,6,8
     add         r1, r1
     mov         r5, 32/mmsize
-    movd        m5, r3
+    movd        m5, r3d
     mov         r4, r0
     sub         r0, r1
     sub         r0, r1
@@ -1749,7 +1734,7 @@
     add         r4, mmsize
     dec         r5
     jg .loop
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1767,7 +1752,7 @@
     lea         r0, [r0+r1*(mmsize/4)]
     dec         r4
     jg .loop
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1781,7 +1766,7 @@
     lea         r5, [r1*3]
 %endif
     CHROMA_H_LOAD r5
-    LOAD_AB     m4, m5, r2, r3
+    LOAD_AB     m4, m5, r2d, r3d
     LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
     CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
     CHROMA_H_STORE r5
@@ -1790,7 +1775,7 @@
     dec         r4
     jg .loop
 %endif
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@@ -1803,7 +1788,7 @@
 .loop:
 %endif
     CHROMA_H_LOAD r6
-    LOAD_AB     m4, m5, r2, r3
+    LOAD_AB     m4, m5, r2d, r3d
     LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
     movd      m6, [r4]
     punpcklbw m6, m6
@@ -1818,7 +1803,7 @@
     dec         r5
     jg .loop
 %endif
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1836,7 +1821,7 @@
     lea         r0, [r0+r1*(mmsize/4)]
     dec         r4
     jg .loop
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@@ -1847,7 +1832,7 @@
     lea         r6, [r1*3]
 .loop:
     CHROMA_H_LOAD r6
-    LOAD_AB     m4, m5, r2m, r3
+    LOAD_AB     m4, m5, r2m, r3d
     LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
     pxor        m4, m4
     movd        m6, [r4-1]
@@ -1867,7 +1852,7 @@
 %endif
     dec         r5
     jg .loop
-    REP_RET
+    RET
 %endmacro ; DEBLOCK_CHROMA
 
 %if ARCH_X86_64 == 0
@@ -2035,7 +2020,7 @@
     add   r4, mmsize/8
     dec   cntr
     jg .loop
-    REP_RET
+    RET
 %endmacro
 
 INIT_MMX mmx2
@@ -2116,7 +2101,7 @@
     lea   t5, [t5+r1*(mmsize/2)]
     dec  r6d
     jg .loop
-    REP_RET
+    RET
 %endmacro ; DEBLOCK_CHROMA_INTRA
 
 INIT_XMM sse2

x264-snapshot-20120928-2245.tar.bz2/common/x86/mc-a.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-a.asm Changed

@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* mc-a.asm: x86 motion compensation
 ;*****************************************************************************
-;* Copyright (C) 2003-2012 x264 project
+;* Copyright (C) 2003-2013 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Jason Garrett-Glaser <darkshikari@gmail.com>
@@ -87,7 +87,7 @@
     lea  t0, [t0+t1*2*SIZEOF_PIXEL]
     sub eax, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro
 
 %if HIGH_BIT_DEPTH
@@ -415,7 +415,7 @@
     lea  r2, [r2+r3*2]
     sub r5d, 2
     jg .loop
-    REP_RET
+    RET
 %endmacro
 
 INIT_MMX mmx2
@@ -495,7 +495,7 @@
     lea  r2, [r2+r3*2]
     sub r5d, 2
     jg .loop
-    REP_RET
+    RET
 %endmacro
 
 %macro OFFSETPN 1
@@ -672,7 +672,7 @@
     lea     r0, [r0+r1*4]
     sub    r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro
 
 %macro AVG2_W_TWO 3
@@ -707,7 +707,7 @@
     lea     r0, [r0+r1*4]
     sub    r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro
 
 INIT_MMX mmx2
@@ -745,7 +745,7 @@
     lea     r0, [r0+r1*2*2]
     sub    r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 
 cglobal pixel_avg2_w16_mmx2, 6,7
     sub     r4, r2
@@ -779,7 +779,7 @@
     lea     r0, [r0+r1*2*2]
     sub    r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 
 cglobal pixel_avg2_w18_mmx2, 6,7
     sub     r4, r2
@@ -803,7 +803,7 @@
     lea     r0, [r0+r1*2]
     dec    r5d
     jg .height_loop
-    REP_RET
+    RET
 
 INIT_XMM
 cglobal pixel_avg2_w18_sse2, 6,7,6
@@ -825,7 +825,7 @@
     lea     r0, [r0+r1*2]
     dec    r5d
     jg .height_loop
-    REP_RET
+    RET
 %endif ; HIGH_BIT_DEPTH
 
 %if HIGH_BIT_DEPTH == 0
@@ -849,7 +849,7 @@
     lea    r0, [r0+r1*2]
     sub    r5d, 2
     jg     .height_loop
-    REP_RET
+    RET
 %endmacro
 
 INIT_MMX
@@ -877,7 +877,7 @@
     lea    r0, [r0+r1*2]
     sub    r5d, 2
     jg     .height_loop
-    REP_RET
+    RET
 %endmacro
 
 AVG2_W16 12, movd
@@ -909,7 +909,7 @@
     lea    r0, [r0+r1*2]
     sub    r5d, 2
     jg     .height_loop
-    REP_RET
+    RET
 
 cglobal pixel_avg2_w16_sse2, 6,7
     sub    r4, r2
@@ -927,7 +927,7 @@
     lea    r0, [r0+r1*2]
     sub    r5d, 2
     jg     .height_loop
-    REP_RET
+    RET
 
 %macro AVG2_W20 1
 cglobal pixel_avg2_w20_%1, 6,7
@@ -959,7 +959,7 @@
     lea    r0, [r0+r1*2]
     sub    r5d, 2
     jg     .height_loop
-    REP_RET
+    RET
 %endmacro
 
 AVG2_W20 sse2
@@ -1022,7 +1022,7 @@
     add    r0, r1
     dec    r5d
     jg .height_loop
-    REP_RET
+    RET
 %endmacro
 
 %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
@@ -1226,7 +1226,7 @@
     lea     r0, [r0+r1*4]
     sub    r4d, 4
     jg .height_loop
-    REP_RET
+    RET
 %endif
 %endmacro
 
@@ -1506,7 +1506,7 @@
     add        r1, r2
     dec       r5d
     jg .loop2
-    REP_RET
+    RET
 
 %if mmsize==8
 .width4:
@@ -1626,11 +1626,11 @@
     dec       r5d
     jg .loop4
 %if mmsize!=8
-    REP_RET
+    RET
 %else
     sub dword r7m, 4
     jg .width8
-    REP_RET
+    RET
 .width8:
 %if ARCH_X86_64
     lea        r3, [t2+8*SIZEOF_PIXEL]
@@ -1766,7 +1766,7 @@
     add        r1, r2
     dec       r5d
     jg .loop1d_w4
-    REP_RET
+    RET
 .mc1d_w8:
     sub       r2, 4*SIZEOF_PIXEL
     sub       r4, 8*SIZEOF_PIXEL
@@ -1848,7 +1848,7 @@
     lea        r1, [r1+r2*2]
     sub       r5d, 2
     jg .loop4
-    REP_RET
+    RET
 
 .width8:
     movu       m0, [r3]
@@ -1909,7 +1909,7 @@
     lea        r1, [r1+r2*2]
     sub       r5d, 2
     jg .loop8
-    REP_RET
+    RET
 %endmacro
 
 %if HIGH_BIT_DEPTH

x264-snapshot-20120928-2245.tar.bz2/common/x86/mc-a2.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-a2.asm Changed

@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* mc-a2.asm: x86 motion compensation
 ;*****************************************************************************
-;* Copyright (C) 2005-2012 x264 project
+;* Copyright (C) 2005-2013 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Jason Garrett-Glaser <darkshikari@gmail.com>
@@ -210,7 +210,7 @@
     mova      [r0+r4+mmsize], m4
     add        r4, 2*mmsize
     jl .loop
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
@@ -259,7 +259,7 @@
     mova  [r0+r2], m1
     add        r2, mmsize
     jl .loop
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
@@ -302,7 +302,7 @@
     mova      [r0+r2+mmsize], m4
     add        r2, mmsize*2
     jl .loop
-    REP_RET
+    RET
 %endmacro ; HPEL_FILTER
 
 INIT_MMX mmx2
@@ -365,7 +365,7 @@
     add r5, mmsize
     add r4, mmsize
     jl .loop
-    REP_RET
+    RET
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -396,7 +396,7 @@
     movntq [r0+r2], m1
     add r2, 8
     jl .loop
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
@@ -440,7 +440,7 @@
     movntq     [r0+r2], m1
     add r2, 8
     jl .loop
-    REP_RET
+    RET
 
 INIT_XMM
 
@@ -510,7 +510,7 @@
     movntps [r0+r2], m4
     add r2, 16
     jl .loop
-    REP_RET
+    RET
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -559,7 +559,7 @@
     movntps    [r0+r2], m1
     add r2, 16
     jl .loop
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
@@ -600,7 +600,7 @@
     movntps [r0+r2], m3
     add r2, 16
     jl .loop
-    REP_RET
+    RET
 %endmacro
 
 INIT_MMX mmx2
@@ -1026,7 +1026,7 @@
     lea    r0, [r0+r1*2]
     sub   r4d, 2
     jg .loop
-    REP_RET
+    RET
 %endmacro ; PLANE_INTERLEAVE
 
 %macro DEINTERLEAVE_START 0
@@ -1068,7 +1068,7 @@
     add    r4, r5
     dec dword r7m
     jg .loopy
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
@@ -1083,7 +1083,7 @@
     lea    r1, [r1+r2*2]
     sub   r3d, 2
     jg .loop
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
@@ -1098,7 +1098,7 @@
     lea    r1, [r1+r2*2]
     sub   r3d, 2
     jg .loop
-    REP_RET
+    RET
 %endmacro ; PLANE_DEINTERLEAVE
 
 %if HIGH_BIT_DEPTH
@@ -1155,7 +1155,7 @@
     sub  r2d, 32
     jg .copy32
 .ret
-    REP_RET
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
@@ -1207,7 +1207,7 @@
 %endrep
     add r1, mmsize*8
     jl .loop
-    REP_RET
+    RET
 %endmacro
 
 INIT_MMX mmx
@@ -1239,7 +1239,7 @@
     movdqa  [r3+r2*2+16], m1
     add     r2, 16
     jl .loop
-    REP_RET
+    RET
 
 %macro INTEGRAL_INIT8H 0
 cglobal integral_init8h, 3,4
@@ -1263,7 +1263,7 @@
     movdqa  [r3+r2*2+16], m1
     add     r2, 16
     jl .loop
-    REP_RET
+    RET
 %endmacro
 
 INIT_XMM sse4
@@ -1290,7 +1290,7 @@
     mova  [r0+r1+mmsize], m1
     add   r1, 2*mmsize
     jl .loop
-    REP_RET
+    RET
 %endmacro
 
 INIT_MMX mmx
@@ -1321,7 +1321,7 @@
     mova  [r1+r2-8], m3
     sub   r2, 8
     jge .loop
-    REP_RET
+    RET
 
 INIT_XMM
 cglobal integral_init4v_sse2, 3,5
@@ -1347,7 +1347,7 @@
     mova  [r1+r2], m3
     add     r2, 16
     jl .loop
-    REP_RET
+    RET
 
 cglobal integral_init4v_ssse3, 3,5
     shl     r2, 1
@@ -1372,7 +1372,7 @@
     mova  [r1+r2], m3
     add     r2, 16
     jl .loop
-    REP_RET
+    RET
 
 %macro FILT8x4 7
     mova      %3, [r0+%7]
@@ -1702,7 +1702,7 @@
 %if cpuflag(fma4)
     cvtdq2ps  xmm0, xmm0
     cvtdq2ps  xmm1, xmm1
-    vfmaddps  xmm0, xmm0, xmm6, xmm1
+    fmaddps   xmm0, xmm0, xmm6, xmm1
     cvtdq2ps  xmm1, xmm2
     psubd     xmm2, xmm3
     cvtdq2ps  xmm2, xmm2
@@ -1710,7 +1710,7 @@
     mulps     xmm1, xmm3
     mulps     xmm0, xmm2
     addps     xmm2, xmm3, xmm3
-    vfnmaddps xmm3, xmm1, xmm3, xmm2
+    fnmaddps  xmm3, xmm1, xmm3, xmm2
     mulps     xmm0, xmm3
 %else
     cvtdq2ps  xmm0, xmm0
@@ -1732,7 +1732,7 @@
     movdqa [r0+r6*2], xmm0
     add         r6, 8
     jl .loop
-    REP_RET
+    RET
 %endmacro
 
 INIT_XMM sse2
@@ -1742,14 +1742,18 @@
 MBTREE
 
 %macro INT16_TO_FLOAT 1
+%if cpuflag(avx2)
+    vpmovzxwd   ymm%1, xmm%1
+%else
     vpunpckhwd   xmm4, xmm%1, xmm7
     vpunpcklwd  xmm%1, xmm7
     vinsertf128 ymm%1, ymm%1, xmm4, 1
+%endif
     vcvtdq2ps   ymm%1, ymm%1
 %endmacro
 
 ; FIXME: align loads/stores to 16 bytes
-INIT_YMM avx
+%macro MBTREE_AVX 0
 cglobal mbtree_propagate_cost, 7,7,8
     add           r6d, r6d
     lea            r0, [r0+r6*2]
@@ -1761,7 +1765,9 @@
     vmovdqa      xmm5, [pw_3fff]
     vbroadcastss ymm6, [r5]
     vmulps       ymm6, ymm6, [pf_inv256]
+%if notcpuflag(avx2)
     vpxor        xmm7, xmm7
+%endif
 .loop:
     vmovdqu      xmm0, [r2+r6]       ; intra
     vmovdqu      xmm1, [r4+r6]       ; invq
@@ -1771,6 +1777,17 @@
     INT16_TO_FLOAT 1
     INT16_TO_FLOAT 2
     INT16_TO_FLOAT 3
+%if cpuflag(fma3)
+    vmulps       ymm1, ymm1, ymm0
+    vsubps       ymm4, ymm0, ymm3
+    fmaddps      ymm1, ymm1, ymm6, ymm2
+    vrcpps       ymm3, ymm0
+    vmulps       ymm2, ymm0, ymm3
+    vmulps       ymm1, ymm1, ymm4
+    vaddps       ymm4, ymm3, ymm3
+    fnmaddps     ymm4, ymm2, ymm3, ymm4
+    vmulps       ymm1, ymm1, ymm4
+%else
     vmulps       ymm1, ymm1, ymm0
     vsubps       ymm4, ymm0, ymm3
     vmulps       ymm1, ymm1, ymm6    ; intra*invq*fps_factor>>8
@@ -1782,8 +1799,15 @@
     vaddps       ymm3, ymm3, ymm3    ; 2 * (1/intra 1st approx)
     vsubps       ymm3, ymm3, ymm2    ; 2nd approximation for 1/intra
     vmulps       ymm1, ymm1, ymm3    ; / intra
+%endif
     vcvtps2dq    ymm1, ymm1
     vmovdqu [r0+r6*2], ymm1
     add            r6, 16
     jl .loop
-    REP_RET
+    RET
+%endmacro
+
+INIT_YMM avx
+MBTREE_AVX
+INIT_YMM avx2,fma3
+MBTREE_AVX

x264-snapshot-20120928-2245.tar.bz2/common/x86/mc-c.c -> x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-c.c Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * mc-c.c: x86 motion compensation
  *****************************************************************************
- * Copyright (C) 2003-2012 x264 project
+ * Copyright (C) 2003-2013 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
@@ -139,6 +139,8 @@
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx2_fma3( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+                                           uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 
 #define MC_CHROMA(cpu)\
 void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
@@ -754,7 +756,12 @@
         return;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
 
-    if( !(cpu&X264_CPU_FMA4) )
+    if( cpu&X264_CPU_FMA4 )
+        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
+
+    if( !(cpu&X264_CPU_AVX2) )
         return;
-    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
+
+    if( cpu&X264_CPU_FMA3 )
+        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3;
 }

x264-snapshot-20120928-2245.tar.bz2/common/x86/mc.h -> x264-snapshot-20130224-2245.tar.bz2/common/x86/mc.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/pixel-32.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/pixel-32.asm Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/pixel-a.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/pixel-a.asm Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/pixel.h -> x264-snapshot-20130224-2245.tar.bz2/common/x86/pixel.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/predict-a.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/predict-a.asm Changed

@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* predict-a.asm: x86 intra prediction
 ;*****************************************************************************
-;* Copyright (C) 2005-2012 x264 project
+;* Copyright (C) 2005-2013 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Holger Lubitz <holger@lubitz.org>
@@ -807,7 +807,7 @@
     psrlw       m0, 4
     SPLATW      m0, m0
     STORE8x8    m0, m0
-    REP_RET
+    RET
 
 %else ; !HIGH_BIT_DEPTH
 INIT_MMX mmx2
@@ -1103,7 +1103,7 @@
     add         r0, FDEC_STRIDE
     dec         r1d
     jg .loop
-    REP_RET
+    RET
 %endmacro ; PREDICT_CHROMA_P_MMX
 
 INIT_MMX mmx2
@@ -1140,7 +1140,7 @@
     add         r0, FDEC_STRIDEB
     dec        r1d
     jg .loop
-    REP_RET
+    RET
 %else ; !HIGH_BIT_DEPTH
 cglobal predict_8x%1c_p_core, 1,2
     movd        m0, r1m
@@ -1225,7 +1225,7 @@
     add         r0, FDEC_STRIDE
     dec         r1d
     jg          .loop
-    REP_RET
+    RET
 %endif ; !ARCH_X86_64
 
 %macro PREDICT_16x16_P 0
@@ -1282,7 +1282,7 @@
     dec      r1d
     jg       .loop
 %endif ; !HIGH_BIT_DEPTH
-    REP_RET
+    RET
 %endmacro ; PREDICT_16x16_P
 
 INIT_XMM sse2
@@ -1996,20 +1996,20 @@
     mova        m2, [r0 - FDEC_STRIDEB+16]
     mova        m3, [r0 - FDEC_STRIDEB+24]
     STORE16x16  m0, m1, m2, m3
-    REP_RET
+    RET
 INIT_XMM
 cglobal predict_16x16_v_sse2, 2,2
     mova      m0, [r0 - FDEC_STRIDEB+ 0]
     mova      m1, [r0 - FDEC_STRIDEB+16]
     STORE16x16_SSE2 m0, m1
-    REP_RET
+    RET
 %else ; !HIGH_BIT_DEPTH
 INIT_MMX
 cglobal predict_16x16_v_mmx2, 1,2
     movq        m0, [r0 - FDEC_STRIDE + 0]
     movq        m1, [r0 - FDEC_STRIDE + 8]
     STORE16x16  m0, m1
-    REP_RET
+    RET
 INIT_XMM
 cglobal predict_16x16_v_sse2, 1,1
     movdqa      xmm0, [r0 - FDEC_STRIDE]
@@ -2055,7 +2055,7 @@
 %endif ; HIGH_BIT_DEPTH
     sub r1, 4*FDEC_STRIDEB
     jge .vloop
-    REP_RET
+    RET
 %endmacro
 
 INIT_MMX mmx2
@@ -2106,12 +2106,12 @@
 %else
     PRED16x16_DC r1m, 5
 %endif
-    REP_RET
+    RET
 
 INIT_MMX mmx2
 cglobal predict_16x16_dc_top, 1,2
     PRED16x16_DC [pw_8], 4
-    REP_RET
+    RET
 
 INIT_MMX mmx2
 %if HIGH_BIT_DEPTH
@@ -2119,14 +2119,14 @@
     movd       m0, r1m
     SPLATW     m0, m0
     STORE16x16 m0, m0, m0, m0
-    REP_RET
+    RET
 %else ; !HIGH_BIT_DEPTH
 cglobal predict_16x16_dc_left_core, 1,1
     movd       m0, r1m
     pshufw     m0, m0, 0
     packuswb   m0, m0
     STORE16x16 m0, m0
-    REP_RET
+    RET
 %endif
 
 ;-----------------------------------------------------------------------------
@@ -2159,11 +2159,11 @@
 cglobal predict_16x16_dc_core, 2,2,4
     movd       m3, r1m
     PRED16x16_DC_SSE2 m3, 5
-    REP_RET
+    RET
 
 cglobal predict_16x16_dc_top, 1,2
     PRED16x16_DC_SSE2 [pw_8], 4
-    REP_RET
+    RET
 
 INIT_XMM sse2
 %if HIGH_BIT_DEPTH
@@ -2171,7 +2171,7 @@
     movd       m0, r1m
     SPLATW     m0, m0
     STORE16x16_SSE2 m0, m0
-    REP_RET
+    RET
 %else ; !HIGH_BIT_DEPTH
 cglobal predict_16x16_dc_left_core, 1,1
     movd       m0, r1m

x264-snapshot-20120928-2245.tar.bz2/common/x86/predict-c.c -> x264-snapshot-20130224-2245.tar.bz2/common/x86/predict-c.c Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/predict.h -> x264-snapshot-20130224-2245.tar.bz2/common/x86/predict.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/quant-a.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/quant-a.asm Changed

@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* quant-a.asm: x86 quantization and level-run
 ;*****************************************************************************
-;* Copyright (C) 2005-2012 x264 project
+;* Copyright (C) 2005-2013 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Jason Garrett-Glaser <darkshikari@gmail.com>
@@ -416,7 +416,7 @@
     %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
     sub t0d, 16*%3
     jge %%loop
-    REP_RET
+    RET
 %else
     %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
     %1 [r0+(0   )*SIZEOF_PIXEL], [r1+0    ], [r1+ 8*%3]
@@ -738,7 +738,7 @@
     PSIGND    m5, m2, m1
     test     t3d, t3d
     jnz .outer_loop_0
-    REP_RET
+    RET
 %endmacro
 
 %if HIGH_BIT_DEPTH == 0
@@ -783,7 +783,7 @@
     mova      [r1+r3*4-1*mmsize], m5
     sub       r3, mmsize/2
     jg .loop
-    REP_RET
+    RET
 %endmacro
 
 %if ARCH_X86_64 == 0
@@ -831,7 +831,7 @@
     mova      [r1+r3*4-1*mmsize], m1
     sub       r3, mmsize
     jg .loop
-    REP_RET
+    RET
 %endmacro
 
 %if ARCH_X86_64 == 0
@@ -954,7 +954,7 @@
     jne  .loop
 %endif
 .ret:
-    RET
+    REP_RET
 .ret9:
     mov   eax, 9
     RET
@@ -1066,7 +1066,7 @@
 .tryret:
     xor   r4, -1
     jne  .cont
-    REP_RET
+    RET
 .ret9:
     mov   eax, 9
     RET
@@ -1077,7 +1077,7 @@
     shr   r3, cl
     shr   r3, 1
     jne  .loop
-    REP_RET
+    RET
 %endif ; ARCH
 
 %endmacro
@@ -1381,7 +1381,7 @@
     inc    t6d
     sub    t4d, t3d
     jge .loop
-    REP_RET
+    RET
 %endmacro
 
 INIT_MMX mmx2

x264-snapshot-20120928-2245.tar.bz2/common/x86/quant.h -> x264-snapshot-20130224-2245.tar.bz2/common/x86/quant.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/sad-a.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/sad-a.asm Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/sad16-a.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/sad16-a.asm Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/trellis-64.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/trellis-64.asm Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/util.h -> x264-snapshot-20130224-2245.tar.bz2/common/x86/util.h Changed

x264-snapshot-20120928-2245.tar.bz2/common/x86/x86inc.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/x86inc.asm Changed

@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* x86inc.asm: x264asm abstraction layer
 ;*****************************************************************************
-;* Copyright (C) 2005-2012 x264 project
+;* Copyright (C) 2005-2013 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Anton Mitrofanov <BugMaster@narod.ru>
@@ -34,7 +34,9 @@
 ; as this feature might be useful for others as well.  Send patches or ideas
 ; to x264-devel@videolan.org .
 
-%define program_name x264
+%ifndef program_name
+    %define program_name x264
+%endif
 
 %define WIN64  0
 %define UNIX64 0
@@ -103,7 +105,12 @@
 ; %1 = number of arguments. loads them from stack if needed.
 ; %2 = number of registers used. pushes callee-saved regs if needed.
 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
-; %4 = list of names to define to registers
+; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x,
+;      MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes),
+;      and an extra register will be allocated to hold the original stack
+;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
+;      register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define to registers
 ; PROLOGUE can also be invoked by adding the same options to cglobal
 
 ; e.g.
@@ -118,8 +125,7 @@
 ; Pops anything that was pushed by PROLOGUE, and returns.
 
 ; REP_RET:
-; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
-; which are slow when a normal ret follows a branch.
+; Use this instead of RET if it's a branch target.
 
 ; registers:
 ; rN and rNq are the native-size register holding function argument N
@@ -138,11 +144,11 @@
         %define r%1m  %2d
         %define r%1mp %2
     %elif ARCH_X86_64 ; memory
-        %define r%1m [rsp + stack_offset + %3]
-        %define r%1mp qword r %+ %1m
+        %define r%1m [rstk + stack_offset + %3]
+        %define r%1mp qword r %+ %1 %+ m
     %else
-        %define r%1m [esp + stack_offset + %3]
-        %define r%1mp dword r %+ %1m
+        %define r%1m [rstk + stack_offset + %3]
+        %define r%1mp dword r %+ %1 %+ m
     %endif
     %define r%1  %2
 %endmacro
@@ -203,12 +209,16 @@
 
 %macro PUSH 1
     push %1
-    %assign stack_offset stack_offset+gprsize
+    %ifidn rstk, rsp
+        %assign stack_offset stack_offset+gprsize
+    %endif
 %endmacro
 
 %macro POP 1
     pop %1
-    %assign stack_offset stack_offset-gprsize
+    %ifidn rstk, rsp
+        %assign stack_offset stack_offset-gprsize
+    %endif
 %endmacro
 
 %macro PUSH_IF_USED 1-*
@@ -240,14 +250,14 @@
 
 %macro SUB 2
     sub %1, %2
-    %ifidn %1, rsp
+    %ifidn %1, rstk
         %assign stack_offset stack_offset+(%2)
     %endif
 %endmacro
 
 %macro ADD 2
     add %1, %2
-    %ifidn %1, rsp
+    %ifidn %1, rstk
         %assign stack_offset stack_offset-(%2)
     %endif
 %endmacro
@@ -305,6 +315,79 @@
     %assign n_arg_names %0
 %endmacro
 
+%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
+    %ifnum %1
+        %if %1 != 0
+            %assign %%stack_alignment ((mmsize + 15) & ~15)
+            %assign stack_size %1
+            %if stack_size < 0
+                %assign stack_size -stack_size
+            %endif
+            %if mmsize != 8
+                %assign xmm_regs_used %2
+            %endif
+            %if mmsize <= 16 && HAVE_ALIGNED_STACK
+                %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
+                %if xmm_regs_used > 6
+                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
+                %endif
+                SUB rsp, stack_size_padded
+            %else
+                %assign %%reg_num (regs_used - 1)
+                %xdefine rstk r %+ %%reg_num
+                ; align stack, and save original stack location directly above
+                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
+                ; stack in a single instruction (i.e. mov rsp, rstk or mov
+                ; rsp, [rsp+stack_size_padded])
+                mov  rstk, rsp
+                %assign stack_size_padded stack_size
+                %if xmm_regs_used > 6
+                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
+                    %if mmsize == 32 && xmm_regs_used & 1
+                        ; re-align to 32 bytes
+                        %assign stack_size_padded (stack_size_padded + 16)
+                    %endif
+                %endif
+                %if %1 < 0 ; need to store rsp on stack
+                    sub  rsp, gprsize+stack_size_padded
+                    and  rsp, ~(%%stack_alignment-1)
+                    %xdefine rstkm [rsp+stack_size_padded]
+                    mov rstkm, rstk
+                %else ; can keep rsp in rstk during whole function
+                    sub  rsp, stack_size_padded
+                    and  rsp, ~(%%stack_alignment-1)
+                    %xdefine rstkm rstk
+                %endif
+            %endif
+            %if xmm_regs_used > 6
+                WIN64_PUSH_XMM
+            %endif
+        %endif
+    %endif
+%endmacro
+
+%macro SETUP_STACK_POINTER 1
+    %ifnum %1
+        %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
+            %if %1 > 0
+                %assign regs_used (regs_used + 1)
+            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
+                %warning "Stack pointer will overwrite register argument"
+            %endif
+        %endif
+    %endif
+%endmacro
+
+%macro DEFINE_ARGS_INTERNAL 3+
+    %ifnum %2
+        DEFINE_ARGS %3
+    %elif %1 == 4
+        DEFINE_ARGS %2
+    %elif %1 > 4
+        DEFINE_ARGS %2, %3
+    %endif
+%endmacro
+
 %if WIN64 ; Windows x64 ;=================================================
 
 DECLARE_REG 0,  rcx
@@ -323,19 +406,27 @@
 DECLARE_REG 13, R14, 112
 DECLARE_REG 14, R15, 120
 
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
     ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4
     ASSERT regs_used <= 15
     PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
-    %if mmsize == 8
-        %assign xmm_regs_used 0
-    %else
+    ALLOC_STACK %4, %3
+    %if mmsize != 8 && stack_size == 0
         WIN64_SPILL_XMM %3
     %endif
     LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
-    DEFINE_ARGS %4
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%macro WIN64_PUSH_XMM 0
+    %assign %%i xmm_regs_used
+    %rep (xmm_regs_used-6)
+        %assign %%i %%i-1
+        movdqa [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
+    %endrep
 %endmacro
 
 %macro WIN64_SPILL_XMM 1
@@ -343,11 +434,7 @@
     ASSERT xmm_regs_used <= 16
     %if xmm_regs_used > 6
         SUB rsp, (xmm_regs_used-6)*16+16
-        %assign %%i xmm_regs_used
-        %rep (xmm_regs_used-6)
-            %assign %%i %%i-1
-            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
-        %endrep
+        WIN64_PUSH_XMM
     %endif
 %endmacro
 
@@ -356,19 +443,28 @@
         %assign %%i xmm_regs_used
         %rep (xmm_regs_used-6)
             %assign %%i %%i-1
-            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
         %endrep
-        add %1, (xmm_regs_used-6)*16+16
+        %if stack_size_padded == 0
+            add %1, (xmm_regs_used-6)*16+16
+        %endif
+    %endif
+    %if stack_size_padded > 0
+        %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
+            mov rsp, rstkm
+        %else
+            add %1, stack_size_padded
+        %endif
     %endif
 %endmacro
 
 %macro WIN64_RESTORE_XMM 1
     WIN64_RESTORE_XMM_INTERNAL %1
-    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+    %assign stack_offset (stack_offset-stack_size_padded)
     %assign xmm_regs_used 0
 %endmacro
 
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
 
 %macro RET 0
     WIN64_RESTORE_XMM_INTERNAL rsp
@@ -376,7 +472,7 @@
 %if mmsize == 32
     vzeroupper
 %endif
-    ret
+    AUTO_REP_RET
 %endmacro
 
 %elif ARCH_X86_64 ; *nix x64 ;=============================================
@@ -397,24 +493,33 @@
 DECLARE_REG 13, R14, 64
 DECLARE_REG 14, R15, 72
 
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
     ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4
     ASSERT regs_used <= 15
     PUSH_IF_USED 9, 10, 11, 12, 13, 14
+    ALLOC_STACK %4
     LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
-    DEFINE_ARGS %4
+    DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
-%define has_epilogue regs_used > 9 || mmsize == 32
+%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
 
 %macro RET 0
+%if stack_size_padded > 0
+%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+    mov rsp, rstkm
+%else
+    add rsp, stack_size_padded
+%endif
+%endif
     POP_IF_USED 14, 13, 12, 11, 10, 9
 %if mmsize == 32
     vzeroupper
 %endif
-    ret
+    AUTO_REP_RET
 %endmacro
 
 %else ; X86_32 ;==============================================================
@@ -430,7 +535,7 @@
 
 %macro DECLARE_ARG 1-*
     %rep %0
-        %define r%1m [esp + stack_offset + 4*%1 + 4]
+        %define r%1m [rstk + stack_offset + 4*%1 + 4]
         %define r%1mp dword r%1m
         %rotate 1
     %endrep
@@ -438,26 +543,39 @@
 
 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
+    ASSERT regs_used >= num_args
+    %if num_args > 7
+        %assign num_args 7
+    %endif
     %if regs_used > 7
         %assign regs_used 7
     %endif
-    ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4
+    ASSERT regs_used <= 7
     PUSH_IF_USED 3, 4, 5, 6
+    ALLOC_STACK %4
     LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
-    DEFINE_ARGS %4
+    DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
-%define has_epilogue regs_used > 3 || mmsize == 32
+%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
 
 %macro RET 0
+%if stack_size_padded > 0
+%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+    mov rsp, rstkm
+%else
+    add rsp, stack_size_padded
+%endif
+%endif
     POP_IF_USED 6, 5, 4, 3
 %if mmsize == 32
     vzeroupper
 %endif
-    ret
+    AUTO_REP_RET
 %endmacro
 
 %endif ;======================================================================
@@ -467,8 +585,14 @@
 %endmacro
 %macro WIN64_RESTORE_XMM 1
 %endmacro
+%macro WIN64_PUSH_XMM 0
+%endmacro
 %endif
 
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
 %macro REP_RET 0
     %if has_epilogue
         RET
@@ -477,6 +601,29 @@
     %endif
 %endmacro
 
+%define last_branch_adr $$
+%macro AUTO_REP_RET 0
+    %ifndef cpuflags
+        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
+    %elif notcpuflag(ssse3)
+        times ((last_branch_adr-$)>>31)+1 rep
+    %endif
+    ret
+%endmacro
+
+%macro BRANCH_INSTR 0-*
+    %rep %0
+        %macro %1 1-2 %1
+            %2 %1
+            %%branch_instr:
+            %xdefine last_branch_adr %%branch_instr
+        %endmacro
+        %rotate 1
+    %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
 %macro TAIL_CALL 2 ; callee, is_nonadjacent
     %if has_epilogue
         call %1
@@ -496,12 +643,10 @@
 ; Applies any symbol mangling needed for C linkage, and sets up a define such that
 ; subsequent uses of the function name automatically refer to the mangled version.
 ; Appends cpuflags to the function name if cpuflags has been specified.
-%macro cglobal 1-2+ ; name, [PROLOGUE args]
-%if %0 == 1
-    cglobal_internal %1 %+ SUFFIX
-%else
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+    ; the "" is a workaround for nasm, which fails if SUFFIX is empty
+    ; and we call cglobal_internal with just %1 %+ SUFFIX (without %2)
     cglobal_internal %1 %+ SUFFIX, %2
-%endif
 %endmacro
 %macro cglobal_internal 1-2+
     %ifndef cglobaled_%1
@@ -518,8 +663,12 @@
     align function_align
     %1:
     RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+    %xdefine rstk rsp
     %assign stack_offset 0
-    %if %0 > 1
+    %assign stack_size 0
+    %assign stack_size_padded 0
+    %assign xmm_regs_used 0
+    %ifnidn %2, ""
         PROLOGUE %2
     %endif
 %endmacro
@@ -554,7 +703,7 @@
 %assign cpuflags_mmx      (1<<0)
 %assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
 %assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
-%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
 %assign cpuflags_sse      (1<<4) | cpuflags_mmx2
 %assign cpuflags_sse2     (1<<5) | cpuflags_sse
 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
@@ -597,7 +746,7 @@
         %if cpuflag(avx)
             %assign avx_enabled 1
         %endif
-        %if mmsize == 16 && notcpuflag(sse2)
+        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
             %define mova movaps
             %define movu movups
             %define movnta movntps
@@ -676,10 +825,10 @@
     %if ARCH_X86_64
     %define num_mmregs 16
     %endif
-    %define mova vmovaps
-    %define movu vmovups
+    %define mova movdqa
+    %define movu movdqu
     %undef movh
-    %define movnta vmovntps
+    %define movnta movntdq
     %assign %%i 0
     %rep num_mmregs
     CAT_XDEFINE m, %%i, ymm %+ %%i
@@ -840,101 +989,107 @@
 
 ;%1 == instruction
 ;%2 == 1 if float, 0 if int
-;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
-;%4 == number of operands given
+;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
 ;%5+: operands
-%macro RUN_AVX_INSTR 6-7+
-    %ifid %6
-        %define %%sizeofreg sizeof%6
-    %elifid %5
-        %define %%sizeofreg sizeof%5
+%macro RUN_AVX_INSTR 5-8+
+    %ifnum sizeof%6
+        %assign %%sizeofreg sizeof%6
+    %elifnum sizeof%5
+        %assign %%sizeofreg sizeof%5
     %else
-        %define %%sizeofreg mmsize
+        %assign %%sizeofreg mmsize
     %endif
-    %if %%sizeofreg==32
-        %if %4>=3
-            v%1 %5, %6, %7
-        %else
-            v%1 %5, %6
-        %endif
+    %assign %%emulate_avx 0
+    %if avx_enabled && %%sizeofreg >= 16
+        %xdefine %%instr v%1
     %else
-        %if %%sizeofreg==8
-            %define %%regmov movq
-        %elif %2
-            %define %%regmov movaps
-        %else
-            %define %%regmov movdqa
+        %xdefine %%instr %1
+        %if %0 >= 7+%3
+            %assign %%emulate_avx 1
         %endif
+    %endif
 
-        %if %4>=3+%3
-            %ifnidn %5, %6
-                %if avx_enabled && %%sizeofreg==16
-                    v%1 %5, %6, %7
-                %else
-                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
-                    %%regmov %5, %6
-                    %1 %5, %7
+    %if %%emulate_avx
+        %xdefine %%src1 %6
+        %xdefine %%src2 %7
+        %ifnidn %5, %6
+            %if %0 >= 8
+                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8
+            %else
+                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
+            %endif
+            %if %4 && %3 == 0
+                %ifnid %7
+                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
+                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+                    ; So, if the instruction is commutative with a memory arg, swap them.
+                    %xdefine %%src1 %7
+                    %xdefine %%src2 %6
                 %endif
+            %endif
+            %if %%sizeofreg == 8
+                MOVQ %5, %%src1
+            %elif %2
+                MOVAPS %5, %%src1
             %else
-                %1 %5, %7
+                MOVDQA %5, %%src1
             %endif
-        %elif %4>=3
-            %1 %5, %6, %7
-        %else
-            %1 %5, %6
-        %endif
-    %endif
-%endmacro
-
-; 3arg AVX ops with a memory arg can only have it in src2,
-; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
-; So, if the op is symmetric and the wrong one is memory, swap them.
-%macro RUN_AVX_INSTR1 8
-    %assign %%swap 0
-    %if avx_enabled
-        %ifnid %6
-            %assign %%swap 1
         %endif
-    %elifnidn %5, %6
-        %ifnid %7
-            %assign %%swap 1
+        %if %0 >= 8
+            %1 %5, %%src2, %8
+        %else
+            %1 %5, %%src2
         %endif
-    %endif
-    %if %%swap && %3 == 0 && %8 == 1
-        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
+    %elif %0 >= 8
+        %%instr %5, %6, %7, %8
+    %elif %0 == 7
+        %%instr %5, %6, %7
+    %elif %0 == 6
+        %%instr %5, %6
     %else
-        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
+        %%instr %5
     %endif
 %endmacro
 
 ;%1 == instruction
 ;%2 == 1 if float, 0 if int
-;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
-;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
-%macro AVX_INSTR 4
-    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
-        %ifidn %3, fnord
-            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-4 0, 1, 0
+    %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4
+        %ifidn %2, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %1
+        %elifidn %3, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2
         %elifidn %4, fnord
-            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
+            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3
         %elifidn %5, fnord
-            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4
         %else
-            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5
         %endif
     %endmacro
 %endmacro
 
+; Instructions with both VEX and non-VEX encodings
+; Non-destructive instructions are written without parameters
 AVX_INSTR addpd, 1, 0, 1
 AVX_INSTR addps, 1, 0, 1
 AVX_INSTR addsd, 1, 0, 1
 AVX_INSTR addss, 1, 0, 1
 AVX_INSTR addsubpd, 1, 0, 0
 AVX_INSTR addsubps, 1, 0, 0
-AVX_INSTR andpd, 1, 0, 1
-AVX_INSTR andps, 1, 0, 1
+AVX_INSTR aesdec, 0, 0, 0
+AVX_INSTR aesdeclast, 0, 0, 0
+AVX_INSTR aesenc, 0, 0, 0
+AVX_INSTR aesenclast, 0, 0, 0
+AVX_INSTR aesimc
+AVX_INSTR aeskeygenassist
 AVX_INSTR andnpd, 1, 0, 0
 AVX_INSTR andnps, 1, 0, 0
+AVX_INSTR andpd, 1, 0, 1
+AVX_INSTR andps, 1, 0, 1
 AVX_INSTR blendpd, 1, 0, 0
 AVX_INSTR blendps, 1, 0, 0
 AVX_INSTR blendvpd, 1, 0, 0
@@ -943,18 +1098,39 @@
 AVX_INSTR cmpps, 1, 0, 0
 AVX_INSTR cmpsd, 1, 0, 0
 AVX_INSTR cmpss, 1, 0, 0
-AVX_INSTR cvtdq2ps, 1, 0, 0
-AVX_INSTR cvtps2dq, 1, 0, 0
+AVX_INSTR comisd
+AVX_INSTR comiss
+AVX_INSTR cvtdq2pd
+AVX_INSTR cvtdq2ps
+AVX_INSTR cvtpd2dq
+AVX_INSTR cvtpd2ps
+AVX_INSTR cvtps2dq
+AVX_INSTR cvtps2pd
+AVX_INSTR cvtsd2si
+AVX_INSTR cvtsd2ss
+AVX_INSTR cvtsi2sd
+AVX_INSTR cvtsi2ss
+AVX_INSTR cvtss2sd
+AVX_INSTR cvtss2si
+AVX_INSTR cvttpd2dq
+AVX_INSTR cvttps2dq
+AVX_INSTR cvttsd2si
+AVX_INSTR cvttss2si
 AVX_INSTR divpd, 1, 0, 0
 AVX_INSTR divps, 1, 0, 0
 AVX_INSTR divsd, 1, 0, 0
 AVX_INSTR divss, 1, 0, 0
 AVX_INSTR dppd, 1, 1, 0
 AVX_INSTR dpps, 1, 1, 0
+AVX_INSTR extractps
 AVX_INSTR haddpd, 1, 0, 0
 AVX_INSTR haddps, 1, 0, 0
 AVX_INSTR hsubpd, 1, 0, 0
 AVX_INSTR hsubps, 1, 0, 0
+AVX_INSTR insertps, 1, 1, 0
+AVX_INSTR lddqu
+AVX_INSTR ldmxcsr
+AVX_INSTR maskmovdqu
 AVX_INSTR maxpd, 1, 0, 1
 AVX_INSTR maxps, 1, 0, 1
 AVX_INSTR maxsd, 1, 0, 1
@@ -963,10 +1139,31 @@
 AVX_INSTR minps, 1, 0, 1
 AVX_INSTR minsd, 1, 0, 1
 AVX_INSTR minss, 1, 0, 1
+AVX_INSTR movapd
+AVX_INSTR movaps
+AVX_INSTR movd
+AVX_INSTR movddup
+AVX_INSTR movdqa
+AVX_INSTR movdqu
 AVX_INSTR movhlps, 1, 0, 0
+AVX_INSTR movhpd, 1, 0, 0
+AVX_INSTR movhps, 1, 0, 0
 AVX_INSTR movlhps, 1, 0, 0
+AVX_INSTR movlpd, 1, 0, 0
+AVX_INSTR movlps, 1, 0, 0
+AVX_INSTR movmskpd
+AVX_INSTR movmskps
+AVX_INSTR movntdq
+AVX_INSTR movntdqa
+AVX_INSTR movntpd
+AVX_INSTR movntps
+AVX_INSTR movq
 AVX_INSTR movsd, 1, 0, 0
+AVX_INSTR movshdup
+AVX_INSTR movsldup
 AVX_INSTR movss, 1, 0, 0
+AVX_INSTR movupd
+AVX_INSTR movups
 AVX_INSTR mpsadbw, 0, 1, 0
 AVX_INSTR mulpd, 1, 0, 1
 AVX_INSTR mulps, 1, 0, 1
@@ -974,9 +1171,9 @@
 AVX_INSTR mulss, 1, 0, 1
 AVX_INSTR orpd, 1, 0, 1
 AVX_INSTR orps, 1, 0, 1
-AVX_INSTR pabsb, 0, 0, 0
-AVX_INSTR pabsw, 0, 0, 0
-AVX_INSTR pabsd, 0, 0, 0
+AVX_INSTR pabsb
+AVX_INSTR pabsd
+AVX_INSTR pabsw
 AVX_INSTR packsswb, 0, 0, 0
 AVX_INSTR packssdw, 0, 0, 0
 AVX_INSTR packuswb, 0, 0, 0
@@ -996,10 +1193,11 @@
 AVX_INSTR pavgw, 0, 0, 1
 AVX_INSTR pblendvb, 0, 0, 0
 AVX_INSTR pblendw, 0, 1, 0
-AVX_INSTR pcmpestri, 0, 0, 0
-AVX_INSTR pcmpestrm, 0, 0, 0
-AVX_INSTR pcmpistri, 0, 0, 0
-AVX_INSTR pcmpistrm, 0, 0, 0
+AVX_INSTR pclmulqdq, 0, 1, 0
+AVX_INSTR pcmpestri
+AVX_INSTR pcmpestrm
+AVX_INSTR pcmpistri
+AVX_INSTR pcmpistrm
 AVX_INSTR pcmpeqb, 0, 0, 1
 AVX_INSTR pcmpeqw, 0, 0, 1
 AVX_INSTR pcmpeqd, 0, 0, 1
@@ -1008,12 +1206,21 @@
 AVX_INSTR pcmpgtw, 0, 0, 0
 AVX_INSTR pcmpgtd, 0, 0, 0
 AVX_INSTR pcmpgtq, 0, 0, 0
+AVX_INSTR pextrb
+AVX_INSTR pextrd
+AVX_INSTR pextrq
+AVX_INSTR pextrw
 AVX_INSTR phaddw, 0, 0, 0
 AVX_INSTR phaddd, 0, 0, 0
 AVX_INSTR phaddsw, 0, 0, 0
+AVX_INSTR phminposuw
 AVX_INSTR phsubw, 0, 0, 0
 AVX_INSTR phsubd, 0, 0, 0
 AVX_INSTR phsubsw, 0, 0, 0
+AVX_INSTR pinsrb, 0, 1, 0
+AVX_INSTR pinsrd, 0, 1, 0
+AVX_INSTR pinsrq, 0, 1, 0
+AVX_INSTR pinsrw, 0, 1, 0
 AVX_INSTR pmaddwd, 0, 0, 1
 AVX_INSTR pmaddubsw, 0, 0, 0
 AVX_INSTR pmaxsb, 0, 0, 1
@@ -1028,20 +1235,32 @@
 AVX_INSTR pminub, 0, 0, 1
 AVX_INSTR pminuw, 0, 0, 1
 AVX_INSTR pminud, 0, 0, 1
-AVX_INSTR pmovmskb, 0, 0, 0
-AVX_INSTR pmulhuw, 0, 0, 1
+AVX_INSTR pmovmskb
+AVX_INSTR pmovsxbw
+AVX_INSTR pmovsxbd
+AVX_INSTR pmovsxbq
+AVX_INSTR pmovsxwd
+AVX_INSTR pmovsxwq
+AVX_INSTR pmovsxdq
+AVX_INSTR pmovzxbw
+AVX_INSTR pmovzxbd
+AVX_INSTR pmovzxbq
+AVX_INSTR pmovzxwd
+AVX_INSTR pmovzxwq
+AVX_INSTR pmovzxdq
+AVX_INSTR pmuldq, 0, 0, 1
 AVX_INSTR pmulhrsw, 0, 0, 1
+AVX_INSTR pmulhuw, 0, 0, 1
 AVX_INSTR pmulhw, 0, 0, 1
 AVX_INSTR pmullw, 0, 0, 1
 AVX_INSTR pmulld, 0, 0, 1
 AVX_INSTR pmuludq, 0, 0, 1
-AVX_INSTR pmuldq, 0, 0, 1
 AVX_INSTR por, 0, 0, 1
 AVX_INSTR psadbw, 0, 0, 1
 AVX_INSTR pshufb, 0, 0, 0
-AVX_INSTR pshufd, 0, 1, 0
-AVX_INSTR pshufhw, 0, 1, 0
-AVX_INSTR pshuflw, 0, 1, 0
+AVX_INSTR pshufd
+AVX_INSTR pshufhw
+AVX_INSTR pshuflw
 AVX_INSTR psignb, 0, 0, 0
 AVX_INSTR psignw, 0, 0, 0
 AVX_INSTR psignd, 0, 0, 0
@@ -1063,7 +1282,7 @@
 AVX_INSTR psubsw, 0, 0, 0
 AVX_INSTR psubusb, 0, 0, 0
 AVX_INSTR psubusw, 0, 0, 0
-AVX_INSTR ptest, 0, 0, 0
+AVX_INSTR ptest
 AVX_INSTR punpckhbw, 0, 0, 0
 AVX_INSTR punpckhwd, 0, 0, 0
 AVX_INSTR punpckhdq, 0, 0, 0
@@ -1073,11 +1292,27 @@
 AVX_INSTR punpckldq, 0, 0, 0
 AVX_INSTR punpcklqdq, 0, 0, 0
 AVX_INSTR pxor, 0, 0, 1
+AVX_INSTR rcpps, 1, 0, 0
+AVX_INSTR rcpss, 1, 0, 0
+AVX_INSTR roundpd
+AVX_INSTR roundps
+AVX_INSTR roundsd
+AVX_INSTR roundss
+AVX_INSTR rsqrtps, 1, 0, 0
+AVX_INSTR rsqrtss, 1, 0, 0
+AVX_INSTR shufpd, 1, 1, 0
 AVX_INSTR shufps, 1, 1, 0
+AVX_INSTR sqrtpd, 1, 0, 0
+AVX_INSTR sqrtps, 1, 0, 0
+AVX_INSTR sqrtsd, 1, 0, 0
+AVX_INSTR sqrtss, 1, 0, 0
+AVX_INSTR stmxcsr
 AVX_INSTR subpd, 1, 0, 0
 AVX_INSTR subps, 1, 0, 0
 AVX_INSTR subsd, 1, 0, 0
 AVX_INSTR subss, 1, 0, 0
+AVX_INSTR ucomisd
+AVX_INSTR ucomiss
 AVX_INSTR unpckhpd, 1, 0, 0
 AVX_INSTR unpckhps, 1, 0, 0
 AVX_INSTR unpcklpd, 1, 0, 0
@@ -1123,6 +1358,44 @@
 FMA_INSTR  pmacsww,  pmullw, paddw
 FMA_INSTR pmadcswd, pmaddwd, paddd
 
-; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
-; This lets us use tzcnt without bumping the yasm version requirement yet.
-%define tzcnt rep bsf
+; convert FMA4 to FMA3 if possible
+%macro FMA4_INSTR 4
+    %macro %1 4-8 %1, %2, %3, %4
+        %if cpuflag(fma4)
+            v%5 %1, %2, %3, %4
+        %elifidn %1, %2
+            v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
+        %elifidn %1, %3
+            v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
+        %elifidn %1, %4
+            v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
+        %else
+            %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
+        %endif
+    %endmacro
+%endmacro
+
+FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
+FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
+FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
+FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss
+
+FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
+FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
+FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
+FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps
+
+FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
+FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
+FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
+FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss
+
+FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
+FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
+FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
+FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss
+
+FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
+FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
+FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
+FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss

x264-snapshot-20120928-2245.tar.bz2/common/x86/x86util.asm -> x264-snapshot-20130224-2245.tar.bz2/common/x86/x86util.asm Changed

x264-snapshot-20120928-2245.tar.bz2/config.guess -> x264-snapshot-20130224-2245.tar.bz2/config.guess Changed

@@ -4,7 +4,7 @@
 #   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
 #   2011, 2012 Free Software Foundation, Inc.
 
-timestamp='2012-02-10'
+timestamp='2012-09-25'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@@ -200,6 +200,10 @@
 	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
 	echo "${machine}-${os}${release}"
 	exit ;;
+    *:Bitrig:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
+	echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE}
+	exit ;;
     *:OpenBSD:*:*)
 	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
 	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
@@ -302,7 +306,7 @@
     arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
 	echo arm-acorn-riscix${UNAME_RELEASE}
 	exit ;;
-    arm:riscos:*:*|arm:RISCOS:*:*)
+    arm*:riscos:*:*|arm*:RISCOS:*:*)
 	echo arm-unknown-riscos
 	exit ;;
     SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
@@ -801,6 +805,9 @@
     i*:CYGWIN*:*)
 	echo ${UNAME_MACHINE}-pc-cygwin
 	exit ;;
+    *:MINGW64*:*)
+	echo ${UNAME_MACHINE}-pc-mingw64
+	exit ;;
     *:MINGW*:*)
 	echo ${UNAME_MACHINE}-pc-mingw32
 	exit ;;
@@ -1201,6 +1208,9 @@
     BePC:Haiku:*:*)	# Haiku running on Intel PC compatible.
 	echo i586-pc-haiku
 	exit ;;
+    x86_64:Haiku:*:*)
+	echo x86_64-unknown-haiku
+	exit ;;
     SX-4:SUPER-UX:*:*)
 	echo sx4-nec-superux${UNAME_RELEASE}
 	exit ;;
@@ -1256,7 +1266,7 @@
     NEO-?:NONSTOP_KERNEL:*:*)
 	echo neo-tandem-nsk${UNAME_RELEASE}
 	exit ;;
-    NSE-?:NONSTOP_KERNEL:*:*)
+    NSE-*:NONSTOP_KERNEL:*:*)
 	echo nse-tandem-nsk${UNAME_RELEASE}
 	exit ;;
     NSR-?:NONSTOP_KERNEL:*:*)
@@ -1330,9 +1340,6 @@
 	exit ;;
 esac
 
-#echo '(No uname command or uname output not recognized.)' 1>&2
-#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
-
 eval $set_cc_for_build
 cat >$dummy.c <<EOF
 #ifdef _SEQUENT_

x264-snapshot-20120928-2245.tar.bz2/config.sub -> x264-snapshot-20130224-2245.tar.bz2/config.sub Changed

x264-snapshot-20120928-2245.tar.bz2/configure -> x264-snapshot-20130224-2245.tar.bz2/configure Changed

@@ -448,6 +448,7 @@
     fi
 fi
 
+libm=""
 case $host_os in
     beos*)
         SYS="BEOS"
@@ -456,37 +457,37 @@
     darwin*)
         SYS="MACOSX"
         CFLAGS="$CFLAGS -falign-loops=16"
-        LDFLAGS="$LDFLAGS -lm"
+        libm="-lm"
         if [ "$pic" = "no" ]; then
             cc_check "" -mdynamic-no-pic && CFLAGS="$CFLAGS -mdynamic-no-pic"
         fi
         ;;
     freebsd*)
         SYS="FREEBSD"
-        LDFLAGS="$LDFLAGS -lm"
+        libm="-lm"
         ;;
     kfreebsd*-gnu)
         SYS="FREEBSD"
         define HAVE_MALLOC_H
-        LDFLAGS="$LDFLAGS -lm"
+        libm="-lm"
         ;;
     netbsd*)
         SYS="NETBSD"
-        LDFLAGS="$LDFLAGS -lm"
+        libm="-lm"
         ;;
     openbsd*)
         SYS="OPENBSD"
-        LDFLAGS="$LDFLAGS -lm"
+        libm="-lm"
         ;;
     *linux*)
         SYS="LINUX"
         define HAVE_MALLOC_H
-        LDFLAGS="$LDFLAGS -lm"
+        libm="-lm"
         ;;
     gnu*)
         SYS="HURD"
         define HAVE_MALLOC_H
-        LDFLAGS="$LDFLAGS -lm"
+        libm="-lm"
         ;;
     cygwin*)
         EXE=".exe"
@@ -512,7 +513,7 @@
     sunos*|solaris*)
         SYS="SunOS"
         define HAVE_MALLOC_H
-        LDFLAGS="$LDFLAGS -lm"
+        libm="-lm"
         if cc_check "" /usr/lib/64/values-xpg6.o; then
             LDFLAGS="$LDFLAGS /usr/lib/64/values-xpg6.o"
         else
@@ -525,6 +526,9 @@
         ;;
 esac
 
+LDFLAGS="$LDFLAGS $libm"
+
+aligned_stack=1
 case $host_cpu in
     i*86)
         ARCH="X86"
@@ -545,6 +549,7 @@
                 # < 11 is completely incapable of keeping a mod16 stack
                 if cpp_check "" "" "__INTEL_COMPILER < 1100" ; then
                     define BROKEN_STACK_ALIGNMENT
+                    aligned_stack=0
                 # 11 <= x < 12 is capable of keeping a mod16 stack, but defaults to not doing so.
                 elif cpp_check "" "" "__INTEL_COMPILER < 1200" ; then
                     CFLAGS="$CFLAGS -falign-stack=assume-16-byte"
@@ -552,7 +557,7 @@
                 # >= 12 defaults to a mod16 stack
             fi
             # icl on windows has no mod16 stack support
-            [ $SYS = WINDOWS ] && define BROKEN_STACK_ALIGNMENT
+            [ $SYS = WINDOWS ] && define BROKEN_STACK_ALIGNMENT && aligned_stack=0
         fi
         if [ "$SYS" = MACOSX ]; then
             ASFLAGS="$ASFLAGS -f macho -DPREFIX"
@@ -645,6 +650,7 @@
         ARCH="$(echo $host_cpu | tr a-z A-Z)"
         ;;
 esac
+ASFLAGS="$ASFLAGS -DHAVE_ALIGNED_STACK=${aligned_stack}"
 
 if [ $SYS = WINDOWS ]; then
     if ! rc_check "0 RCDATA {0}" ; then
@@ -681,10 +687,10 @@
 fi
 
 if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
-    if ! as_check "vpperm xmm0, xmm0, xmm0, xmm0" ; then
+    if ! as_check "vpmovzxwd ymm0, xmm0" ; then
         VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1`
         echo "Found $VER"
-        echo "Minimum version is yasm-1.0.0"
+        echo "Minimum version is yasm-1.2.0"
         echo "If you really want to compile without asm, configure with --disable-asm."
         exit 1
     fi
@@ -832,7 +838,7 @@
         done
     fi
     LAVF_LIBS="-L. $LAVF_LIBS"
-    if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" "avformat_find_stream_info(0,0); avcodec_open2(0,0,0);" ; then
+    if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" "avformat_close_input(0);" ; then
         if [ "$swscale" = "yes" ]; then
             lavf="yes"
         else
@@ -901,7 +907,7 @@
 fi
 if [ "$gpac" = "yes" ] ; then
     define HAVE_GPAC
-    if cc_check gpac/isomedia.h "-Werror $GPAC_LIBS" "gf_malloc(1); gf_free(NULL);" ; then
+    if cc_check gpac/isomedia.h "-Werror $GPAC_LIBS" "void *p; p = gf_malloc(1); gf_free(p);" ; then
         define HAVE_GF_MALLOC
     fi
     LDFLAGSCLI="$GPAC_LIBS $LDFLAGSCLI"
@@ -1146,8 +1152,6 @@
 
 ${SRCPATH}/version.sh "${SRCPATH}" >> x264_config.h
 
-pclibs="-L$libdir -lx264 $libpthread"
-
 cat > x264.pc << EOF
 prefix=$prefix
 exec_prefix=$exec_prefix
@@ -1157,7 +1161,8 @@
 Name: x264
 Description: H.264 (MPEG4 AVC) encoder library
 Version: $(grep POINTVER < x264_config.h | sed -e 's/.* "//; s/".*//')
-Libs: $pclibs
+Libs: -L$libdir -lx264
+Libs.private: $libpthread $libm
 Cflags: -I$includedir
 EOF

x264-snapshot-20120928-2245.tar.bz2/encoder/analyse.c -> x264-snapshot-20130224-2245.tar.bz2/encoder/analyse.c Changed

x264-snapshot-20120928-2245.tar.bz2/encoder/analyse.h -> x264-snapshot-20130224-2245.tar.bz2/encoder/analyse.h Changed

x264-snapshot-20120928-2245.tar.bz2/encoder/cabac.c -> x264-snapshot-20130224-2245.tar.bz2/encoder/cabac.c Changed

x264-snapshot-20120928-2245.tar.bz2/encoder/cavlc.c -> x264-snapshot-20130224-2245.tar.bz2/encoder/cavlc.c Changed

x264-snapshot-20120928-2245.tar.bz2/encoder/encoder.c -> x264-snapshot-20130224-2245.tar.bz2/encoder/encoder.c Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * encoder.c: top-level encoder functions
  *****************************************************************************
- * Copyright (C) 2003-2012 x264 project
+ * Copyright (C) 2003-2013 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
@@ -577,7 +577,7 @@
         h->param.rc.i_qp_constant = h->param.rc.f_rf_constant + QP_BD_OFFSET;
         h->param.rc.i_bitrate = 0;
     }
-    if( (h->param.rc.i_rc_method == X264_RC_CQP || h->param.rc.i_rc_method == X264_RC_CRF)
+    if( b_open && (h->param.rc.i_rc_method == X264_RC_CQP || h->param.rc.i_rc_method == X264_RC_CRF)
         && h->param.rc.i_qp_constant == 0 )
     {
         h->mb.b_lossless = 1;
@@ -2236,14 +2236,11 @@
     /* Slice header */
     x264_macroblock_thread_init( h );
 
-    /* If this isn't the first slice in the threadslice, set the slice QP
-     * equal to the last QP in the previous slice for more accurate
-     * CABAC initialization. */
-    if( h->sh.i_first_mb != h->i_threadslice_start * h->mb.i_mb_width )
-    {
-        h->sh.i_qp = h->mb.i_last_qp;
-        h->sh.i_qp_delta = h->sh.i_qp - h->pps->i_pic_init_qp;
-    }
+    /* Set the QP equal to the first QP in the slice for more accurate CABAC initialization. */
+    h->mb.i_mb_xy = h->sh.i_first_mb;
+    h->sh.i_qp = x264_ratecontrol_mb_qp( h );
+    h->sh.i_qp = SPEC_QP( h->sh.i_qp );
+    h->sh.i_qp_delta = h->sh.i_qp - h->pps->i_pic_init_qp;
 
     x264_slice_header_write( &h->out.bs, &h->sh, h->i_nal_ref_idc );
     if( h->param.b_cabac )

x264-snapshot-20120928-2245.tar.bz2/encoder/lookahead.c -> x264-snapshot-20130224-2245.tar.bz2/encoder/lookahead.c Changed

x264-snapshot-20120928-2245.tar.bz2/encoder/macroblock.c -> x264-snapshot-20130224-2245.tar.bz2/encoder/macroblock.c Changed

x264-snapshot-20120928-2245.tar.bz2/encoder/macroblock.h -> x264-snapshot-20130224-2245.tar.bz2/encoder/macroblock.h Changed

x264-snapshot-20120928-2245.tar.bz2/encoder/me.c -> x264-snapshot-20130224-2245.tar.bz2/encoder/me.c Changed

x264-snapshot-20120928-2245.tar.bz2/encoder/me.h -> x264-snapshot-20130224-2245.tar.bz2/encoder/me.h Changed

x264-snapshot-20120928-2245.tar.bz2/encoder/ratecontrol.c -> x264-snapshot-20130224-2245.tar.bz2/encoder/ratecontrol.c Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * ratecontrol.c: ratecontrol
  *****************************************************************************
- * Copyright (C) 2005-2012 x264 project
+ * Copyright (C) 2005-2013 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Michael Niedermayer <michaelni@gmx.at>
@@ -1018,6 +1018,7 @@
 
         /* read stats */
         p = stats_in;
+        double total_qp_aq = 0;
         for( int i = 0; i < rc->num_entries; i++ )
         {
             ratecontrol_entry_t *rce;
@@ -1025,7 +1026,7 @@
             char pict_type;
             int e;
             char *next;
-            float qp;
+            float qp_rc, qp_aq;
             int ref;
 
             next= strchr(p, ';');
@@ -1041,8 +1042,8 @@
             rce = &rc->entry[frame_number];
             rce->direct_mode = 0;
 
-            e += sscanf( p, " in:%*d out:%*d type:%c dur:%"SCNd64" cpbdur:%"SCNd64" q:%f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c",
-                   &pict_type, &rce->i_duration, &rce->i_cpb_duration, &qp, &rce->tex_bits,
+            e += sscanf( p, " in:%*d out:%*d type:%c dur:%"SCNd64" cpbdur:%"SCNd64" q:%f aq:%f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c",
+                   &pict_type, &rce->i_duration, &rce->i_cpb_duration, &qp_rc, &qp_aq, &rce->tex_bits,
                    &rce->mv_bits, &rce->misc_bits, &rce->i_count, &rce->p_count,
                    &rce->s_count, &rce->direct_mode );
             rce->tex_bits  *= res_factor_bits;
@@ -1107,15 +1108,17 @@
                     break;
                 default:  e = -1; break;
             }
-            if( e < 12 )
+            if( e < 13 )
             {
 parse_error:
                 x264_log( h, X264_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e );
                 return -1;
             }
-            rce->qscale = qp2qscale( qp );
+            rce->qscale = qp2qscale( qp_rc );
+            total_qp_aq += qp_aq;
             p = next;
         }
+        h->pps->i_pic_init_qp = SPEC_QP( (int)(total_qp_aq / rc->num_entries + 0.5) );
 
         x264_free( stats_buf );
 
@@ -1801,10 +1804,11 @@
                           dir_avg>0 ? 's' : dir_avg<0 ? 't' : '-' )
                         : '-';
         if( fprintf( rc->p_stat_file_out,
-                 "in:%d out:%d type:%c dur:%"PRId64" cpbdur:%"PRId64" q:%.2f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c ref:",
+                 "in:%d out:%d type:%c dur:%"PRId64" cpbdur:%"PRId64" q:%.2f aq:%.2f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c ref:",
                  h->fenc->i_frame, h->i_frame,
                  c_type, h->fenc->i_duration,
-                 h->fenc->i_cpb_duration, rc->qpa_rc,
+                 h->fenc->i_cpb_duration,
+                 rc->qpa_rc, h->fdec->f_qp_avg_aq,
                  h->stat.frame.i_tex_bits,
                  h->stat.frame.i_mv_bits,
                  h->stat.frame.i_misc_bits,

x264-snapshot-20120928-2245.tar.bz2/encoder/ratecontrol.h -> x264-snapshot-20130224-2245.tar.bz2/encoder/ratecontrol.h Changed

x264-snapshot-20120928-2245.tar.bz2/encoder/rdo.c -> x264-snapshot-20130224-2245.tar.bz2/encoder/rdo.c Changed

x264-snapshot-20120928-2245.tar.bz2/encoder/set.c -> x264-snapshot-20130224-2245.tar.bz2/encoder/set.c Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * set: header writing
  *****************************************************************************
- * Copyright (C) 2003-2012 x264 project
+ * Copyright (C) 2003-2013 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
@@ -575,7 +575,7 @@
 
     memcpy( payload, uuid, 16 );
     sprintf( payload+16, "x264 - core %d%s - H.264/MPEG-4 AVC codec - "
-             "Copy%s 2003-2012 - http://www.videolan.org/x264.html - options: %s",
+             "Copy%s 2003-2013 - http://www.videolan.org/x264.html - options: %s",
              X264_BUILD, X264_VERSION, HAVE_GPL?"left":"right", opts );
     length = strlen(payload)+1;
 
@@ -727,23 +727,23 @@
 
 const x264_level_t x264_levels[] =
 {
-    { 10,    1485,    99,   152064,     64,    175,  64, 64,  0, 2, 0, 0, 1 },
-    {  9,    1485,    99,   152064,    128,    350,  64, 64,  0, 2, 0, 0, 1 }, /* "1b" */
-    { 11,    3000,   396,   345600,    192,    500, 128, 64,  0, 2, 0, 0, 1 },
-    { 12,    6000,   396,   912384,    384,   1000, 128, 64,  0, 2, 0, 0, 1 },
-    { 13,   11880,   396,   912384,    768,   2000, 128, 64,  0, 2, 0, 0, 1 },
-    { 20,   11880,   396,   912384,   2000,   2000, 128, 64,  0, 2, 0, 0, 1 },
-    { 21,   19800,   792,  1824768,   4000,   4000, 256, 64,  0, 2, 0, 0, 0 },
-    { 22,   20250,  1620,  3110400,   4000,   4000, 256, 64,  0, 2, 0, 0, 0 },
-    { 30,   40500,  1620,  3110400,  10000,  10000, 256, 32, 22, 2, 0, 1, 0 },
-    { 31,  108000,  3600,  6912000,  14000,  14000, 512, 16, 60, 4, 1, 1, 0 },
-    { 32,  216000,  5120,  7864320,  20000,  20000, 512, 16, 60, 4, 1, 1, 0 },
-    { 40,  245760,  8192, 12582912,  20000,  25000, 512, 16, 60, 4, 1, 1, 0 },
-    { 41,  245760,  8192, 12582912,  50000,  62500, 512, 16, 24, 2, 1, 1, 0 },
-    { 42,  522240,  8704, 13369344,  50000,  62500, 512, 16, 24, 2, 1, 1, 1 },
-    { 50,  589824, 22080, 42393600, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 },
-    { 51,  983040, 36864, 70778880, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
-    { 52, 2073600, 36864, 70778880, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
+    { 10,    1485,    99,    396,     64,    175,  64, 64,  0, 2, 0, 0, 1 },
+    {  9,    1485,    99,    396,    128,    350,  64, 64,  0, 2, 0, 0, 1 }, /* "1b" */
+    { 11,    3000,   396,    900,    192,    500, 128, 64,  0, 2, 0, 0, 1 },
+    { 12,    6000,   396,   2376,    384,   1000, 128, 64,  0, 2, 0, 0, 1 },
+    { 13,   11880,   396,   2376,    768,   2000, 128, 64,  0, 2, 0, 0, 1 },
+    { 20,   11880,   396,   2376,   2000,   2000, 128, 64,  0, 2, 0, 0, 1 },
+    { 21,   19800,   792,   4752,   4000,   4000, 256, 64,  0, 2, 0, 0, 0 },
+    { 22,   20250,  1620,   8100,   4000,   4000, 256, 64,  0, 2, 0, 0, 0 },
+    { 30,   40500,  1620,   8100,  10000,  10000, 256, 32, 22, 2, 0, 1, 0 },
+    { 31,  108000,  3600,  18000,  14000,  14000, 512, 16, 60, 4, 1, 1, 0 },
+    { 32,  216000,  5120,  20480,  20000,  20000, 512, 16, 60, 4, 1, 1, 0 },
+    { 40,  245760,  8192,  32768,  20000,  25000, 512, 16, 60, 4, 1, 1, 0 },
+    { 41,  245760,  8192,  32768,  50000,  62500, 512, 16, 24, 2, 1, 1, 0 },
+    { 42,  522240,  8704,  34816,  50000,  62500, 512, 16, 24, 2, 1, 1, 1 },
+    { 50,  589824, 22080, 110400, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 },
+    { 51,  983040, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
+    { 52, 2073600, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
     { 0 }
 };
 
@@ -758,7 +758,7 @@
 {
     int ret = 0;
     int mbs = h->sps->i_mb_width * h->sps->i_mb_height;
-    int dpb = mbs * 384 * h->sps->vui.i_max_dec_frame_buffering;
+    int dpb = mbs * h->sps->vui.i_max_dec_frame_buffering;
     int cbp_factor = h->sps->i_profile_idc>=PROFILE_HIGH422 ? 16 :
                      h->sps->i_profile_idc==PROFILE_HIGH10 ? 12 :
                      h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4;
@@ -773,8 +773,8 @@
         ERROR( "frame MB size (%dx%d) > level limit (%d)\n",
                h->sps->i_mb_width, h->sps->i_mb_height, l->frame_size );
     if( dpb > l->dpb )
-        ERROR( "DPB size (%d frames, %d bytes) > level limit (%d frames, %d bytes)\n",
-                h->sps->vui.i_max_dec_frame_buffering, dpb, (int)(l->dpb / (384*mbs)), l->dpb );
+        ERROR( "DPB size (%d frames, %d mbs) > level limit (%d frames, %d mbs)\n",
+                h->sps->vui.i_max_dec_frame_buffering, dpb, l->dpb / mbs, l->dpb );
 
 #define CHECK( name, limit, val ) \
     if( (val) > (limit) ) \

x264-snapshot-20120928-2245.tar.bz2/encoder/set.h -> x264-snapshot-20130224-2245.tar.bz2/encoder/set.h Changed

x264-snapshot-20120928-2245.tar.bz2/encoder/slicetype.c -> x264-snapshot-20130224-2245.tar.bz2/encoder/slicetype.c Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * slicetype.c: lookahead analysis
  *****************************************************************************
- * Copyright (C) 2005-2012 x264 project
+ * Copyright (C) 2005-2013 x264 project
  *
  * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
  *          Loren Merritt <lorenm@u.washington.edu>
@@ -1124,6 +1124,7 @@
 
     if( !h->param.rc.i_lookahead )
     {
+        x264_slicetype_frame_cost( h, a, frames, 0, last_nonb, last_nonb, 0 );
         x264_macroblock_tree_propagate( h, frames, average_duration, 0, last_nonb, last_nonb, 1 );
         XCHG( uint16_t*, frames[last_nonb]->i_propagate_cost, frames[0]->i_propagate_cost );
     }
@@ -1650,7 +1651,10 @@
             if( warn && h->param.b_open_gop )
                 warn &= frm->i_type != X264_TYPE_I;
             if( warn )
+            {
                 x264_log( h, X264_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n", frm->i_type, frm->i_frame );
+                frm->i_type = h->param.b_open_gop && h->lookahead->i_last_keyframe >= 0 ? X264_TYPE_I : X264_TYPE_IDR;
+            }
         }
         if( frm->i_type == X264_TYPE_I && frm->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_min )
         {

x264-snapshot-20120928-2245.tar.bz2/filters/filters.c -> x264-snapshot-20130224-2245.tar.bz2/filters/filters.c Changed

x264-snapshot-20120928-2245.tar.bz2/filters/filters.h -> x264-snapshot-20130224-2245.tar.bz2/filters/filters.h Changed

x264-snapshot-20120928-2245.tar.bz2/filters/video/cache.c -> x264-snapshot-20130224-2245.tar.bz2/filters/video/cache.c Changed

x264-snapshot-20120928-2245.tar.bz2/filters/video/crop.c -> x264-snapshot-20130224-2245.tar.bz2/filters/video/crop.c Changed

x264-snapshot-20120928-2245.tar.bz2/filters/video/depth.c -> x264-snapshot-20130224-2245.tar.bz2/filters/video/depth.c Changed

x264-snapshot-20120928-2245.tar.bz2/filters/video/fix_vfr_pts.c -> x264-snapshot-20130224-2245.tar.bz2/filters/video/fix_vfr_pts.c Changed

x264-snapshot-20120928-2245.tar.bz2/filters/video/internal.c -> x264-snapshot-20130224-2245.tar.bz2/filters/video/internal.c Changed

x264-snapshot-20120928-2245.tar.bz2/filters/video/internal.h -> x264-snapshot-20130224-2245.tar.bz2/filters/video/internal.h Changed

x264-snapshot-20120928-2245.tar.bz2/filters/video/resize.c -> x264-snapshot-20130224-2245.tar.bz2/filters/video/resize.c Changed

x264-snapshot-20120928-2245.tar.bz2/filters/video/select_every.c -> x264-snapshot-20130224-2245.tar.bz2/filters/video/select_every.c Changed

x264-snapshot-20120928-2245.tar.bz2/filters/video/source.c -> x264-snapshot-20130224-2245.tar.bz2/filters/video/source.c Changed

x264-snapshot-20120928-2245.tar.bz2/filters/video/video.c -> x264-snapshot-20130224-2245.tar.bz2/filters/video/video.c Changed

x264-snapshot-20120928-2245.tar.bz2/filters/video/video.h -> x264-snapshot-20130224-2245.tar.bz2/filters/video/video.h Changed

x264-snapshot-20120928-2245.tar.bz2/input/avs.c -> x264-snapshot-20130224-2245.tar.bz2/input/avs.c Changed

x264-snapshot-20120928-2245.tar.bz2/input/ffms.c -> x264-snapshot-20130224-2245.tar.bz2/input/ffms.c Changed

x264-snapshot-20120928-2245.tar.bz2/input/input.c -> x264-snapshot-20130224-2245.tar.bz2/input/input.c Changed

x264-snapshot-20120928-2245.tar.bz2/input/input.h -> x264-snapshot-20130224-2245.tar.bz2/input/input.h Changed

x264-snapshot-20120928-2245.tar.bz2/input/lavf.c -> x264-snapshot-20130224-2245.tar.bz2/input/lavf.c Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * lavf.c: libavformat input
  *****************************************************************************
- * Copyright (C) 2009-2012 x264 project
+ * Copyright (C) 2009-2013 x264 project
  *
  * Authors: Mike Gurlitz <mike.gurlitz@gmail.com>
  *          Steven Walters <kemuri9@gmail.com>
@@ -28,12 +28,14 @@
 #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "lavf", __VA_ARGS__ )
 #undef DECLARE_ALIGNED
 #include <libavformat/avformat.h>
+#include <libavutil/mem.h>
 #include <libavutil/pixdesc.h>
 #include <libavutil/dict.h>
 
 typedef struct
 {
     AVFormatContext *lavf;
+    AVFrame *frame;
     int stream_id;
     int next_frame;
     int vfr_input;
@@ -80,8 +82,8 @@
 
     AVCodecContext *c = h->lavf->streams[h->stream_id]->codec;
     AVPacket *pkt = p_pic->opaque;
-    AVFrame frame;
-    avcodec_get_frame_defaults( &frame );
+
+    avcodec_get_frame_defaults( h->frame );
 
     while( i_frame >= h->next_frame )
     {
@@ -97,7 +99,7 @@
                     pkt->size = 0;
 
                 c->reordered_opaque = pkt->pts;
-                if( avcodec_decode_video2( c, &frame, &finished, pkt ) < 0 )
+                if( avcodec_decode_video2( c, h->frame, &finished, pkt ) < 0 )
                     x264_cli_log( "lavf", X264_LOG_WARNING, "video decoding failed on frame %d\n", h->next_frame );
             }
             /* if the packet successfully decoded but the data from it is not desired, free it */
@@ -111,8 +113,8 @@
         h->next_frame++;
     }
 
-    memcpy( p_pic->img.stride, frame.linesize, sizeof(p_pic->img.stride) );
-    memcpy( p_pic->img.plane, frame.data, sizeof(p_pic->img.plane) );
+    memcpy( p_pic->img.stride, h->frame->linesize, sizeof(p_pic->img.stride) );
+    memcpy( p_pic->img.plane, h->frame->data, sizeof(p_pic->img.plane) );
     int is_fullrange   = 0;
     p_pic->img.width   = c->width;
     p_pic->img.height  = c->height;
@@ -121,15 +123,15 @@
     if( info )
     {
         info->fullrange  = is_fullrange;
-        info->interlaced = frame.interlaced_frame;
-        info->tff        = frame.top_field_first;
+        info->interlaced = h->frame->interlaced_frame;
+        info->tff        = h->frame->top_field_first;
     }
 
     if( h->vfr_input )
     {
         p_pic->pts = p_pic->duration = 0;
-        if( c->has_b_frames && frame.reordered_opaque != AV_NOPTS_VALUE )
-            p_pic->pts = frame.reordered_opaque;
+        if( c->has_b_frames && h->frame->reordered_opaque != AV_NOPTS_VALUE )
+            p_pic->pts = h->frame->reordered_opaque;
         else if( pkt->dts != AV_NOPTS_VALUE )
             p_pic->pts = pkt->dts; // for AVI files
         else if( info )
@@ -151,6 +153,10 @@
     if( !strcmp( psz_filename, "-" ) )
         psz_filename = "pipe:";
 
+    h->frame = avcodec_alloc_frame();
+    if( !h->frame )
+        return -1;
+
     /* if resolution was passed in, place it and colorspace into options. this allows raw video support */
     AVDictionary *options = NULL;
     if( opt->resolution )
@@ -245,7 +251,12 @@
 {
     lavf_hnd_t *h = handle;
     avcodec_close( h->lavf->streams[h->stream_id]->codec );
-    av_close_input_file( h->lavf );
+    avformat_close_input( &h->lavf );
+#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(54, 28, 0)
+    avcodec_free_frame( &h->frame );
+#else
+    av_freep( &h->frame );
+#endif
     free( h );
     return 0;
 }

x264-snapshot-20120928-2245.tar.bz2/input/raw.c -> x264-snapshot-20130224-2245.tar.bz2/input/raw.c Changed

x264-snapshot-20120928-2245.tar.bz2/input/thread.c -> x264-snapshot-20130224-2245.tar.bz2/input/thread.c Changed

x264-snapshot-20120928-2245.tar.bz2/input/timecode.c -> x264-snapshot-20130224-2245.tar.bz2/input/timecode.c Changed

x264-snapshot-20120928-2245.tar.bz2/input/y4m.c -> x264-snapshot-20130224-2245.tar.bz2/input/y4m.c Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * y4m.c: y4m input
  *****************************************************************************
- * Copyright (C) 2003-2012 x264 project
+ * Copyright (C) 2003-2013 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
@@ -35,6 +35,7 @@
     int frame_header_len;
     uint64_t frame_size;
     uint64_t plane_size[3];
+    int bit_depth;
 } y4m_hnd_t;
 
 #define Y4M_MAGIC "YUV4MPEG2"
@@ -42,15 +43,23 @@
 #define Y4M_FRAME_MAGIC "FRAME"
 #define MAX_FRAME_HEADER 80
 
-static int csp_string_to_int( char *csp_name )
+static int parse_csp_and_depth( char *csp_name, int *bit_depth )
 {
-    int csp = X264_CSP_MAX;
+    int csp    = X264_CSP_MAX;
+    *bit_depth = 8;
+
+    /* Set colorspace from known variants */
     if( !strncmp( "420", csp_name, 3 ) )
         csp = X264_CSP_I420;
     else if( !strncmp( "422", csp_name, 3 ) )
         csp = X264_CSP_I422;
     else if( !strncmp( "444", csp_name, 3 ) && strncmp( "444alpha", csp_name, 8 ) ) // only accept alphaless 4:4:4
         csp = X264_CSP_I444;
+
+    /* Set high bit depth from known extensions */
+    if( !strncmp( "p", csp_name + 3, 1 ) )
+        *bit_depth = strtol( csp_name + 4, NULL, 10 );
+
     return csp;
 }
 
@@ -63,6 +72,7 @@
     char *tokend, *header_end;
     int colorspace = X264_CSP_NONE;
     int alt_colorspace = X264_CSP_NONE;
+    int alt_bit_depth  = 8;
     if( !h )
         return -1;
 
@@ -112,7 +122,7 @@
                 tokstart=tokend;
                 break;
             case 'C': /* Color space */
-                colorspace = csp_string_to_int( tokstart );
+                colorspace = parse_csp_and_depth( tokstart, &h->bit_depth );
                 tokstart = strchr( tokstart, 0x20 );
                 break;
             case 'I': /* Interlace type */
@@ -159,7 +169,7 @@
                 {
                     /* Older nonstandard pixel format representation */
                     tokstart += 6;
-                    alt_colorspace = csp_string_to_int( tokstart );
+                    alt_colorspace = parse_csp_and_depth( tokstart, &alt_bit_depth );
                 }
                 tokstart = strchr( tokstart, 0x20 );
                 break;
@@ -167,22 +177,37 @@
     }
 
     if( colorspace == X264_CSP_NONE )
-        colorspace = alt_colorspace;
+    {
+        colorspace   = alt_colorspace;
+        h->bit_depth = alt_bit_depth;
+    }
 
-    // default to 4:2:0 if nothing is specified
+    // default to 8bit 4:2:0 if nothing is specified
     if( colorspace == X264_CSP_NONE )
-        colorspace = X264_CSP_I420;
+    {
+        colorspace    = X264_CSP_I420;
+        h->bit_depth  = 8;
+    }
 
     FAIL_IF_ERROR( colorspace <= X264_CSP_NONE || colorspace >= X264_CSP_MAX, "colorspace unhandled\n" )
+    FAIL_IF_ERROR( h->bit_depth < 8 || h->bit_depth > 16, "unsupported bit depth `%d'\n", h->bit_depth );
 
     info->thread_safe = 1;
     info->num_frames  = 0;
     info->csp         = colorspace;
     h->frame_size     = h->frame_header_len;
-    for( i = 0; i < x264_cli_csps[info->csp].planes; i++ )
+
+    if( h->bit_depth > 8 )
+        info->csp |= X264_CSP_HIGH_DEPTH;
+
+    const x264_cli_csp_t *csp = x264_cli_get_csp( info->csp );
+
+    for( i = 0; i < csp->planes; i++ )
     {
         h->plane_size[i] = x264_cli_pic_plane_size( info->csp, info->width, info->height, i );
         h->frame_size += h->plane_size[i];
+        /* x264_cli_pic_plane_size returns the size in bytes, we need the value in pixels from here on */
+        h->plane_size[i] /= x264_cli_csp_depth_factor( info->csp );
     }
 
     /* Most common case: frame_header = "FRAME" */
@@ -202,6 +227,7 @@
 static int read_frame_internal( cli_pic_t *pic, y4m_hnd_t *h )
 {
     size_t slen = strlen( Y4M_FRAME_MAGIC );
+    int pixel_depth = x264_cli_csp_depth_factor( pic->img.csp );
     int i = 0;
     char header[16];
 
@@ -222,7 +248,19 @@
 
     int error = 0;
     for( i = 0; i < pic->img.planes && !error; i++ )
-        error |= fread( pic->img.plane[i], h->plane_size[i], 1, h->fh ) <= 0;
+    {
+        error |= fread( pic->img.plane[i], pixel_depth, h->plane_size[i], h->fh ) != h->plane_size[i];
+        if( h->bit_depth & 7 )
+        {
+            /* upconvert non 16bit high depth planes to 16bit using the same
+             * algorithm as used in the depth filter. */
+            uint16_t *plane = (uint16_t*)pic->img.plane[i];
+            uint64_t pixel_count = h->plane_size[i];
+            int lshift = 16 - h->bit_depth;
+            for( uint64_t j = 0; j < pixel_count; j++ )
+                plane[j] = plane[j] << lshift;
+        }
+    }
     return error;
 }

x264-snapshot-20120928-2245.tar.bz2/output/flv.c -> x264-snapshot-20130224-2245.tar.bz2/output/flv.c Changed

x264-snapshot-20120928-2245.tar.bz2/output/flv_bytestream.c -> x264-snapshot-20130224-2245.tar.bz2/output/flv_bytestream.c Changed

x264-snapshot-20120928-2245.tar.bz2/output/flv_bytestream.h -> x264-snapshot-20130224-2245.tar.bz2/output/flv_bytestream.h Changed

x264-snapshot-20120928-2245.tar.bz2/output/matroska.c -> x264-snapshot-20130224-2245.tar.bz2/output/matroska.c Changed

x264-snapshot-20120928-2245.tar.bz2/output/matroska_ebml.c -> x264-snapshot-20130224-2245.tar.bz2/output/matroska_ebml.c Changed

x264-snapshot-20120928-2245.tar.bz2/output/matroska_ebml.h -> x264-snapshot-20130224-2245.tar.bz2/output/matroska_ebml.h Changed

x264-snapshot-20120928-2245.tar.bz2/output/mp4.c -> x264-snapshot-20130224-2245.tar.bz2/output/mp4.c Changed

x264-snapshot-20120928-2245.tar.bz2/output/output.h -> x264-snapshot-20130224-2245.tar.bz2/output/output.h Changed

x264-snapshot-20120928-2245.tar.bz2/output/raw.c -> x264-snapshot-20130224-2245.tar.bz2/output/raw.c Changed

x264-snapshot-20120928-2245.tar.bz2/tools/checkasm-a.asm -> x264-snapshot-20130224-2245.tar.bz2/tools/checkasm-a.asm Changed

x264-snapshot-20120928-2245.tar.bz2/tools/checkasm.c -> x264-snapshot-20130224-2245.tar.bz2/tools/checkasm.c Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * checkasm.c: assembly check tool
  *****************************************************************************
- * Copyright (C) 2003-2012 x264 project
+ * Copyright (C) 2003-2013 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
@@ -164,6 +164,7 @@
             if( k < j )
                 continue;
             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+                    b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" :
                     b->cpu&X264_CPU_AVX2 ? "avx2" :
                     b->cpu&X264_CPU_FMA3 ? "fma3" :
                     b->cpu&X264_CPU_FMA4 ? "fma4" :
@@ -2444,11 +2445,6 @@
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
         cpu1 &= ~X264_CPU_FMA4;
     }
-    if( x264_cpu_detect() & X264_CPU_FMA3 )
-    {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
-        cpu1 &= ~X264_CPU_FMA3;
-    }
     if( x264_cpu_detect() & X264_CPU_BMI1 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
@@ -2466,6 +2462,11 @@
     }
     if( x264_cpu_detect() & X264_CPU_AVX2 )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+    if( x264_cpu_detect() & X264_CPU_FMA3 )
+    {
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
+        cpu1 &= ~X264_CPU_FMA3;
+    }
 #elif ARCH_PPC
     if( x264_cpu_detect() & X264_CPU_ALTIVEC )
     {

x264-snapshot-20120928-2245.tar.bz2/x264.c -> x264-snapshot-20130224-2245.tar.bz2/x264.c Changed

x264-snapshot-20120928-2245.tar.bz2/x264.h -> x264-snapshot-20130224-2245.tar.bz2/x264.h Changed

@@ -1,7 +1,7 @@
 /*****************************************************************************
  * x264.h: x264 public header
  *****************************************************************************
- * Copyright (C) 2003-2012 x264 project
+ * Copyright (C) 2003-2013 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
@@ -41,7 +41,7 @@
 
 #include "x264_config.h"
 
-#define X264_BUILD 128
+#define X264_BUILD 129
 
 /* Application developers planning to link against a shared library version of
  * libx264 from a Microsoft Visual Studio or similar development environment
@@ -523,7 +523,7 @@
     int level_idc;
     int mbps;        /* max macroblock processing rate (macroblocks/sec) */
     int frame_size;  /* max frame size (macroblocks) */
-    int dpb;         /* max decoded picture buffer (bytes) */
+    int dpb;         /* max decoded picture buffer (mbs) */
     int bitrate;     /* max bitrate (kbit/sec) */
     int cpb;         /* max vbv buffer (kbit) */
     int mv_range;    /* max vertical mv component range (pixels) */

x264-snapshot-20120928-2245.tar.bz2/x264cli.h -> x264-snapshot-20130224-2245.tar.bz2/x264cli.h Changed

x264-snapshot-20120928-2245.tar.bz2/x264dll.c -> x264-snapshot-20130224-2245.tar.bz2/x264dll.c Changed

x264-snapshot-20120928-2245.tar.bz2/x264res.rc -> x264-snapshot-20130224-2245.tar.bz2/x264res.rc Changed

Changes of Revision 2