Packman Build Service PMBS

libx264.changes Changed

​x
 
@@ -1,4 +1,9 @@
 -------------------------------------------------------------------
+Sun Mar  1 09:33:42 UTC 2015 - i@margueirte.su
+
+- update version 20141218
+
+-------------------------------------------------------------------
 Wed Nov  5 12:33:30 UTC 2014 - i@margueirte.su
 
 - update version 20141104
​

libx264.spec Changed

 
@@ -1,7 +1,7 @@
 #
 # spec file for package libx264
 #
-# Copyright (c) 2014 SUSE LINUX Products GmbH, Nuernberg, Germany.
+# Copyright (c) 2015 SUSE LINUX GmbH, Nuernberg, Germany.
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -17,7 +17,7 @@
 
 
 %define soname  142
-%define svn     20141104
+%define svn     20141218
 Name:           libx264
 Version:        0.%{soname}svn%{svn}
 Release:        0
​

x264-use-shared-library.patch Changed

 
@@ -1,17 +1,17 @@
-Index: x264-snapshot-20130723-2245/Makefile
+Index: x264-snapshot-20141218-2245/Makefile
 ===================================================================
---- x264-snapshot-20130723-2245.orig/Makefile
-+++ x264-snapshot-20130723-2245/Makefile
-@@ -171,6 +171,7 @@ $(LIBX264): $(GENERATED) .depend $(OBJS)
+--- x264-snapshot-20141218-2245.orig/Makefile
++++ x264-snapshot-20141218-2245/Makefile
+@@ -176,6 +176,7 @@ $(LIBX264): $(GENERATED) .depend $(OBJS)
  
  $(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
    $(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 +  ln -s $(SONAME) libx264.so
  
  ifneq ($(EXE),)
- .PHONY: x264 checkasm
-@@ -178,8 +179,8 @@ x264: x264$(EXE)
- checkasm: checkasm$(EXE)
+ .PHONY: x264 checkasm example
+@@ -184,8 +185,8 @@ checkasm: checkasm$(EXE)
+ example: example$(EXE)
  endif
  
 -x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
​

x264-snapshot-20141104-2245.tar.bz2/Makefile -> x264-snapshot-20141218-2245.tar.bz2/Makefile Changed

@@ -36,6 +36,8 @@
 
 OBJCHK = tools/checkasm.o
 
+OBJEXAMPLE = example.o
+
 CONFIG := $(shell cat config.h)
 
 # GPL-only files
@@ -176,9 +178,10 @@
 	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 
 ifneq ($(EXE),)
-.PHONY: x264 checkasm
+.PHONY: x264 checkasm example
 x264: x264$(EXE)
 checkasm: checkasm$(EXE)
+example: example$(EXE)
 endif
 
 x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
@@ -187,7 +190,10 @@
 checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
 	$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
 
-$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
+example$(EXE): $(GENERATED) .depend $(OBJEXAMPLE) $(LIBX264)
+	$(LD)$@ $(OBJEXAMPLE) $(LIBX264) $(LDFLAGS)
+
+$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK) $(OBJEXAMPLE): .depend
 
 %.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
 	$(AS) $(ASFLAGS) -o $@ $<
@@ -254,6 +260,7 @@
 clean:
 	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
 	rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
+	rm -f example example.exe $(OBJEXAMPLE)
 	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
 
 distclean: clean

 
@@ -36,6 +36,8 @@
 
 OBJCHK = tools/checkasm.o
 
+OBJEXAMPLE = example.o
+
 CONFIG := $(shell cat config.h)
 
 # GPL-only files
@@ -176,9 +178,10 @@
    $(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 
 ifneq ($(EXE),)
-.PHONY: x264 checkasm
+.PHONY: x264 checkasm example
 x264: x264$(EXE)
 checkasm: checkasm$(EXE)
+example: example$(EXE)
 endif
 
 x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
@@ -187,7 +190,10 @@
 checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
    $(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
 
-$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
+example$(EXE): $(GENERATED) .depend $(OBJEXAMPLE) $(LIBX264)
+   $(LD)$@ $(OBJEXAMPLE) $(LIBX264) $(LDFLAGS)
+
+$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK) $(OBJEXAMPLE): .depend
 
 %.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
    $(AS) $(ASFLAGS) -o $@ $<
@@ -254,6 +260,7 @@
 clean:
    rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
    rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
+   rm -f example example.exe $(OBJEXAMPLE)
    rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
 
 distclean: clean
​

x264-snapshot-20141104-2245.tar.bz2/common/common.c -> x264-snapshot-20141218-2245.tar.bz2/common/common.c Changed

@@ -517,7 +517,7 @@
 static int parse_enum( const char *arg, const char * const *names, int *dst )
 {
     for( int i = 0; names[i]; i++ )
-        if( !strcmp( arg, names[i] ) )
+        if( !strcasecmp( arg, names[i] ) )
         {
             *dst = i;
             return 0;
@@ -540,12 +540,12 @@
 static int x264_atobool( const char *str, int *b_error )
 {
     if( !strcmp(str, "1") ||
-        !strcmp(str, "true") ||
-        !strcmp(str, "yes") )
+        !strcasecmp(str, "true") ||
+        !strcasecmp(str, "yes") )
         return 1;
     if( !strcmp(str, "0") ||
-        !strcmp(str, "false") ||
-        !strcmp(str, "no") )
+        !strcasecmp(str, "false") ||
+        !strcasecmp(str, "no") )
         return 0;
     *b_error = 1;
     return 0;
@@ -614,7 +614,7 @@
     OPT("asm")
     {
         p->cpu = isdigit(value[0]) ? atoi(value) :
-                 !strcmp(value, "auto") || atobool(value) ? x264_cpu_detect() : 0;
+                 !strcasecmp(value, "auto") || atobool(value) ? x264_cpu_detect() : 0;
         if( b_error )
         {
             char *buf = strdup(value);
@@ -635,14 +635,14 @@
     }
     OPT("threads")
     {
-        if( !strcmp(value, "auto") )
+        if( !strcasecmp(value, "auto") )
             p->i_threads = X264_THREADS_AUTO;
         else
             p->i_threads = atoi(value);
     }
     OPT("lookahead-threads")
     {
-        if( !strcmp(value, "auto") )
+        if( !strcasecmp(value, "auto") )
             p->i_lookahead_threads = X264_THREADS_AUTO;
         else
             p->i_lookahead_threads = atoi(value);
@@ -651,7 +651,7 @@
         p->b_sliced_threads = atobool(value);
     OPT("sync-lookahead")
     {
-        if( !strcmp(value, "auto") )
+        if( !strcasecmp(value, "auto") )
             p->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
         else
             p->i_sync_lookahead = atoi(value);

 
@@ -517,7 +517,7 @@
 static int parse_enum( const char *arg, const char * const *names, int *dst )
 {
     for( int i = 0; names[i]; i++ )
-        if( !strcmp( arg, names[i] ) )
+        if( !strcasecmp( arg, names[i] ) )
         {
             *dst = i;
             return 0;
@@ -540,12 +540,12 @@
 static int x264_atobool( const char *str, int *b_error )
 {
     if( !strcmp(str, "1") ||
-        !strcmp(str, "true") ||
-        !strcmp(str, "yes") )
+        !strcasecmp(str, "true") ||
+        !strcasecmp(str, "yes") )
         return 1;
     if( !strcmp(str, "0") ||
-        !strcmp(str, "false") ||
-        !strcmp(str, "no") )
+        !strcasecmp(str, "false") ||
+        !strcasecmp(str, "no") )
         return 0;
     *b_error = 1;
     return 0;
@@ -614,7 +614,7 @@
     OPT("asm")
     {
         p->cpu = isdigit(value[0]) ? atoi(value) :
-                 !strcmp(value, "auto") || atobool(value) ? x264_cpu_detect() : 0;
+                 !strcasecmp(value, "auto") || atobool(value) ? x264_cpu_detect() : 0;
         if( b_error )
         {
             char *buf = strdup(value);
@@ -635,14 +635,14 @@
     }
     OPT("threads")
     {
-        if( !strcmp(value, "auto") )
+        if( !strcasecmp(value, "auto") )
             p->i_threads = X264_THREADS_AUTO;
         else
             p->i_threads = atoi(value);
     }
     OPT("lookahead-threads")
     {
-        if( !strcmp(value, "auto") )
+        if( !strcasecmp(value, "auto") )
             p->i_lookahead_threads = X264_THREADS_AUTO;
         else
             p->i_lookahead_threads = atoi(value);
@@ -651,7 +651,7 @@
         p->b_sliced_threads = atobool(value);
     OPT("sync-lookahead")
     {
-        if( !strcmp(value, "auto") )
+        if( !strcasecmp(value, "auto") )
             p->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
         else
             p->i_sync_lookahead = atoi(value);
​

x264-snapshot-20141104-2245.tar.bz2/common/dct.c -> x264-snapshot-20141218-2245.tar.bz2/common/dct.c Changed

 
@@ -611,7 +611,6 @@
     {
         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
-        dctf->dct4x4dc      = x264_dct4x4dc_mmx;
         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
 
@@ -630,6 +629,7 @@
 
     if( cpu&X264_CPU_MMX2 )
     {
+        dctf->dct4x4dc         = x264_dct4x4dc_mmx2;
         dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_mmx2;
         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
     }
​

x264-snapshot-20141104-2245.tar.bz2/common/osdep.h -> x264-snapshot-20141218-2245.tar.bz2/common/osdep.h Changed

 
@@ -142,8 +142,6 @@
 #define ALIGNED_ARRAY_N ALIGNED_ARRAY_16
 #endif
 
-#define UNINIT(x) x=x
-
 #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
 #define UNUSED __attribute__((unused))
 #define ALWAYS_INLINE __attribute__((always_inline)) inline
​

x264-snapshot-20141104-2245.tar.bz2/common/pixel.c -> x264-snapshot-20141218-2245.tar.bz2/common/pixel.c Changed

 
@@ -1040,6 +1040,7 @@
         INIT2_NAME( sad_aligned, sad, _avx2 );
         INIT2( sad_x3, _avx2 );
         INIT2( sad_x4, _avx2 );
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
         pixf->vsad = x264_pixel_vsad_avx2;
         pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
         pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2;
​

x264-snapshot-20141104-2245.tar.bz2/common/quant.c -> x264-snapshot-20141218-2245.tar.bz2/common/quant.c Changed

 
@@ -558,8 +558,6 @@
     if( cpu&X264_CPU_MMX )
     {
 #if ARCH_X86
-        pf->quant_4x4 = x264_quant_4x4_mmx;
-        pf->quant_8x8 = x264_quant_8x8_mmx;
         pf->dequant_4x4 = x264_dequant_4x4_mmx;
         pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
         pf->dequant_8x8 = x264_dequant_8x8_mmx;
@@ -576,6 +574,8 @@
     {
         pf->quant_2x2_dc = x264_quant_2x2_dc_mmx2;
 #if ARCH_X86
+        pf->quant_4x4 = x264_quant_4x4_mmx2;
+        pf->quant_8x8 = x264_quant_8x8_mmx2;
         pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
         pf->decimate_score15 = x264_decimate_score15_mmx2;
         pf->decimate_score16 = x264_decimate_score16_mmx2;
​

x264-snapshot-20141104-2245.tar.bz2/common/x86/dct-a.asm -> x264-snapshot-20141218-2245.tar.bz2/common/x86/dct-a.asm Changed

 
@@ -143,7 +143,7 @@
 DCT4x4_DC
 %else
 
-INIT_MMX mmx
+INIT_MMX mmx2
 cglobal dct4x4dc, 1,1
     movq   m3, [r0+24]
     movq   m2, [r0+16]
​

x264-snapshot-20141104-2245.tar.bz2/common/x86/dct.h -> x264-snapshot-20141218-2245.tar.bz2/common/x86/dct.h Changed

 
@@ -70,7 +70,7 @@
 void x264_add16x16_idct_dc_avx  ( pixel   *p_dst, dctcoef dct    [16] );
 void x264_add16x16_idct_dc_avx2 ( uint8_t *p_dst, int16_t dct    [16] );
 
-void x264_dct4x4dc_mmx       ( int16_t d[16] );
+void x264_dct4x4dc_mmx2      ( int16_t d[16] );
 void x264_dct4x4dc_sse2      ( int32_t d[16] );
 void x264_dct4x4dc_avx       ( int32_t d[16] );
 void x264_idct4x4dc_mmx      ( int16_t d[16] );
​

x264-snapshot-20141104-2245.tar.bz2/common/x86/pixel-a.asm -> x264-snapshot-20141218-2245.tar.bz2/common/x86/pixel-a.asm Changed

@@ -727,15 +727,11 @@
 %endmacro
 
 %macro VAR_END 2
-%if HIGH_BIT_DEPTH
-%if mmsize == 8 && %1*%2 == 256
+%if HIGH_BIT_DEPTH && mmsize == 8 && %1*%2 == 256
     HADDUW  m5, m2
 %else
     HADDW   m5, m2
 %endif
-%else ; !HIGH_BIT_DEPTH
-    HADDW   m5, m2
-%endif ; HIGH_BIT_DEPTH
     HADDD   m6, m1
 %if ARCH_X86_64
     punpckldq m5, m6
@@ -772,20 +768,17 @@
     mova      m4, [r0+%1+mmsize]
 %else ; !HIGH_BIT_DEPTH
     mova      m0, [r0]
-    punpckhbw m1, m0, m7
     mova      m3, [r0+%1]
-    mova      m4, m3
+    punpckhbw m1, m0, m7
     punpcklbw m0, m7
+    punpckhbw m4, m3, m7
+    punpcklbw m3, m7
 %endif ; HIGH_BIT_DEPTH
 %ifidn %1, r1
     lea       r0, [r0+%1*2]
 %else
     add       r0, r1
 %endif
-%if HIGH_BIT_DEPTH == 0
-    punpcklbw m3, m7
-    punpckhbw m4, m7
-%endif ; !HIGH_BIT_DEPTH
     VAR_CORE
     dec r2d
     jg .loop
@@ -900,17 +893,26 @@
 VAR
 INIT_XMM xop
 VAR
+%endif ; !HIGH_BIT_DEPTH
 
 INIT_YMM avx2
 cglobal pixel_var_16x16, 2,4,7
+    FIX_STRIDES r1
     VAR_START 0
     mov      r2d, 4
     lea       r3, [r1*3]
 .loop:
+%if HIGH_BIT_DEPTH
+    mova      m0, [r0]
+    mova      m3, [r0+r1]
+    mova      m1, [r0+r1*2]
+    mova      m4, [r0+r3]
+%else
     pmovzxbw  m0, [r0]
     pmovzxbw  m3, [r0+r1]
     pmovzxbw  m1, [r0+r1*2]
     pmovzxbw  m4, [r0+r3]
+%endif
     lea       r0, [r0+r1*4]
     VAR_CORE
     dec r2d
@@ -929,7 +931,6 @@
     movd   edx, xm6
 %endif
     RET
-%endif ; !HIGH_BIT_DEPTH
 
 %macro VAR2_END 3
     HADDW   %2, xm1
@@ -1600,7 +1601,7 @@
 %macro SATDS_SSE2 0
 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
 
-%if vertical==0 || HIGH_BIT_DEPTH
+%if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
 cglobal pixel_satd_4x4, 4, 6, 6
     SATD_START_MMX
     mova m4, [hmul_4p]

 
@@ -727,15 +727,11 @@
 %endmacro
 
 %macro VAR_END 2
-%if HIGH_BIT_DEPTH
-%if mmsize == 8 && %1*%2 == 256
+%if HIGH_BIT_DEPTH && mmsize == 8 && %1*%2 == 256
     HADDUW  m5, m2
 %else
     HADDW   m5, m2
 %endif
-%else ; !HIGH_BIT_DEPTH
-    HADDW   m5, m2
-%endif ; HIGH_BIT_DEPTH
     HADDD   m6, m1
 %if ARCH_X86_64
     punpckldq m5, m6
@@ -772,20 +768,17 @@
     mova      m4, [r0+%1+mmsize]
 %else ; !HIGH_BIT_DEPTH
     mova      m0, [r0]
-    punpckhbw m1, m0, m7
     mova      m3, [r0+%1]
-    mova      m4, m3
+    punpckhbw m1, m0, m7
     punpcklbw m0, m7
+    punpckhbw m4, m3, m7
+    punpcklbw m3, m7
 %endif ; HIGH_BIT_DEPTH
 %ifidn %1, r1
     lea       r0, [r0+%1*2]
 %else
     add       r0, r1
 %endif
-%if HIGH_BIT_DEPTH == 0
-    punpcklbw m3, m7
-    punpckhbw m4, m7
-%endif ; !HIGH_BIT_DEPTH
     VAR_CORE
     dec r2d
     jg .loop
@@ -900,17 +893,26 @@
 VAR
 INIT_XMM xop
 VAR
+%endif ; !HIGH_BIT_DEPTH
 
 INIT_YMM avx2
 cglobal pixel_var_16x16, 2,4,7
+    FIX_STRIDES r1
     VAR_START 0
     mov      r2d, 4
     lea       r3, [r1*3]
 .loop:
+%if HIGH_BIT_DEPTH
+    mova      m0, [r0]
+    mova      m3, [r0+r1]
+    mova      m1, [r0+r1*2]
+    mova      m4, [r0+r3]
+%else
     pmovzxbw  m0, [r0]
     pmovzxbw  m3, [r0+r1]
     pmovzxbw  m1, [r0+r1*2]
     pmovzxbw  m4, [r0+r3]
+%endif
     lea       r0, [r0+r1*4]
     VAR_CORE
     dec r2d
@@ -929,7 +931,6 @@
     movd   edx, xm6
 %endif
     RET
-%endif ; !HIGH_BIT_DEPTH
 
 %macro VAR2_END 3
     HADDW   %2, xm1
@@ -1600,7 +1601,7 @@
 %macro SATDS_SSE2 0
 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
 
-%if vertical==0 || HIGH_BIT_DEPTH
+%if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
 cglobal pixel_satd_4x4, 4, 6, 6
     SATD_START_MMX
     mova m4, [hmul_4p]
​

x264-snapshot-20141104-2245.tar.bz2/common/x86/predict-c.c -> x264-snapshot-20141218-2245.tar.bz2/common/x86/predict-c.c Changed

 
@@ -65,12 +65,17 @@
     H += i * ( src[j+i - FDEC_STRIDE ]  - src[j-i - FDEC_STRIDE ] );\
     V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );
 
+#if HAVE_X86_INLINE_ASM
+#if HIGH_BIT_DEPTH
 ALIGNED_16( static const int16_t pw_12345678[8] ) = {1,2,3,4,5,6,7,8};
 ALIGNED_16( static const int16_t pw_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
 ALIGNED_16( static const int16_t pw_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
+#else // !HIGH_BIT_DEPTH
 ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
 ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
 ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
+#endif // HIGH_BIT_DEPTH
+#endif // HAVE_X86_INLINE_ASM
 
 #define PREDICT_16x16_P_CORE\
     int H = 0;\
​

x264-snapshot-20141104-2245.tar.bz2/common/x86/quant-a.asm -> x264-snapshot-20141218-2245.tar.bz2/common/x86/quant-a.asm Changed

 
@@ -453,7 +453,7 @@
 QUANT_DC quant_2x2_dc, 1
 %if ARCH_X86_64 == 0 ; not needed because sse2 is faster
 QUANT_DC quant_4x4_dc, 4
-INIT_MMX mmx
+INIT_MMX mmx2
 QUANT_AC quant_4x4, 4
 QUANT_AC quant_8x8, 16
 %endif
​

x264-snapshot-20141104-2245.tar.bz2/common/x86/quant.h -> x264-snapshot-20141218-2245.tar.bz2/common/x86/quant.h Changed

 
@@ -30,8 +30,8 @@
 
 int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias );
 int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias );
-int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
+int x264_quant_4x4_mmx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
+int x264_quant_8x8_mmx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
 int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
​

x264-snapshot-20141104-2245.tar.bz2/common/x86/x86inc.asm -> x264-snapshot-20141218-2245.tar.bz2/common/x86/x86inc.asm Changed

@@ -1044,15 +1044,16 @@
 %endmacro
 
 ;%1 == instruction
-;%2 == 1 if float, 0 if int
-;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
-;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
-;%5+: operands
-%macro RUN_AVX_INSTR 5-8+
-    %ifnum sizeof%6
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+;%6+: operands
+%macro RUN_AVX_INSTR 6-9+
+    %ifnum sizeof%7
+        %assign __sizeofreg sizeof%7
+    %elifnum sizeof%6
         %assign __sizeofreg sizeof%6
-    %elifnum sizeof%5
-        %assign __sizeofreg sizeof%5
     %else
         %assign __sizeofreg mmsize
     %endif
@@ -1061,325 +1062,333 @@
         %xdefine __instr v%1
     %else
         %xdefine __instr %1
-        %if %0 >= 7+%3
+        %if %0 >= 8+%4
             %assign __emulate_avx 1
         %endif
     %endif
+    %ifnidn %2, fnord
+        %ifdef cpuname
+            %if notcpuflag(%2)
+                %error use of ``%1'' %2 instruction in cpuname function: current_function
+            %endif
+        %endif
+    %endif
 
     %if __emulate_avx
-        %xdefine __src1 %6
-        %xdefine __src2 %7
-        %ifnidn %5, %6
-            %if %0 >= 8
-                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8
+        %xdefine __src1 %7
+        %xdefine __src2 %8
+        %ifnidn %6, %7
+            %if %0 >= 9
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9
             %else
-                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8
             %endif
-            %if %4 && %3 == 0
-                %ifnid %7
+            %if %5 && %4 == 0
+                %ifnid %8
                     ; 3-operand AVX instructions with a memory arg can only have it in src2,
                     ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                     ; So, if the instruction is commutative with a memory arg, swap them.
-                    %xdefine __src1 %7
-                    %xdefine __src2 %6
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
                 %endif
             %endif
             %if __sizeofreg == 8
-                MOVQ %5, __src1
-            %elif %2
-                MOVAPS %5, __src1
+                MOVQ %6, __src1
+            %elif %3
+                MOVAPS %6, __src1
             %else
-                MOVDQA %5, __src1
+                MOVDQA %6, __src1
             %endif
         %endif
-        %if %0 >= 8
-            %1 %5, __src2, %8
+        %if %0 >= 9
+            %1 %6, __src2, %9
         %else
-            %1 %5, __src2
+            %1 %6, __src2
         %endif
-    %elif %0 >= 8
-        __instr %5, %6, %7, %8
+    %elif %0 >= 9
+        __instr %6, %7, %8, %9
+    %elif %0 == 8
+        __instr %6, %7, %8
     %elif %0 == 7
-        __instr %5, %6, %7
-    %elif %0 == 6
-        __instr %5, %6
+        __instr %6, %7
     %else
-        __instr %5
+        __instr %6
     %endif
 %endmacro
 
 ;%1 == instruction
-;%2 == 1 if float, 0 if int
-;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
-;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
-%macro AVX_INSTR 1-4 0, 1, 0
-    %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-5 fnord, 0, 1, 0
+    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
         %ifidn %2, fnord
-            RUN_AVX_INSTR %6, %7, %8, %9, %1
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
         %elifidn %3, fnord
-            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
         %elifidn %4, fnord
-            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
         %elifidn %5, fnord
-            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
         %else
-            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
         %endif
     %endmacro
 %endmacro
 
 ; Instructions with both VEX and non-VEX encodings
 ; Non-destructive instructions are written without parameters
-AVX_INSTR addpd, 1, 0, 1
-AVX_INSTR addps, 1, 0, 1
-AVX_INSTR addsd, 1, 0, 1
-AVX_INSTR addss, 1, 0, 1
-AVX_INSTR addsubpd, 1, 0, 0
-AVX_INSTR addsubps, 1, 0, 0
-AVX_INSTR aesdec, 0, 0, 0
-AVX_INSTR aesdeclast, 0, 0, 0
-AVX_INSTR aesenc, 0, 0, 0
-AVX_INSTR aesenclast, 0, 0, 0
+AVX_INSTR addpd, sse2, 1, 0, 1
+AVX_INSTR addps, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 1
+AVX_INSTR addss, sse, 1, 0, 1
+AVX_INSTR addsubpd, sse3, 1, 0, 0
+AVX_INSTR addsubps, sse3, 1, 0, 0
+AVX_INSTR aesdec, fnord, 0, 0, 0
+AVX_INSTR aesdeclast, fnord, 0, 0, 0
+AVX_INSTR aesenc, fnord, 0, 0, 0
+AVX_INSTR aesenclast, fnord, 0, 0, 0
 AVX_INSTR aesimc
 AVX_INSTR aeskeygenassist
-AVX_INSTR andnpd, 1, 0, 0
-AVX_INSTR andnps, 1, 0, 0
-AVX_INSTR andpd, 1, 0, 1
-AVX_INSTR andps, 1, 0, 1
-AVX_INSTR blendpd, 1, 0, 0
-AVX_INSTR blendps, 1, 0, 0
-AVX_INSTR blendvpd, 1, 0, 0
-AVX_INSTR blendvps, 1, 0, 0
-AVX_INSTR cmppd, 1, 1, 0
-AVX_INSTR cmpps, 1, 1, 0
-AVX_INSTR cmpsd, 1, 1, 0
-AVX_INSTR cmpss, 1, 1, 0
-AVX_INSTR comisd
-AVX_INSTR comiss
-AVX_INSTR cvtdq2pd
-AVX_INSTR cvtdq2ps
-AVX_INSTR cvtpd2dq
-AVX_INSTR cvtpd2ps
-AVX_INSTR cvtps2dq
-AVX_INSTR cvtps2pd
-AVX_INSTR cvtsd2si
-AVX_INSTR cvtsd2ss
-AVX_INSTR cvtsi2sd
-AVX_INSTR cvtsi2ss
-AVX_INSTR cvtss2sd
-AVX_INSTR cvtss2si
-AVX_INSTR cvttpd2dq
-AVX_INSTR cvttps2dq
-AVX_INSTR cvttsd2si
-AVX_INSTR cvttss2si
-AVX_INSTR divpd, 1, 0, 0
-AVX_INSTR divps, 1, 0, 0
-AVX_INSTR divsd, 1, 0, 0
-AVX_INSTR divss, 1, 0, 0
-AVX_INSTR dppd, 1, 1, 0
-AVX_INSTR dpps, 1, 1, 0
-AVX_INSTR extractps
-AVX_INSTR haddpd, 1, 0, 0
-AVX_INSTR haddps, 1, 0, 0
-AVX_INSTR hsubpd, 1, 0, 0
-AVX_INSTR hsubps, 1, 0, 0
-AVX_INSTR insertps, 1, 1, 0
-AVX_INSTR lddqu
-AVX_INSTR ldmxcsr
-AVX_INSTR maskmovdqu
-AVX_INSTR maxpd, 1, 0, 1
-AVX_INSTR maxps, 1, 0, 1
-AVX_INSTR maxsd, 1, 0, 1
-AVX_INSTR maxss, 1, 0, 1
-AVX_INSTR minpd, 1, 0, 1
-AVX_INSTR minps, 1, 0, 1
-AVX_INSTR minsd, 1, 0, 1
-AVX_INSTR minss, 1, 0, 1
-AVX_INSTR movapd
-AVX_INSTR movaps
+AVX_INSTR andnpd, sse2, 1, 0, 0
+AVX_INSTR andnps, sse, 1, 0, 0
+AVX_INSTR andpd, sse2, 1, 0, 1
+AVX_INSTR andps, sse, 1, 0, 1
+AVX_INSTR blendpd, sse4, 1, 0, 0
+AVX_INSTR blendps, sse4, 1, 0, 0
+AVX_INSTR blendvpd, sse4, 1, 0, 0
+AVX_INSTR blendvps, sse4, 1, 0, 0
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR comisd, sse2
+AVX_INSTR comiss, sse
+AVX_INSTR cvtdq2pd, sse2
+AVX_INSTR cvtdq2ps, sse2
+AVX_INSTR cvtpd2dq, sse2
+AVX_INSTR cvtpd2ps, sse2
+AVX_INSTR cvtps2dq, sse2
+AVX_INSTR cvtps2pd, sse2
+AVX_INSTR cvtsd2si, sse2
+AVX_INSTR cvtsd2ss, sse2
+AVX_INSTR cvtsi2sd, sse2
+AVX_INSTR cvtsi2ss, sse
+AVX_INSTR cvtss2sd, sse2
+AVX_INSTR cvtss2si, sse
+AVX_INSTR cvttpd2dq, sse2
+AVX_INSTR cvttps2dq, sse2
+AVX_INSTR cvttsd2si, sse2
+AVX_INSTR cvttss2si, sse
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR extractps, sse4
+AVX_INSTR haddpd, sse3, 1, 0, 0
+AVX_INSTR haddps, sse3, 1, 0, 0
+AVX_INSTR hsubpd, sse3, 1, 0, 0
+AVX_INSTR hsubps, sse3, 1, 0, 0
+AVX_INSTR insertps, sse4, 1, 1, 0
+AVX_INSTR lddqu, sse3
+AVX_INSTR ldmxcsr, sse
+AVX_INSTR maskmovdqu, sse2
+AVX_INSTR maxpd, sse2, 1, 0, 1
+AVX_INSTR maxps, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 1
+AVX_INSTR maxss, sse, 1, 0, 1
+AVX_INSTR minpd, sse2, 1, 0, 1
+AVX_INSTR minps, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 1
+AVX_INSTR minss, sse, 1, 0, 1
+AVX_INSTR movapd, sse2
+AVX_INSTR movaps, sse
 AVX_INSTR movd
-AVX_INSTR movddup
-AVX_INSTR movdqa
-AVX_INSTR movdqu
-AVX_INSTR movhlps, 1, 0, 0
-AVX_INSTR movhpd, 1, 0, 0
-AVX_INSTR movhps, 1, 0, 0
-AVX_INSTR movlhps, 1, 0, 0
-AVX_INSTR movlpd, 1, 0, 0
-AVX_INSTR movlps, 1, 0, 0
-AVX_INSTR movmskpd
-AVX_INSTR movmskps
-AVX_INSTR movntdq
-AVX_INSTR movntdqa
-AVX_INSTR movntpd
-AVX_INSTR movntps
+AVX_INSTR movddup, sse3
+AVX_INSTR movdqa, sse2
+AVX_INSTR movdqu, sse2
+AVX_INSTR movhlps, sse, 1, 0, 0
+AVX_INSTR movhpd, sse2, 1, 0, 0
+AVX_INSTR movhps, sse, 1, 0, 0
+AVX_INSTR movlhps, sse, 1, 0, 0
+AVX_INSTR movlpd, sse2, 1, 0, 0
+AVX_INSTR movlps, sse, 1, 0, 0
+AVX_INSTR movmskpd, sse2
+AVX_INSTR movmskps, sse
+AVX_INSTR movntdq, sse2
+AVX_INSTR movntdqa, sse4
+AVX_INSTR movntpd, sse2
+AVX_INSTR movntps, sse
 AVX_INSTR movq
-AVX_INSTR movsd, 1, 0, 0
-AVX_INSTR movshdup
-AVX_INSTR movsldup
-AVX_INSTR movss, 1, 0, 0
-AVX_INSTR movupd
-AVX_INSTR movups
-AVX_INSTR mpsadbw, 0, 1, 0
-AVX_INSTR mulpd, 1, 0, 1
-AVX_INSTR mulps, 1, 0, 1
-AVX_INSTR mulsd, 1, 0, 1
-AVX_INSTR mulss, 1, 0, 1
-AVX_INSTR orpd, 1, 0, 1
-AVX_INSTR orps, 1, 0, 1
-AVX_INSTR pabsb
-AVX_INSTR pabsd
-AVX_INSTR pabsw
-AVX_INSTR packsswb, 0, 0, 0
-AVX_INSTR packssdw, 0, 0, 0
-AVX_INSTR packuswb, 0, 0, 0
-AVX_INSTR packusdw, 0, 0, 0
-AVX_INSTR paddb, 0, 0, 1
-AVX_INSTR paddw, 0, 0, 1
-AVX_INSTR paddd, 0, 0, 1
-AVX_INSTR paddq, 0, 0, 1
-AVX_INSTR paddsb, 0, 0, 1
-AVX_INSTR paddsw, 0, 0, 1
-AVX_INSTR paddusb, 0, 0, 1
-AVX_INSTR paddusw, 0, 0, 1
-AVX_INSTR palignr, 0, 1, 0
-AVX_INSTR pand, 0, 0, 1
-AVX_INSTR pandn, 0, 0, 0
-AVX_INSTR pavgb, 0, 0, 1
-AVX_INSTR pavgw, 0, 0, 1
-AVX_INSTR pblendvb, 0, 0, 0
-AVX_INSTR pblendw, 0, 1, 0
-AVX_INSTR pclmulqdq, 0, 1, 0
-AVX_INSTR pcmpestri
-AVX_INSTR pcmpestrm
-AVX_INSTR pcmpistri
-AVX_INSTR pcmpistrm
-AVX_INSTR pcmpeqb, 0, 0, 1
-AVX_INSTR pcmpeqw, 0, 0, 1
-AVX_INSTR pcmpeqd, 0, 0, 1
-AVX_INSTR pcmpeqq, 0, 0, 1
-AVX_INSTR pcmpgtb, 0, 0, 0
-AVX_INSTR pcmpgtw, 0, 0, 0
-AVX_INSTR pcmpgtd, 0, 0, 0
-AVX_INSTR pcmpgtq, 0, 0, 0
-AVX_INSTR pextrb
-AVX_INSTR pextrd
-AVX_INSTR pextrq
-AVX_INSTR pextrw
-AVX_INSTR phaddw, 0, 0, 0
-AVX_INSTR phaddd, 0, 0, 0
-AVX_INSTR phaddsw, 0, 0, 0
-AVX_INSTR phminposuw
-AVX_INSTR phsubw, 0, 0, 0
-AVX_INSTR phsubd, 0, 0, 0
-AVX_INSTR phsubsw, 0, 0, 0
-AVX_INSTR pinsrb, 0, 1, 0
-AVX_INSTR pinsrd, 0, 1, 0
-AVX_INSTR pinsrq, 0, 1, 0
-AVX_INSTR pinsrw, 0, 1, 0
-AVX_INSTR pmaddwd, 0, 0, 1
-AVX_INSTR pmaddubsw, 0, 0, 0
-AVX_INSTR pmaxsb, 0, 0, 1
-AVX_INSTR pmaxsw, 0, 0, 1
-AVX_INSTR pmaxsd, 0, 0, 1
-AVX_INSTR pmaxub, 0, 0, 1
-AVX_INSTR pmaxuw, 0, 0, 1
-AVX_INSTR pmaxud, 0, 0, 1
-AVX_INSTR pminsb, 0, 0, 1
-AVX_INSTR pminsw, 0, 0, 1
-AVX_INSTR pminsd, 0, 0, 1
-AVX_INSTR pminub, 0, 0, 1
-AVX_INSTR pminuw, 0, 0, 1
-AVX_INSTR pminud, 0, 0, 1
-AVX_INSTR pmovmskb
-AVX_INSTR pmovsxbw
-AVX_INSTR pmovsxbd
-AVX_INSTR pmovsxbq
-AVX_INSTR pmovsxwd
-AVX_INSTR pmovsxwq
-AVX_INSTR pmovsxdq
-AVX_INSTR pmovzxbw
-AVX_INSTR pmovzxbd
-AVX_INSTR pmovzxbq
-AVX_INSTR pmovzxwd
-AVX_INSTR pmovzxwq
-AVX_INSTR pmovzxdq
-AVX_INSTR pmuldq, 0, 0, 1
-AVX_INSTR pmulhrsw, 0, 0, 1
-AVX_INSTR pmulhuw, 0, 0, 1
-AVX_INSTR pmulhw, 0, 0, 1
-AVX_INSTR pmullw, 0, 0, 1
-AVX_INSTR pmulld, 0, 0, 1
-AVX_INSTR pmuludq, 0, 0, 1
-AVX_INSTR por, 0, 0, 1
-AVX_INSTR psadbw, 0, 0, 1
-AVX_INSTR pshufb, 0, 0, 0
-AVX_INSTR pshufd
-AVX_INSTR pshufhw
-AVX_INSTR pshuflw
-AVX_INSTR psignb, 0, 0, 0
-AVX_INSTR psignw, 0, 0, 0
-AVX_INSTR psignd, 0, 0, 0
-AVX_INSTR psllw, 0, 0, 0
-AVX_INSTR pslld, 0, 0, 0
-AVX_INSTR psllq, 0, 0, 0
-AVX_INSTR pslldq, 0, 0, 0
-AVX_INSTR psraw, 0, 0, 0
-AVX_INSTR psrad, 0, 0, 0
-AVX_INSTR psrlw, 0, 0, 0
-AVX_INSTR psrld, 0, 0, 0
-AVX_INSTR psrlq, 0, 0, 0
-AVX_INSTR psrldq, 0, 0, 0
-AVX_INSTR psubb, 0, 0, 0
-AVX_INSTR psubw, 0, 0, 0
-AVX_INSTR psubd, 0, 0, 0
-AVX_INSTR psubq, 0, 0, 0
-AVX_INSTR psubsb, 0, 0, 0
-AVX_INSTR psubsw, 0, 0, 0
-AVX_INSTR psubusb, 0, 0, 0
-AVX_INSTR psubusw, 0, 0, 0
-AVX_INSTR ptest
-AVX_INSTR punpckhbw, 0, 0, 0
-AVX_INSTR punpckhwd, 0, 0, 0
-AVX_INSTR punpckhdq, 0, 0, 0
-AVX_INSTR punpckhqdq, 0, 0, 0
-AVX_INSTR punpcklbw, 0, 0, 0
-AVX_INSTR punpcklwd, 0, 0, 0
-AVX_INSTR punpckldq, 0, 0, 0
-AVX_INSTR punpcklqdq, 0, 0, 0
-AVX_INSTR pxor, 0, 0, 1
-AVX_INSTR rcpps, 1, 0, 0
-AVX_INSTR rcpss, 1, 0, 0
-AVX_INSTR roundpd
-AVX_INSTR roundps
-AVX_INSTR roundsd
-AVX_INSTR roundss
-AVX_INSTR rsqrtps, 1, 0, 0
-AVX_INSTR rsqrtss, 1, 0, 0
-AVX_INSTR shufpd, 1, 1, 0
-AVX_INSTR shufps, 1, 1, 0
-AVX_INSTR sqrtpd, 1, 0, 0
-AVX_INSTR sqrtps, 1, 0, 0
-AVX_INSTR sqrtsd, 1, 0, 0
-AVX_INSTR sqrtss, 1, 0, 0
-AVX_INSTR stmxcsr
-AVX_INSTR subpd, 1, 0, 0
-AVX_INSTR subps, 1, 0, 0
-AVX_INSTR subsd, 1, 0, 0
-AVX_INSTR subss, 1, 0, 0
-AVX_INSTR ucomisd
-AVX_INSTR ucomiss
-AVX_INSTR unpckhpd, 1, 0, 0
-AVX_INSTR unpckhps, 1, 0, 0
-AVX_INSTR unpcklpd, 1, 0, 0
-AVX_INSTR unpcklps, 1, 0, 0
-AVX_INSTR xorpd, 1, 0, 1
-AVX_INSTR xorps, 1, 0, 1
+AVX_INSTR movsd, sse2, 1, 0, 0
+AVX_INSTR movshdup, sse3
+AVX_INSTR movsldup, sse3
+AVX_INSTR movss, sse, 1, 0, 0
+AVX_INSTR movupd, sse2
+AVX_INSTR movups, sse
+AVX_INSTR mpsadbw, sse4
+AVX_INSTR mulpd, sse2, 1, 0, 1
+AVX_INSTR mulps, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 1
+AVX_INSTR mulss, sse, 1, 0, 1
+AVX_INSTR orpd, sse2, 1, 0, 1
+AVX_INSTR orps, sse, 1, 0, 1
+AVX_INSTR pabsb, ssse3
+AVX_INSTR pabsd, ssse3
+AVX_INSTR pabsw, ssse3
+AVX_INSTR packsswb, mmx, 0, 0, 0
+AVX_INSTR packssdw, mmx, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR paddb, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
+AVX_INSTR paddd, mmx, 0, 0, 1
+AVX_INSTR paddq, sse2, 0, 0, 1
+AVX_INSTR paddsb, mmx, 0, 0, 1
+AVX_INSTR paddsw, mmx, 0, 0, 1
+AVX_INSTR paddusb, mmx, 0, 0, 1
+AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR palignr, ssse3
+AVX_INSTR pand, mmx, 0, 0, 1
+AVX_INSTR pandn, mmx, 0, 0, 0
+AVX_INSTR pavgb, mmx2, 0, 0, 1
+AVX_INSTR pavgw, mmx2, 0, 0, 1
+AVX_INSTR pblendvb, sse4, 0, 0, 0
+AVX_INSTR pblendw, sse4
+AVX_INSTR pclmulqdq
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pcmpeqb, mmx, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpeqd, mmx, 0, 0, 1
+AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpgtb, mmx, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpgtd, mmx, 0, 0, 0
+AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pextrb, sse4
+AVX_INSTR pextrd, sse4
+AVX_INSTR pextrq, sse4
+AVX_INSTR pextrw, mmx2
+AVX_INSTR phaddw, ssse3, 0, 0, 0
+AVX_INSTR phaddd, ssse3, 0, 0, 0
+AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phminposuw, sse4
+AVX_INSTR phsubw, ssse3, 0, 0, 0
+AVX_INSTR phsubd, ssse3, 0, 0, 0
+AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR pinsrb, sse4
+AVX_INSTR pinsrd, sse4
+AVX_INSTR pinsrq, sse4
+AVX_INSTR pinsrw, mmx2
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
+AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaxsb, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
+AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxub, mmx2, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
+AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pminsb, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
+AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminub, mmx2, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
+AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pmovmskb, mmx2
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxbd, sse4
+AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxwd, sse4
+AVX_INSTR pmovsxwq, sse4
+AVX_INSTR pmovsxdq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxbd, sse4
+AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxwd, sse4
+AVX_INSTR pmovzxwq, sse4
+AVX_INSTR pmovzxdq, sse4
+AVX_INSTR pmuldq, sse4, 0, 0, 1
+AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
+AVX_INSTR pmulhuw, mmx2, 0, 0, 1
+AVX_INSTR pmulhw, mmx, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
+AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmuludq, sse2, 0, 0, 1
+AVX_INSTR por, mmx, 0, 0, 1
+AVX_INSTR psadbw, mmx2, 0, 0, 1
+AVX_INSTR pshufb, ssse3, 0, 0, 0
+AVX_INSTR pshufd, sse2
+AVX_INSTR pshufhw, sse2
+AVX_INSTR pshuflw, sse2
+AVX_INSTR psignb, ssse3, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
+AVX_INSTR psignd, ssse3, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR pslld, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR pslldq, sse2, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psrad, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psrld, mmx, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psubb, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
+AVX_INSTR psubd, mmx, 0, 0, 0
+AVX_INSTR psubq, sse2, 0, 0, 0
+AVX_INSTR psubsb, mmx, 0, 0, 0
+AVX_INSTR psubsw, mmx, 0, 0, 0
+AVX_INSTR psubusb, mmx, 0, 0, 0
+AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR ptest, sse4
+AVX_INSTR punpckhbw, mmx, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
+AVX_INSTR punpckhdq, mmx, 0, 0, 0
+AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklbw, mmx, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
+AVX_INSTR punpckldq, mmx, 0, 0, 0
+AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR pxor, mmx, 0, 0, 1
+AVX_INSTR rcpps, sse, 1, 0, 0
+AVX_INSTR rcpss, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4
+AVX_INSTR roundps, sse4
+AVX_INSTR roundsd, sse4
+AVX_INSTR roundss, sse4
+AVX_INSTR rsqrtps, sse, 1, 0, 0
+AVX_INSTR rsqrtss, sse, 1, 0, 0
+AVX_INSTR shufpd, sse2, 1, 1, 0
+AVX_INSTR shufps, sse, 1, 1, 0
+AVX_INSTR sqrtpd, sse2, 1, 0, 0
+AVX_INSTR sqrtps, sse, 1, 0, 0
+AVX_INSTR sqrtsd, sse2, 1, 0, 0
+AVX_INSTR sqrtss, sse, 1, 0, 0
+AVX_INSTR stmxcsr, sse
+AVX_INSTR subpd, sse2, 1, 0, 0
+AVX_INSTR subps, sse, 1, 0, 0
+AVX_INSTR subsd, sse2, 1, 0, 0
+AVX_INSTR subss, sse, 1, 0, 0
+AVX_INSTR ucomisd, sse2
+AVX_INSTR ucomiss, sse
+AVX_INSTR unpckhpd, sse2, 1, 0, 0
+AVX_INSTR unpckhps, sse, 1, 0, 0
+AVX_INSTR unpcklpd, sse2, 1, 0, 0
+AVX_INSTR unpcklps, sse, 1, 0, 0
+AVX_INSTR xorpd, sse2, 1, 0, 1
+AVX_INSTR xorps, sse, 1, 0, 1
 
 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN
-AVX_INSTR pfadd, 1, 0, 1
-AVX_INSTR pfsub, 1, 0, 0
-AVX_INSTR pfmul, 1, 0, 1
+AVX_INSTR pfadd, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
+AVX_INSTR pfmul, 3dnow, 1, 0, 1
 
 ; base-4 constants for shuffles
 %assign i 0

 
@@ -1044,15 +1044,16 @@
 %endmacro
 
 ;%1 == instruction
-;%2 == 1 if float, 0 if int
-;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
-;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
-;%5+: operands
-%macro RUN_AVX_INSTR 5-8+
-    %ifnum sizeof%6
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+;%6+: operands
+%macro RUN_AVX_INSTR 6-9+
+    %ifnum sizeof%7
+        %assign __sizeofreg sizeof%7
+    %elifnum sizeof%6
         %assign __sizeofreg sizeof%6
-    %elifnum sizeof%5
-        %assign __sizeofreg sizeof%5
     %else
         %assign __sizeofreg mmsize
     %endif
@@ -1061,325 +1062,333 @@
         %xdefine __instr v%1
     %else
         %xdefine __instr %1
-        %if %0 >= 7+%3
+        %if %0 >= 8+%4
             %assign __emulate_avx 1
         %endif
     %endif
+    %ifnidn %2, fnord
+        %ifdef cpuname
+            %if notcpuflag(%2)
+                %error use of ``%1'' %2 instruction in cpuname function: current_function
+            %endif
+        %endif
+    %endif
 
     %if __emulate_avx
-        %xdefine __src1 %6
-        %xdefine __src2 %7
-        %ifnidn %5, %6
-            %if %0 >= 8
-                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8
+        %xdefine __src1 %7
+        %xdefine __src2 %8
+        %ifnidn %6, %7
+            %if %0 >= 9
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9
             %else
-                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8
             %endif
-            %if %4 && %3 == 0
-                %ifnid %7
+            %if %5 && %4 == 0
+                %ifnid %8
                     ; 3-operand AVX instructions with a memory arg can only have it in src2,
                     ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                     ; So, if the instruction is commutative with a memory arg, swap them.
-                    %xdefine __src1 %7
-                    %xdefine __src2 %6
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
                 %endif
             %endif
             %if __sizeofreg == 8
-                MOVQ %5, __src1
-            %elif %2
-                MOVAPS %5, __src1
+                MOVQ %6, __src1
+            %elif %3
+                MOVAPS %6, __src1
             %else
-                MOVDQA %5, __src1
+                MOVDQA %6, __src1
             %endif
         %endif
-        %if %0 >= 8
-            %1 %5, __src2, %8
+        %if %0 >= 9
+            %1 %6, __src2, %9
         %else
-            %1 %5, __src2
+            %1 %6, __src2
         %endif
-    %elif %0 >= 8
-        __instr %5, %6, %7, %8
+    %elif %0 >= 9
+        __instr %6, %7, %8, %9
+    %elif %0 == 8
+        __instr %6, %7, %8
     %elif %0 == 7
-        __instr %5, %6, %7
-    %elif %0 == 6
-        __instr %5, %6
+        __instr %6, %7
     %else
-        __instr %5
+        __instr %6
     %endif
 %endmacro
 
 ;%1 == instruction
-;%2 == 1 if float, 0 if int
-;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
-;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
-%macro AVX_INSTR 1-4 0, 1, 0
-    %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-5 fnord, 0, 1, 0
+    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
         %ifidn %2, fnord
-            RUN_AVX_INSTR %6, %7, %8, %9, %1
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
         %elifidn %3, fnord
-            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
         %elifidn %4, fnord
-            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
         %elifidn %5, fnord
-            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
         %else
-            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
         %endif
     %endmacro
 %endmacro
 
 ; Instructions with both VEX and non-VEX encodings
 ; Non-destructive instructions are written without parameters
-AVX_INSTR addpd, 1, 0, 1
-AVX_INSTR addps, 1, 0, 1
-AVX_INSTR addsd, 1, 0, 1
-AVX_INSTR addss, 1, 0, 1
-AVX_INSTR addsubpd, 1, 0, 0
-AVX_INSTR addsubps, 1, 0, 0
-AVX_INSTR aesdec, 0, 0, 0
-AVX_INSTR aesdeclast, 0, 0, 0
-AVX_INSTR aesenc, 0, 0, 0
-AVX_INSTR aesenclast, 0, 0, 0
+AVX_INSTR addpd, sse2, 1, 0, 1
+AVX_INSTR addps, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 1
+AVX_INSTR addss, sse, 1, 0, 1
+AVX_INSTR addsubpd, sse3, 1, 0, 0
+AVX_INSTR addsubps, sse3, 1, 0, 0
+AVX_INSTR aesdec, fnord, 0, 0, 0
+AVX_INSTR aesdeclast, fnord, 0, 0, 0
+AVX_INSTR aesenc, fnord, 0, 0, 0
+AVX_INSTR aesenclast, fnord, 0, 0, 0
 AVX_INSTR aesimc
 AVX_INSTR aeskeygenassist
-AVX_INSTR andnpd, 1, 0, 0
-AVX_INSTR andnps, 1, 0, 0
-AVX_INSTR andpd, 1, 0, 1
-AVX_INSTR andps, 1, 0, 1
-AVX_INSTR blendpd, 1, 0, 0
-AVX_INSTR blendps, 1, 0, 0
-AVX_INSTR blendvpd, 1, 0, 0
-AVX_INSTR blendvps, 1, 0, 0
-AVX_INSTR cmppd, 1, 1, 0
-AVX_INSTR cmpps, 1, 1, 0
-AVX_INSTR cmpsd, 1, 1, 0
-AVX_INSTR cmpss, 1, 1, 0
-AVX_INSTR comisd
-AVX_INSTR comiss
-AVX_INSTR cvtdq2pd
-AVX_INSTR cvtdq2ps
-AVX_INSTR cvtpd2dq
-AVX_INSTR cvtpd2ps
-AVX_INSTR cvtps2dq
-AVX_INSTR cvtps2pd
-AVX_INSTR cvtsd2si
-AVX_INSTR cvtsd2ss
-AVX_INSTR cvtsi2sd
-AVX_INSTR cvtsi2ss
-AVX_INSTR cvtss2sd
-AVX_INSTR cvtss2si
-AVX_INSTR cvttpd2dq
-AVX_INSTR cvttps2dq
-AVX_INSTR cvttsd2si
-AVX_INSTR cvttss2si
-AVX_INSTR divpd, 1, 0, 0
-AVX_INSTR divps, 1, 0, 0
-AVX_INSTR divsd, 1, 0, 0
-AVX_INSTR divss, 1, 0, 0
-AVX_INSTR dppd, 1, 1, 0
-AVX_INSTR dpps, 1, 1, 0
-AVX_INSTR extractps
-AVX_INSTR haddpd, 1, 0, 0
-AVX_INSTR haddps, 1, 0, 0
-AVX_INSTR hsubpd, 1, 0, 0
-AVX_INSTR hsubps, 1, 0, 0
-AVX_INSTR insertps, 1, 1, 0
-AVX_INSTR lddqu
-AVX_INSTR ldmxcsr
-AVX_INSTR maskmovdqu
-AVX_INSTR maxpd, 1, 0, 1
-AVX_INSTR maxps, 1, 0, 1
-AVX_INSTR maxsd, 1, 0, 1
-AVX_INSTR maxss, 1, 0, 1
-AVX_INSTR minpd, 1, 0, 1
-AVX_INSTR minps, 1, 0, 1
-AVX_INSTR minsd, 1, 0, 1
-AVX_INSTR minss, 1, 0, 1
-AVX_INSTR movapd
-AVX_INSTR movaps
+AVX_INSTR andnpd, sse2, 1, 0, 0
+AVX_INSTR andnps, sse, 1, 0, 0
+AVX_INSTR andpd, sse2, 1, 0, 1
+AVX_INSTR andps, sse, 1, 0, 1
+AVX_INSTR blendpd, sse4, 1, 0, 0
+AVX_INSTR blendps, sse4, 1, 0, 0
+AVX_INSTR blendvpd, sse4, 1, 0, 0
+AVX_INSTR blendvps, sse4, 1, 0, 0
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR comisd, sse2
+AVX_INSTR comiss, sse
+AVX_INSTR cvtdq2pd, sse2
+AVX_INSTR cvtdq2ps, sse2
+AVX_INSTR cvtpd2dq, sse2
+AVX_INSTR cvtpd2ps, sse2
+AVX_INSTR cvtps2dq, sse2
+AVX_INSTR cvtps2pd, sse2
+AVX_INSTR cvtsd2si, sse2
+AVX_INSTR cvtsd2ss, sse2
+AVX_INSTR cvtsi2sd, sse2
+AVX_INSTR cvtsi2ss, sse
+AVX_INSTR cvtss2sd, sse2
+AVX_INSTR cvtss2si, sse
+AVX_INSTR cvttpd2dq, sse2
+AVX_INSTR cvttps2dq, sse2
+AVX_INSTR cvttsd2si, sse2
+AVX_INSTR cvttss2si, sse
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR extractps, sse4
+AVX_INSTR haddpd, sse3, 1, 0, 0
+AVX_INSTR haddps, sse3, 1, 0, 0
+AVX_INSTR hsubpd, sse3, 1, 0, 0
+AVX_INSTR hsubps, sse3, 1, 0, 0
+AVX_INSTR insertps, sse4, 1, 1, 0
+AVX_INSTR lddqu, sse3
+AVX_INSTR ldmxcsr, sse
+AVX_INSTR maskmovdqu, sse2
+AVX_INSTR maxpd, sse2, 1, 0, 1
+AVX_INSTR maxps, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 1
+AVX_INSTR maxss, sse, 1, 0, 1
+AVX_INSTR minpd, sse2, 1, 0, 1
+AVX_INSTR minps, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 1
+AVX_INSTR minss, sse, 1, 0, 1
+AVX_INSTR movapd, sse2
+AVX_INSTR movaps, sse
 AVX_INSTR movd
-AVX_INSTR movddup
-AVX_INSTR movdqa
-AVX_INSTR movdqu
-AVX_INSTR movhlps, 1, 0, 0
-AVX_INSTR movhpd, 1, 0, 0
-AVX_INSTR movhps, 1, 0, 0
-AVX_INSTR movlhps, 1, 0, 0
-AVX_INSTR movlpd, 1, 0, 0
-AVX_INSTR movlps, 1, 0, 0
-AVX_INSTR movmskpd
-AVX_INSTR movmskps
-AVX_INSTR movntdq
-AVX_INSTR movntdqa
-AVX_INSTR movntpd
-AVX_INSTR movntps
+AVX_INSTR movddup, sse3
+AVX_INSTR movdqa, sse2
+AVX_INSTR movdqu, sse2
+AVX_INSTR movhlps, sse, 1, 0, 0
+AVX_INSTR movhpd, sse2, 1, 0, 0
+AVX_INSTR movhps, sse, 1, 0, 0
+AVX_INSTR movlhps, sse, 1, 0, 0
+AVX_INSTR movlpd, sse2, 1, 0, 0
+AVX_INSTR movlps, sse, 1, 0, 0
+AVX_INSTR movmskpd, sse2
+AVX_INSTR movmskps, sse
+AVX_INSTR movntdq, sse2
+AVX_INSTR movntdqa, sse4
+AVX_INSTR movntpd, sse2
+AVX_INSTR movntps, sse
 AVX_INSTR movq
-AVX_INSTR movsd, 1, 0, 0
-AVX_INSTR movshdup
-AVX_INSTR movsldup
-AVX_INSTR movss, 1, 0, 0
-AVX_INSTR movupd
-AVX_INSTR movups
-AVX_INSTR mpsadbw, 0, 1, 0
-AVX_INSTR mulpd, 1, 0, 1
-AVX_INSTR mulps, 1, 0, 1
-AVX_INSTR mulsd, 1, 0, 1
-AVX_INSTR mulss, 1, 0, 1
-AVX_INSTR orpd, 1, 0, 1
-AVX_INSTR orps, 1, 0, 1
-AVX_INSTR pabsb
-AVX_INSTR pabsd
-AVX_INSTR pabsw
-AVX_INSTR packsswb, 0, 0, 0
-AVX_INSTR packssdw, 0, 0, 0
-AVX_INSTR packuswb, 0, 0, 0
-AVX_INSTR packusdw, 0, 0, 0
-AVX_INSTR paddb, 0, 0, 1
-AVX_INSTR paddw, 0, 0, 1
-AVX_INSTR paddd, 0, 0, 1
-AVX_INSTR paddq, 0, 0, 1
-AVX_INSTR paddsb, 0, 0, 1
-AVX_INSTR paddsw, 0, 0, 1
-AVX_INSTR paddusb, 0, 0, 1
-AVX_INSTR paddusw, 0, 0, 1
-AVX_INSTR palignr, 0, 1, 0
-AVX_INSTR pand, 0, 0, 1
-AVX_INSTR pandn, 0, 0, 0
-AVX_INSTR pavgb, 0, 0, 1
-AVX_INSTR pavgw, 0, 0, 1
-AVX_INSTR pblendvb, 0, 0, 0
-AVX_INSTR pblendw, 0, 1, 0
-AVX_INSTR pclmulqdq, 0, 1, 0
-AVX_INSTR pcmpestri
-AVX_INSTR pcmpestrm
-AVX_INSTR pcmpistri
-AVX_INSTR pcmpistrm
-AVX_INSTR pcmpeqb, 0, 0, 1
-AVX_INSTR pcmpeqw, 0, 0, 1
-AVX_INSTR pcmpeqd, 0, 0, 1
-AVX_INSTR pcmpeqq, 0, 0, 1
-AVX_INSTR pcmpgtb, 0, 0, 0
-AVX_INSTR pcmpgtw, 0, 0, 0
-AVX_INSTR pcmpgtd, 0, 0, 0
-AVX_INSTR pcmpgtq, 0, 0, 0
-AVX_INSTR pextrb
-AVX_INSTR pextrd
-AVX_INSTR pextrq
-AVX_INSTR pextrw
-AVX_INSTR phaddw, 0, 0, 0
-AVX_INSTR phaddd, 0, 0, 0
-AVX_INSTR phaddsw, 0, 0, 0
-AVX_INSTR phminposuw
-AVX_INSTR phsubw, 0, 0, 0
-AVX_INSTR phsubd, 0, 0, 0
-AVX_INSTR phsubsw, 0, 0, 0
-AVX_INSTR pinsrb, 0, 1, 0
-AVX_INSTR pinsrd, 0, 1, 0
-AVX_INSTR pinsrq, 0, 1, 0
-AVX_INSTR pinsrw, 0, 1, 0
-AVX_INSTR pmaddwd, 0, 0, 1
-AVX_INSTR pmaddubsw, 0, 0, 0
-AVX_INSTR pmaxsb, 0, 0, 1
-AVX_INSTR pmaxsw, 0, 0, 1
-AVX_INSTR pmaxsd, 0, 0, 1
-AVX_INSTR pmaxub, 0, 0, 1
-AVX_INSTR pmaxuw, 0, 0, 1
-AVX_INSTR pmaxud, 0, 0, 1
-AVX_INSTR pminsb, 0, 0, 1
-AVX_INSTR pminsw, 0, 0, 1
-AVX_INSTR pminsd, 0, 0, 1
-AVX_INSTR pminub, 0, 0, 1
-AVX_INSTR pminuw, 0, 0, 1
-AVX_INSTR pminud, 0, 0, 1
-AVX_INSTR pmovmskb
-AVX_INSTR pmovsxbw
-AVX_INSTR pmovsxbd
-AVX_INSTR pmovsxbq
-AVX_INSTR pmovsxwd
-AVX_INSTR pmovsxwq
-AVX_INSTR pmovsxdq
-AVX_INSTR pmovzxbw
-AVX_INSTR pmovzxbd
-AVX_INSTR pmovzxbq
-AVX_INSTR pmovzxwd
-AVX_INSTR pmovzxwq
-AVX_INSTR pmovzxdq
-AVX_INSTR pmuldq, 0, 0, 1
-AVX_INSTR pmulhrsw, 0, 0, 1
-AVX_INSTR pmulhuw, 0, 0, 1
-AVX_INSTR pmulhw, 0, 0, 1
-AVX_INSTR pmullw, 0, 0, 1
-AVX_INSTR pmulld, 0, 0, 1
-AVX_INSTR pmuludq, 0, 0, 1
-AVX_INSTR por, 0, 0, 1
-AVX_INSTR psadbw, 0, 0, 1
-AVX_INSTR pshufb, 0, 0, 0
-AVX_INSTR pshufd
-AVX_INSTR pshufhw
-AVX_INSTR pshuflw
-AVX_INSTR psignb, 0, 0, 0
-AVX_INSTR psignw, 0, 0, 0
-AVX_INSTR psignd, 0, 0, 0
-AVX_INSTR psllw, 0, 0, 0
-AVX_INSTR pslld, 0, 0, 0
-AVX_INSTR psllq, 0, 0, 0
-AVX_INSTR pslldq, 0, 0, 0
-AVX_INSTR psraw, 0, 0, 0
-AVX_INSTR psrad, 0, 0, 0
-AVX_INSTR psrlw, 0, 0, 0
-AVX_INSTR psrld, 0, 0, 0
-AVX_INSTR psrlq, 0, 0, 0
-AVX_INSTR psrldq, 0, 0, 0
-AVX_INSTR psubb, 0, 0, 0
-AVX_INSTR psubw, 0, 0, 0
-AVX_INSTR psubd, 0, 0, 0
-AVX_INSTR psubq, 0, 0, 0
-AVX_INSTR psubsb, 0, 0, 0
-AVX_INSTR psubsw, 0, 0, 0
-AVX_INSTR psubusb, 0, 0, 0
-AVX_INSTR psubusw, 0, 0, 0
-AVX_INSTR ptest
-AVX_INSTR punpckhbw, 0, 0, 0
-AVX_INSTR punpckhwd, 0, 0, 0
-AVX_INSTR punpckhdq, 0, 0, 0
-AVX_INSTR punpckhqdq, 0, 0, 0
-AVX_INSTR punpcklbw, 0, 0, 0
-AVX_INSTR punpcklwd, 0, 0, 0
-AVX_INSTR punpckldq, 0, 0, 0
-AVX_INSTR punpcklqdq, 0, 0, 0
-AVX_INSTR pxor, 0, 0, 1
-AVX_INSTR rcpps, 1, 0, 0
-AVX_INSTR rcpss, 1, 0, 0
-AVX_INSTR roundpd
-AVX_INSTR roundps
-AVX_INSTR roundsd
-AVX_INSTR roundss
-AVX_INSTR rsqrtps, 1, 0, 0
-AVX_INSTR rsqrtss, 1, 0, 0
-AVX_INSTR shufpd, 1, 1, 0
-AVX_INSTR shufps, 1, 1, 0
-AVX_INSTR sqrtpd, 1, 0, 0
-AVX_INSTR sqrtps, 1, 0, 0
-AVX_INSTR sqrtsd, 1, 0, 0
-AVX_INSTR sqrtss, 1, 0, 0
-AVX_INSTR stmxcsr
-AVX_INSTR subpd, 1, 0, 0
-AVX_INSTR subps, 1, 0, 0
-AVX_INSTR subsd, 1, 0, 0
-AVX_INSTR subss, 1, 0, 0
-AVX_INSTR ucomisd
-AVX_INSTR ucomiss
-AVX_INSTR unpckhpd, 1, 0, 0
-AVX_INSTR unpckhps, 1, 0, 0
-AVX_INSTR unpcklpd, 1, 0, 0
-AVX_INSTR unpcklps, 1, 0, 0
-AVX_INSTR xorpd, 1, 0, 1
-AVX_INSTR xorps, 1, 0, 1
+AVX_INSTR movsd, sse2, 1, 0, 0
+AVX_INSTR movshdup, sse3
+AVX_INSTR movsldup, sse3
+AVX_INSTR movss, sse, 1, 0, 0
+AVX_INSTR movupd, sse2
+AVX_INSTR movups, sse
+AVX_INSTR mpsadbw, sse4
+AVX_INSTR mulpd, sse2, 1, 0, 1
+AVX_INSTR mulps, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 1
+AVX_INSTR mulss, sse, 1, 0, 1
+AVX_INSTR orpd, sse2, 1, 0, 1
+AVX_INSTR orps, sse, 1, 0, 1
+AVX_INSTR pabsb, ssse3
+AVX_INSTR pabsd, ssse3
+AVX_INSTR pabsw, ssse3
+AVX_INSTR packsswb, mmx, 0, 0, 0
+AVX_INSTR packssdw, mmx, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR paddb, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
+AVX_INSTR paddd, mmx, 0, 0, 1
+AVX_INSTR paddq, sse2, 0, 0, 1
+AVX_INSTR paddsb, mmx, 0, 0, 1
+AVX_INSTR paddsw, mmx, 0, 0, 1
+AVX_INSTR paddusb, mmx, 0, 0, 1
+AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR palignr, ssse3
+AVX_INSTR pand, mmx, 0, 0, 1
+AVX_INSTR pandn, mmx, 0, 0, 0
+AVX_INSTR pavgb, mmx2, 0, 0, 1
+AVX_INSTR pavgw, mmx2, 0, 0, 1
+AVX_INSTR pblendvb, sse4, 0, 0, 0
+AVX_INSTR pblendw, sse4
+AVX_INSTR pclmulqdq
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pcmpeqb, mmx, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpeqd, mmx, 0, 0, 1
+AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpgtb, mmx, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpgtd, mmx, 0, 0, 0
+AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pextrb, sse4
+AVX_INSTR pextrd, sse4
+AVX_INSTR pextrq, sse4
+AVX_INSTR pextrw, mmx2
+AVX_INSTR phaddw, ssse3, 0, 0, 0
+AVX_INSTR phaddd, ssse3, 0, 0, 0
+AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phminposuw, sse4
+AVX_INSTR phsubw, ssse3, 0, 0, 0
+AVX_INSTR phsubd, ssse3, 0, 0, 0
+AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR pinsrb, sse4
+AVX_INSTR pinsrd, sse4
+AVX_INSTR pinsrq, sse4
+AVX_INSTR pinsrw, mmx2
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
+AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaxsb, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
+AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxub, mmx2, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
+AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pminsb, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
+AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminub, mmx2, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
+AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pmovmskb, mmx2
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxbd, sse4
+AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxwd, sse4
+AVX_INSTR pmovsxwq, sse4
+AVX_INSTR pmovsxdq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxbd, sse4
+AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxwd, sse4
+AVX_INSTR pmovzxwq, sse4
+AVX_INSTR pmovzxdq, sse4
+AVX_INSTR pmuldq, sse4, 0, 0, 1
+AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
+AVX_INSTR pmulhuw, mmx2, 0, 0, 1
+AVX_INSTR pmulhw, mmx, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
+AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmuludq, sse2, 0, 0, 1
+AVX_INSTR por, mmx, 0, 0, 1
+AVX_INSTR psadbw, mmx2, 0, 0, 1
+AVX_INSTR pshufb, ssse3, 0, 0, 0
+AVX_INSTR pshufd, sse2
+AVX_INSTR pshufhw, sse2
+AVX_INSTR pshuflw, sse2
+AVX_INSTR psignb, ssse3, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
+AVX_INSTR psignd, ssse3, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR pslld, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR pslldq, sse2, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psrad, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psrld, mmx, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psubb, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
+AVX_INSTR psubd, mmx, 0, 0, 0
+AVX_INSTR psubq, sse2, 0, 0, 0
+AVX_INSTR psubsb, mmx, 0, 0, 0
+AVX_INSTR psubsw, mmx, 0, 0, 0
+AVX_INSTR psubusb, mmx, 0, 0, 0
+AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR ptest, sse4
+AVX_INSTR punpckhbw, mmx, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
+AVX_INSTR punpckhdq, mmx, 0, 0, 0
+AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklbw, mmx, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
+AVX_INSTR punpckldq, mmx, 0, 0, 0
+AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR pxor, mmx, 0, 0, 1
+AVX_INSTR rcpps, sse, 1, 0, 0
+AVX_INSTR rcpss, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4
+AVX_INSTR roundps, sse4
+AVX_INSTR roundsd, sse4
+AVX_INSTR roundss, sse4
+AVX_INSTR rsqrtps, sse, 1, 0, 0
+AVX_INSTR rsqrtss, sse, 1, 0, 0
+AVX_INSTR shufpd, sse2, 1, 1, 0
+AVX_INSTR shufps, sse, 1, 1, 0
+AVX_INSTR sqrtpd, sse2, 1, 0, 0
+AVX_INSTR sqrtps, sse, 1, 0, 0
+AVX_INSTR sqrtsd, sse2, 1, 0, 0
+AVX_INSTR sqrtss, sse, 1, 0, 0
+AVX_INSTR stmxcsr, sse
+AVX_INSTR subpd, sse2, 1, 0, 0
+AVX_INSTR subps, sse, 1, 0, 0
+AVX_INSTR subsd, sse2, 1, 0, 0
+AVX_INSTR subss, sse, 1, 0, 0
+AVX_INSTR ucomisd, sse2
+AVX_INSTR ucomiss, sse
+AVX_INSTR unpckhpd, sse2, 1, 0, 0
+AVX_INSTR unpckhps, sse, 1, 0, 0
+AVX_INSTR unpcklpd, sse2, 1, 0, 0
+AVX_INSTR unpcklps, sse, 1, 0, 0
+AVX_INSTR xorpd, sse2, 1, 0, 1
+AVX_INSTR xorps, sse, 1, 0, 1
 
 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN
-AVX_INSTR pfadd, 1, 0, 1
-AVX_INSTR pfsub, 1, 0, 0
-AVX_INSTR pfmul, 1, 0, 1
+AVX_INSTR pfadd, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
+AVX_INSTR pfmul, 3dnow, 1, 0, 1
 
 ; base-4 constants for shuffles
 %assign i 0
​

x264-snapshot-20141104-2245.tar.bz2/encoder/cavlc.c -> x264-snapshot-20141218-2245.tar.bz2/encoder/cavlc.c Changed

 
@@ -289,6 +289,7 @@
                 x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] );
 }
 
+#if RDO_SKIP_BS
 static ALWAYS_INLINE void x264_cavlc_partition_luma_residual( x264_t *h, int i8, int p )
 {
     if( h->mb.b_transform_8x8 && h->mb.cache.non_zero_count[x264_scan8[i8*4]] )
@@ -299,6 +300,7 @@
         for( int i4 = 0; i4 < 4; i4++ )
             x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] );
 }
+#endif
 
 static void x264_cavlc_mb_header_i( x264_t *h, int i_mb_type, int i_mb_i_offset, int chroma )
 {
​

x264-snapshot-20141104-2245.tar.bz2/encoder/ratecontrol.c -> x264-snapshot-20141218-2245.tar.bz2/encoder/ratecontrol.c Changed

@@ -2191,6 +2191,8 @@
 
     if( rcc->b_vbv && rcc->last_satd > 0 )
     {
+        double fenc_cpb_duration = (double)h->fenc->i_cpb_duration *
+                                   h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
         /* Lookahead VBV: raise the quantizer as necessary such that no frames in
          * the lookahead overflow and such that the buffer is in a reasonable state
          * by the end of the lookahead. */
@@ -2206,6 +2208,7 @@
                 double buffer_fill_cur = rcc->buffer_fill - cur_bits;
                 double target_fill;
                 double total_duration = 0;
+                double last_duration = fenc_cpb_duration;
                 frame_q[0] = h->sh.i_type == SLICE_TYPE_I ? q * h->param.rc.f_ip_factor : q;
                 frame_q[1] = frame_q[0] * h->param.rc.f_pb_factor;
                 frame_q[2] = frame_q[0] / h->param.rc.f_ip_factor;
@@ -2213,8 +2216,8 @@
                 /* Loop over the planned future frames. */
                 for( int j = 0; buffer_fill_cur >= 0 && buffer_fill_cur <= rcc->buffer_size; j++ )
                 {
-                    total_duration += h->fenc->f_planned_cpb_duration[j];
-                    buffer_fill_cur += rcc->vbv_max_rate * h->fenc->f_planned_cpb_duration[j];
+                    total_duration += last_duration;
+                    buffer_fill_cur += rcc->vbv_max_rate * last_duration;
                     int i_type = h->fenc->i_planned_type[j];
                     int i_satd = h->fenc->i_planned_satd[j];
                     if( i_type == X264_TYPE_AUTO )
@@ -2222,6 +2225,7 @@
                     i_type = IS_X264_TYPE_I( i_type ) ? SLICE_TYPE_I : IS_X264_TYPE_B( i_type ) ? SLICE_TYPE_B : SLICE_TYPE_P;
                     cur_bits = predict_size( &rcc->pred[i_type], frame_q[i_type], i_satd );
                     buffer_fill_cur -= cur_bits;
+                    last_duration = h->fenc->f_planned_cpb_duration[j];
                 }
                 /* Try to get to get the buffer at least 50% filled, but don't set an impossible goal. */
                 target_fill = X264_MIN( rcc->buffer_fill + total_duration * rcc->vbv_max_rate * 0.5, rcc->buffer_size * 0.5 );
@@ -2255,45 +2259,44 @@
             /* Now a hard threshold to make sure the frame fits in VBV.
              * This one is mostly for I-frames. */
             double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-            double qf = 1.0;
             /* For small VBVs, allow the frame to use up the entire VBV. */
             double max_fill_factor = h->param.rc.i_vbv_buffer_size >= 5*h->param.rc.i_vbv_max_bitrate / rcc->fps ? 2 : 1;
             /* For single-frame VBVs, request that the frame use up the entire VBV. */
             double min_fill_factor = rcc->single_frame_vbv ? 1 : 2;
 
             if( bits > rcc->buffer_fill/max_fill_factor )
-                qf = x264_clip3f( rcc->buffer_fill/(max_fill_factor*bits), 0.2, 1.0 );
-            q /= qf;
-            bits *= qf;
+            {
+                double qf = x264_clip3f( rcc->buffer_fill/(max_fill_factor*bits), 0.2, 1.0 );
+                q /= qf;
+                bits *= qf;
+            }
             if( bits < rcc->buffer_rate/min_fill_factor )
-                q *= bits*min_fill_factor/rcc->buffer_rate;
+            {
+                double qf = x264_clip3f( bits*min_fill_factor/rcc->buffer_rate, 0.001, 1.0 );
+                q *= qf;
+            }
             q = X264_MAX( q0, q );
         }
 
-        /* Apply MinCR restrictions */
-        double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-        if( bits > rcc->frame_size_maximum )
-            q *= bits / rcc->frame_size_maximum;
-        bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-
         /* Check B-frame complexity, and use up any bits that would
          * overflow before the next P-frame. */
         if( h->sh.i_type == SLICE_TYPE_P && !rcc->single_frame_vbv )
         {
             int nb = rcc->bframes;
+            double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
             double pbbits = bits;
             double bbits = predict_size( rcc->pred_b_from_p, q * h->param.rc.f_pb_factor, rcc->last_satd );
             double space;
             double bframe_cpb_duration = 0;
             double minigop_cpb_duration;
             for( int i = 0; i < nb; i++ )
-                bframe_cpb_duration += h->fenc->f_planned_cpb_duration[1+i];
+                bframe_cpb_duration += h->fenc->f_planned_cpb_duration[i];
 
             if( bbits * nb > bframe_cpb_duration * rcc->vbv_max_rate )
                 nb = 0;
             pbbits += nb * bbits;
 
-            minigop_cpb_duration = bframe_cpb_duration + h->fenc->f_planned_cpb_duration[0];
+            minigop_cpb_duration = bframe_cpb_duration + fenc_cpb_duration;
             space = rcc->buffer_fill + minigop_cpb_duration*rcc->vbv_max_rate - rcc->buffer_size;
             if( pbbits < space )
             {
@@ -2302,6 +2305,12 @@
             q = X264_MAX( q0/2, q );
         }
 
+        /* Apply MinCR and buffer fill restrictions */
+        double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
+        double frame_size_maximum = X264_MIN( rcc->frame_size_maximum, X264_MAX( rcc->buffer_fill, 0.001 ) );
+        if( bits > frame_size_maximum )
+            q *= bits / frame_size_maximum;
+
         if( !rcc->b_vbv_min_rate )
             q = X264_MAX( q0, q );
     }
@@ -2326,7 +2335,7 @@
 {
     float q;
     x264_ratecontrol_t *rcc = h->rc;
-    ratecontrol_entry_t UNINIT(rce);
+    ratecontrol_entry_t rce = {0};
     int pict_type = h->sh.i_type;
     int64_t total_bits = 8*(h->stat.i_frame_size[SLICE_TYPE_I]
                           + h->stat.i_frame_size[SLICE_TYPE_P]

 
@@ -2191,6 +2191,8 @@
 
     if( rcc->b_vbv && rcc->last_satd > 0 )
     {
+        double fenc_cpb_duration = (double)h->fenc->i_cpb_duration *
+                                   h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
         /* Lookahead VBV: raise the quantizer as necessary such that no frames in
          * the lookahead overflow and such that the buffer is in a reasonable state
          * by the end of the lookahead. */
@@ -2206,6 +2208,7 @@
                 double buffer_fill_cur = rcc->buffer_fill - cur_bits;
                 double target_fill;
                 double total_duration = 0;
+                double last_duration = fenc_cpb_duration;
                 frame_q[0] = h->sh.i_type == SLICE_TYPE_I ? q * h->param.rc.f_ip_factor : q;
                 frame_q[1] = frame_q[0] * h->param.rc.f_pb_factor;
                 frame_q[2] = frame_q[0] / h->param.rc.f_ip_factor;
@@ -2213,8 +2216,8 @@
                 /* Loop over the planned future frames. */
                 for( int j = 0; buffer_fill_cur >= 0 && buffer_fill_cur <= rcc->buffer_size; j++ )
                 {
-                    total_duration += h->fenc->f_planned_cpb_duration[j];
-                    buffer_fill_cur += rcc->vbv_max_rate * h->fenc->f_planned_cpb_duration[j];
+                    total_duration += last_duration;
+                    buffer_fill_cur += rcc->vbv_max_rate * last_duration;
                     int i_type = h->fenc->i_planned_type[j];
                     int i_satd = h->fenc->i_planned_satd[j];
                     if( i_type == X264_TYPE_AUTO )
@@ -2222,6 +2225,7 @@
                     i_type = IS_X264_TYPE_I( i_type ) ? SLICE_TYPE_I : IS_X264_TYPE_B( i_type ) ? SLICE_TYPE_B : SLICE_TYPE_P;
                     cur_bits = predict_size( &rcc->pred[i_type], frame_q[i_type], i_satd );
                     buffer_fill_cur -= cur_bits;
+                    last_duration = h->fenc->f_planned_cpb_duration[j];
                 }
                 /* Try to get to get the buffer at least 50% filled, but don't set an impossible goal. */
                 target_fill = X264_MIN( rcc->buffer_fill + total_duration * rcc->vbv_max_rate * 0.5, rcc->buffer_size * 0.5 );
@@ -2255,45 +2259,44 @@
             /* Now a hard threshold to make sure the frame fits in VBV.
              * This one is mostly for I-frames. */
             double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-            double qf = 1.0;
             /* For small VBVs, allow the frame to use up the entire VBV. */
             double max_fill_factor = h->param.rc.i_vbv_buffer_size >= 5*h->param.rc.i_vbv_max_bitrate / rcc->fps ? 2 : 1;
             /* For single-frame VBVs, request that the frame use up the entire VBV. */
             double min_fill_factor = rcc->single_frame_vbv ? 1 : 2;
 
             if( bits > rcc->buffer_fill/max_fill_factor )
-                qf = x264_clip3f( rcc->buffer_fill/(max_fill_factor*bits), 0.2, 1.0 );
-            q /= qf;
-            bits *= qf;
+            {
+                double qf = x264_clip3f( rcc->buffer_fill/(max_fill_factor*bits), 0.2, 1.0 );
+                q /= qf;
+                bits *= qf;
+            }
             if( bits < rcc->buffer_rate/min_fill_factor )
-                q *= bits*min_fill_factor/rcc->buffer_rate;
+            {
+                double qf = x264_clip3f( bits*min_fill_factor/rcc->buffer_rate, 0.001, 1.0 );
+                q *= qf;
+            }
             q = X264_MAX( q0, q );
         }
 
-        /* Apply MinCR restrictions */
-        double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-        if( bits > rcc->frame_size_maximum )
-            q *= bits / rcc->frame_size_maximum;
-        bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-
         /* Check B-frame complexity, and use up any bits that would
          * overflow before the next P-frame. */
         if( h->sh.i_type == SLICE_TYPE_P && !rcc->single_frame_vbv )
         {
             int nb = rcc->bframes;
+            double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
             double pbbits = bits;
             double bbits = predict_size( rcc->pred_b_from_p, q * h->param.rc.f_pb_factor, rcc->last_satd );
             double space;
             double bframe_cpb_duration = 0;
             double minigop_cpb_duration;
             for( int i = 0; i < nb; i++ )
-                bframe_cpb_duration += h->fenc->f_planned_cpb_duration[1+i];
+                bframe_cpb_duration += h->fenc->f_planned_cpb_duration[i];
 
             if( bbits * nb > bframe_cpb_duration * rcc->vbv_max_rate )
                 nb = 0;
             pbbits += nb * bbits;
 
-            minigop_cpb_duration = bframe_cpb_duration + h->fenc->f_planned_cpb_duration[0];
+            minigop_cpb_duration = bframe_cpb_duration + fenc_cpb_duration;
             space = rcc->buffer_fill + minigop_cpb_duration*rcc->vbv_max_rate - rcc->buffer_size;
             if( pbbits < space )
             {
@@ -2302,6 +2305,12 @@
             q = X264_MAX( q0/2, q );
         }
 
+        /* Apply MinCR and buffer fill restrictions */
+        double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
+        double frame_size_maximum = X264_MIN( rcc->frame_size_maximum, X264_MAX( rcc->buffer_fill, 0.001 ) );
+        if( bits > frame_size_maximum )
+            q *= bits / frame_size_maximum;
+
         if( !rcc->b_vbv_min_rate )
             q = X264_MAX( q0, q );
     }
@@ -2326,7 +2335,7 @@
 {
     float q;
     x264_ratecontrol_t *rcc = h->rc;
-    ratecontrol_entry_t UNINIT(rce);
+    ratecontrol_entry_t rce = {0};
     int pict_type = h->sh.i_type;
     int64_t total_bits = 8*(h->stat.i_frame_size[SLICE_TYPE_I]
                           + h->stat.i_frame_size[SLICE_TYPE_P]
​

x264-snapshot-20141104-2245.tar.bz2/encoder/rdo.c -> x264-snapshot-20141218-2245.tar.bz2/encoder/rdo.c Changed

 
@@ -186,7 +186,7 @@
     h->mb.b_transform_8x8 = b_transform_bak;
     h->mb.i_type = type_bak;
 
-    return i_ssd + i_bits;
+    return X264_MIN( i_ssd + i_bits, COST_MAX );
 }
 
 /* partition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */
​

x264-snapshot-20141104-2245.tar.bz2/encoder/set.c -> x264-snapshot-20141218-2245.tar.bz2/encoder/set.c Changed

@@ -543,7 +543,8 @@
 void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt )
 {
     bs_t q;
-    uint8_t tmp_buf[100];
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    M32( tmp_buf ) = 0; // shut up gcc
     bs_init( &q, tmp_buf, 100 );
 
     bs_realign( &q );
@@ -595,7 +596,8 @@
 {
     x264_sps_t *sps = h->sps;
     bs_t q;
-    uint8_t tmp_buf[100];
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    M32( tmp_buf ) = 0; // shut up gcc
     bs_init( &q, tmp_buf, 100 );
 
     bs_realign( &q );
@@ -617,7 +619,8 @@
 {
     x264_sps_t *sps = h->sps;
     bs_t q;
-    uint8_t tmp_buf[100];
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    M32( tmp_buf ) = 0; // shut up gcc
     bs_init( &q, tmp_buf, 100 );
 
     bs_realign( &q );
@@ -648,7 +651,8 @@
 {
     int quincunx_sampling_flag = h->param.i_frame_packing == 0;
     bs_t q;
-    uint8_t tmp_buf[100];
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    M32( tmp_buf ) = 0; // shut up gcc
     bs_init( &q, tmp_buf, 100 );
 
     bs_realign( &q );
@@ -701,7 +705,8 @@
 {
     x264_slice_header_t *sh = &h->sh_backup;
     bs_t q;
-    uint8_t tmp_buf[100];
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    M32( tmp_buf ) = 0; // shut up gcc
     bs_init( &q, tmp_buf, 100 );
 
     bs_realign( &q );

 
@@ -543,7 +543,8 @@
 void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt )
 {
     bs_t q;
-    uint8_t tmp_buf[100];
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    M32( tmp_buf ) = 0; // shut up gcc
     bs_init( &q, tmp_buf, 100 );
 
     bs_realign( &q );
@@ -595,7 +596,8 @@
 {
     x264_sps_t *sps = h->sps;
     bs_t q;
-    uint8_t tmp_buf[100];
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    M32( tmp_buf ) = 0; // shut up gcc
     bs_init( &q, tmp_buf, 100 );
 
     bs_realign( &q );
@@ -617,7 +619,8 @@
 {
     x264_sps_t *sps = h->sps;
     bs_t q;
-    uint8_t tmp_buf[100];
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    M32( tmp_buf ) = 0; // shut up gcc
     bs_init( &q, tmp_buf, 100 );
 
     bs_realign( &q );
@@ -648,7 +651,8 @@
 {
     int quincunx_sampling_flag = h->param.i_frame_packing == 0;
     bs_t q;
-    uint8_t tmp_buf[100];
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    M32( tmp_buf ) = 0; // shut up gcc
     bs_init( &q, tmp_buf, 100 );
 
     bs_realign( &q );
@@ -701,7 +705,8 @@
 {
     x264_slice_header_t *sh = &h->sh_backup;
     bs_t q;
-    uint8_t tmp_buf[100];
+    ALIGNED_4( uint8_t tmp_buf[100] );
+    M32( tmp_buf ) = 0; // shut up gcc
     bs_init( &q, tmp_buf, 100 );
 
     bs_realign( &q );
​

x264-snapshot-20141104-2245.tar.bz2/encoder/slicetype.c -> x264-snapshot-20141218-2245.tar.bz2/encoder/slicetype.c Changed

 
@@ -1853,14 +1853,11 @@
         if( i )
         {
             x264_calculate_durations( h, h->lookahead->next.list[i], h->lookahead->next.list[i-1], &h->i_cpb_delay, &h->i_coded_fields );
-            h->lookahead->next.list[0]->f_planned_cpb_duration[i-1] = (double)h->lookahead->next.list[i-1]->i_cpb_duration *
+            h->lookahead->next.list[0]->f_planned_cpb_duration[i-1] = (double)h->lookahead->next.list[i]->i_cpb_duration *
                                                                       h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
         }
         else
             x264_calculate_durations( h, h->lookahead->next.list[i], NULL, &h->i_cpb_delay, &h->i_coded_fields );
-
-        h->lookahead->next.list[0]->f_planned_cpb_duration[i] = (double)h->lookahead->next.list[i]->i_cpb_duration *
-                                                                h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
     }
 }
 
​

x264-snapshot-20141218-2245.tar.bz2/example.c Added

@@ -0,0 +1,155 @@
+/*****************************************************************************
+ * example.c: libx264 API usage example
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * Authors: Anton Mitrofanov <BugMaster@narod.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifdef _WIN32
+/* The following two defines must be located before the inclusion of any system header files. */
+#define WINVER       0x0500
+#define _WIN32_WINNT 0x0500
+#include <windows.h>
+#include <io.h>       /* _setmode() */
+#include <fcntl.h>    /* _O_BINARY */
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <signal.h>
+#include <x264.h>
+
+/* Ctrl-C handler */
+static volatile int b_ctrl_c = 0;
+static void sigint_handler( int a )
+{
+    b_ctrl_c = 1;
+}
+
+#define FAIL_IF_ERROR( cond, ... )\
+do\
+{\
+    if( cond )\
+    {\
+        fprintf( stderr, __VA_ARGS__ );\
+        goto fail;\
+    }\
+} while( 0 )
+
+int main( int argc, char **argv )
+{
+    int width, height;
+    x264_param_t param;
+    x264_picture_t pic;
+    x264_picture_t pic_out;
+    x264_t *h;
+    int i_frame = 0;
+    int i_frame_size;
+    x264_nal_t *nal;
+    int i_nal;
+
+#ifdef _WIN32
+    _setmode( _fileno( stdin ),  _O_BINARY );
+    _setmode( _fileno( stdout ), _O_BINARY );
+    _setmode( _fileno( stderr ), _O_BINARY );
+#endif
+
+    /* Control-C handler */
+    signal( SIGINT, sigint_handler );
+
+    FAIL_IF_ERROR( !(argc > 1), "Example usage: example 352x288 <input.yuv >output.h264\n" );
+    FAIL_IF_ERROR( 2 != sscanf( argv[1], "%dx%d", &width, &height ), "resolution not specified or incorrect\n" );
+
+    /* Get default params for preset/tuning */
+    if( x264_param_default_preset( &param, "medium", NULL ) < 0 )
+        goto fail;
+
+    /* Configure non-default params */
+    param.i_csp = X264_CSP_I420;
+    param.i_width  = width;
+    param.i_height = height;
+    param.b_vfr_input = 0;
+    param.b_repeat_headers = 1;
+    param.b_annexb = 1;
+
+    /* Apply profile restrictions. */
+    if( x264_param_apply_profile( &param, "high" ) < 0 )
+        goto fail;
+
+    if( x264_picture_alloc( &pic, param.i_csp, param.i_width, param.i_height ) < 0 )
+        goto fail;
+#undef fail
+#define fail fail2
+
+    h = x264_encoder_open( &param );
+    if( !h )
+        goto fail;
+#undef fail
+#define fail fail3
+
+    /* Encode frames */
+    for( ; !b_ctrl_c; i_frame++ )
+    {
+        /* Read input frame */
+        int plane_size = width * height;
+        if( fread( pic.img.plane[0], 1, plane_size, stdin ) != plane_size )
+            break;
+        plane_size = ((width + 1) >> 1) * ((height + 1) >> 1);
+        if( fread( pic.img.plane[1], 1, plane_size, stdin ) != plane_size )
+            break;
+        if( fread( pic.img.plane[2], 1, plane_size, stdin ) != plane_size )
+            break;
+
+        pic.i_pts = i_frame;
+        i_frame_size = x264_encoder_encode( h, &nal, &i_nal, &pic, &pic_out );
+        if( i_frame_size < 0 )
+            goto fail;
+        else if( i_frame_size )
+        {
+            if( !fwrite( nal->p_payload, i_frame_size, 1, stdout ) )
+                goto fail;
+        }
+    }
+    /* Flush delayed frames */
+    while( !b_ctrl_c && x264_encoder_delayed_frames( h ) )
+    {
+        i_frame_size = x264_encoder_encode( h, &nal, &i_nal, NULL, &pic_out );
+        if( i_frame_size < 0 )
+            goto fail;
+        else if( i_frame_size )
+        {
+            if( !fwrite( nal->p_payload, i_frame_size, 1, stdout ) )
+                goto fail;
+        }
+    }
+
+    x264_encoder_close( h );
+    x264_picture_clean( &pic );
+    return 0;
+
+#undef fail
+fail3:
+    x264_encoder_close( h );
+fail2:
+    x264_picture_clean( &pic );
+fail:
+    return -1;
+}

 
@@ -0,0 +1,155 @@
+/*****************************************************************************
+ * example.c: libx264 API usage example
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * Authors: Anton Mitrofanov <BugMaster@narod.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifdef _WIN32
+/* The following two defines must be located before the inclusion of any system header files. */
+#define WINVER       0x0500
+#define _WIN32_WINNT 0x0500
+#include <windows.h>
+#include <io.h>       /* _setmode() */
+#include <fcntl.h>    /* _O_BINARY */
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <signal.h>
+#include <x264.h>
+
+/* Ctrl-C handler */
+static volatile int b_ctrl_c = 0;
+static void sigint_handler( int a )
+{
+    b_ctrl_c = 1;
+}
+
+#define FAIL_IF_ERROR( cond, ... )\
+do\
+{\
+    if( cond )\
+    {\
+        fprintf( stderr, __VA_ARGS__ );\
+        goto fail;\
+    }\
+} while( 0 )
+
+int main( int argc, char **argv )
+{
+    int width, height;
+    x264_param_t param;
+    x264_picture_t pic;
+    x264_picture_t pic_out;
+    x264_t *h;
+    int i_frame = 0;
+    int i_frame_size;
+    x264_nal_t *nal;
+    int i_nal;
+
+#ifdef _WIN32
+    _setmode( _fileno( stdin ),  _O_BINARY );
+    _setmode( _fileno( stdout ), _O_BINARY );
+    _setmode( _fileno( stderr ), _O_BINARY );
+#endif
+
+    /* Control-C handler */
+    signal( SIGINT, sigint_handler );
+
+    FAIL_IF_ERROR( !(argc > 1), "Example usage: example 352x288 <input.yuv >output.h264\n" );
+    FAIL_IF_ERROR( 2 != sscanf( argv[1], "%dx%d", &width, &height ), "resolution not specified or incorrect\n" );
+
+    /* Get default params for preset/tuning */
+    if( x264_param_default_preset( &param, "medium", NULL ) < 0 )
+        goto fail;
+
+    /* Configure non-default params */
+    param.i_csp = X264_CSP_I420;
+    param.i_width  = width;
+    param.i_height = height;
+    param.b_vfr_input = 0;
+    param.b_repeat_headers = 1;
+    param.b_annexb = 1;
+
+    /* Apply profile restrictions. */
+    if( x264_param_apply_profile( &param, "high" ) < 0 )
+        goto fail;
+
+    if( x264_picture_alloc( &pic, param.i_csp, param.i_width, param.i_height ) < 0 )
+        goto fail;
+#undef fail
+#define fail fail2
+
+    h = x264_encoder_open( &param );
+    if( !h )
+        goto fail;
+#undef fail
+#define fail fail3
+
+    /* Encode frames */
+    for( ; !b_ctrl_c; i_frame++ )
+    {
+        /* Read input frame */
+        int plane_size = width * height;
+        if( fread( pic.img.plane[0], 1, plane_size, stdin ) != plane_size )
+            break;
+        plane_size = ((width + 1) >> 1) * ((height + 1) >> 1);
+        if( fread( pic.img.plane[1], 1, plane_size, stdin ) != plane_size )
+            break;
+        if( fread( pic.img.plane[2], 1, plane_size, stdin ) != plane_size )
+            break;
+
+        pic.i_pts = i_frame;
+        i_frame_size = x264_encoder_encode( h, &nal, &i_nal, &pic, &pic_out );
+        if( i_frame_size < 0 )
+            goto fail;
+        else if( i_frame_size )
+        {
+            if( !fwrite( nal->p_payload, i_frame_size, 1, stdout ) )
+                goto fail;
+        }
+    }
+    /* Flush delayed frames */
+    while( !b_ctrl_c && x264_encoder_delayed_frames( h ) )
+    {
+        i_frame_size = x264_encoder_encode( h, &nal, &i_nal, NULL, &pic_out );
+        if( i_frame_size < 0 )
+            goto fail;
+        else if( i_frame_size )
+        {
+            if( !fwrite( nal->p_payload, i_frame_size, 1, stdout ) )
+                goto fail;
+        }
+    }
+
+    x264_encoder_close( h );
+    x264_picture_clean( &pic );
+    return 0;
+
+#undef fail
+fail3:
+    x264_encoder_close( h );
+fail2:
+    x264_picture_clean( &pic );
+fail:
+    return -1;
+}
​

x264-snapshot-20141104-2245.tar.bz2/tools/checkasm.c -> x264-snapshot-20141218-2245.tar.bz2/tools/checkasm.c Changed

 
@@ -90,7 +90,9 @@
 {
     uint32_t a = 0;
 #if HAVE_X86_INLINE_ASM
-    asm volatile( "rdtsc" : "=a"(a) :: "edx", "memory" );
+    asm volatile( "lfence \n"
+                  "rdtsc  \n"
+                  : "=a"(a) :: "edx", "memory" );
 #elif ARCH_PPC
     asm volatile( "mftb %0" : "=r"(a) :: "memory" );
 #elif ARCH_ARM     // ARMv7 only
​

Changes of Revision 9