libx264: Changes of Revision 8

Note: the diff of some files was truncated because they were too big.
libx264.changes
Changed
@@ -1,4 +1,9 @@
 -------------------------------------------------------------------
+Wed Nov 5 12:33:30 UTC 2014 - i@margueirte.su
+
+- update version 20141104
+
+-------------------------------------------------------------------
 Sat Mar 22 17:10:14 UTC 2014 - i@margueirte.su

 - update version 20140321
libx264.spec
Changed
@@ -1,6 +1,7 @@
-# vim: set ts=4 sw=4 et:
-# Copyright (c) 2012 Pascal Bleser <pascal.bleser@opensuse.org>
-# COpyright (c) 2013 Marguerite Su <marguerite@opensuse.org>
+#
+# spec file for package libx264
+#
+# Copyright (c) 2014 SUSE LINUX Products GmbH, Nuernberg, Germany.
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -11,19 +12,21 @@
 # license that conforms to the Open Source Definition (Version 1.9)
 # published by the Open Source Initiative.

-# Please submit bugfixes or comments via http://bugs.links2linux.org/
+# Please submit bugfixes or comments via http://bugs.opensuse.org/
+#
+
-Name: libx264
 %define soname 142
-%define svn 20140321
+%define svn 20141104
+Name: libx264
 Version: 0.%{soname}svn%{svn}
-Release: 1
-License: GPL-2.0+
+Release: 0
 Summary: A free h264/avc encoder - encoder binary
-Url: http://developers.videolan.org/x264.html
+License: GPL-2.0+
 Group: Productivity/Multimedia/Video/Editors and Convertors
-Source: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2
-Patch: x264-use-shared-library.patch
+Url: http://developers.videolan.org/x264.html
+Source: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2
+Patch0: x264-use-shared-library.patch
 BuildRequires: nasm
 BuildRequires: pkg-config
 BuildRequires: yasm >= 1.2.0
@@ -92,7 +95,7 @@
 %prep
 %setup -q -n x264-snapshot-%{svn}-2245
-%patch -p1
+%patch0 -p1

 FAKE_BUILDDATE=$(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y')
 sed -i "s/__DATE__/\"$FAKE_BUILDDATE\"/" x264.c
@@ -108,7 +111,7 @@
 make %{?_smp_mflags}

 %install
-%makeinstall
+make DESTDIR=%{buildroot} install %{?_smp_mflags}

 rm -f %{buildroot}%{_libdir}/%{name}.so
 rm -f %{buildroot}%{_libdir}/%{name}.a
@@ -119,6 +122,7 @@
 echo "%{name}-%{soname}" > %{_sourcedir}/baselibs.conf

 %post -n %{name}-%{soname} -p /sbin/ldconfig
+
 %postun -n %{name}-%{soname} -p /sbin/ldconfig

 %files %{soname}
x264-snapshot-20140321-2245.tar.bz2/common/sparc
Deleted
-(directory)
x264-snapshot-20140321-2245.tar.bz2/common/sparc/pixel.asm
Deleted
@@ -1,1089 +0,0 @@
-/*****************************************************************************
- * pixel.asm: sparc pixel metrics
- *****************************************************************************
- * Copyright (C) 2005-2014 x264 project
- *
- * Authors: Phil Jensen <philj@csufresno.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-! VIS optimized SAD for UltraSPARC
-
-.text
-.global x264_pixel_sad_8x8_vis
-x264_pixel_sad_8x8_vis:
-    save %sp, -120, %sp
-
-    fzero %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    std %f12, [%fp-24]
-    ld [%fp-20], %i0
-
-    ret
-    restore
-
-.global x264_pixel_sad_8x16_vis
-x264_pixel_sad_8x16_vis:
-    save %sp, -120, %sp
-
-    fzero %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
x264-snapshot-20140321-2245.tar.bz2/common/sparc/pixel.h
Deleted
@@ -1,34 +0,0 @@
-/*****************************************************************************
- * pixel.h: sparc pixel metrics
- *****************************************************************************
- * Copyright (C) 2005-2014 x264 project
- *
- * Authors: Phil Jensen <philj@csufresno.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_SPARC_PIXEL_H
-#define X264_SPARC_PIXEL_H
-
-int x264_pixel_sad_8x8_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t );
-int x264_pixel_sad_8x16_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t );
-int x264_pixel_sad_16x8_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t );
-int x264_pixel_sad_16x16_vis( uint8_t *, intptr_t, uint8_t *, intptr_t );
-
-#endif
x264-snapshot-20140321-2245.tar.bz2/tools/cltostr.pl
Deleted
@@ -1,65 +0,0 @@
-# Perl script used for compiling OpenCL src into x264 binary
-#
-# Copyright (C) 2013-2014 x264 project
-# Authors: Steve Borho <sborho@multicorewareinc.com>
-
-use Digest::MD5 qw(md5_hex);
-
-# xxd takes a VAR, which will be the variable name
-# and BYTES, a string of bytes to be encoded.
-sub xxd
-{
-    my %args = @_;
-    my $var = $args{VAR};
-    my $bytes = $args{BYTES};
-    my @hexbytes;
-    my @bytes = split //, $$bytes;
-    foreach $b (@bytes)
-    {
-        push @hexbytes, sprintf("0x%02X", ord($b));
-    }
-
-    # Format 'em nice and pretty-like.
-    print 'static const char ' . $var . '[] = {' . "\n";
-    my $count = 0;
-    foreach my $h (@hexbytes)
-    {
-        print "$h, ";
-        $count++;
-        if ($count == 16)
-        {
-            print "\n";
-            $count = 0;
-        }
-    }
-    print "\n0x00 };\n\n";
-
-    return;
-}
-
-if (@ARGV < 1)
-{
-    printf "%s: VARNAME ", $0 . "\n";
-    exit(-1);
-}
-
-
-my @lines;
-while(<STDIN>)
-{
-    s/^\s+//; # trim leading whitespace
-    if (/^\/\//)
-    {
-        next; # skip the line if it starts with '//'
-    }
-    push @lines, $_;
-}
-
-my $lines = join '', @lines;
-xxd(VAR => @ARGV[0], BYTES => \$lines);
-
-my $hash = md5_hex($lines);
-@hash = ( $hash =~ m/../g );
-
-
-xxd(VAR => @ARGV[0] . "_hash", BYTES => \$hash);
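For reference, the xxd() routine in the deleted script above emits a NUL-terminated C byte array (plus a companion <var>_hash array holding the MD5 digest bytes). A minimal sketch of its output for a hypothetical input string "kernel" passed with VARNAME x264_opencl_source would look roughly like this:

    /* Sketch of cltostr.pl output for the hypothetical input "kernel":
     * each input byte becomes a hex literal ('k' = 0x6B, 'e' = 0x65, ...),
     * 16 values per row, with a terminating 0x00. */
    static const char x264_opencl_source[] = {
    0x6B, 0x65, 0x72, 0x6E, 0x65, 0x6C,
    0x00 };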
x264-snapshot-20140321-2245.tar.bz2/.gitignore -> x264-snapshot-20141104-2245.tar.bz2/.gitignore
Changed
@@ -39,6 +39,8 @@
 *.mbtree
 *.temp
 *.pyc
+*.pgd
+*.pgc
 .digress_x264
 dataDec.txt
x264-snapshot-20140321-2245.tar.bz2/AUTHORS -> x264-snapshot-20141104-2245.tar.bz2/AUTHORS
Changed
@@ -33,6 +33,14 @@
 D: BeOS and MacOS X ports.
 S: France

+N: Fiona Glaser
+E: fiona AT x264 DOT com
+D: Maintainer
+D: All areas of encoder analysis and algorithms
+D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
+D: x86 asm
+S: USA
+
 N: Gabriel Bouvigne
 E: bouvigne AT mp3-tech DOT org
 D: 2pass VBV
@@ -47,31 +55,25 @@
 D: 4:2:2 chroma subsampling, x86 asm, Windows improvements, bugfixes
 S: Sweden

-N: Jason Garrett-Glaser
-E: darkshikari AT gmail DOT com
-D: x86 asm, 1pass VBV, adaptive quantization, inline asm
-D: various speed optimizations, bugfixes
-S: USA
-
 N: Laurent Aimar
-E: fenrir AT via.ecp DOT fr
+E: fenrir AT videolan DOT org
 C: fenrir
 D: Intial import, former maintainer
 D: x86 asm (mmx/mmx2)
 S: France

 N: Loren Merritt
-E: lorenm AT u.washington DOT edu
+E: pengvado AT akuvian DOT org
 C: pengvado
-D: maintainer
+D: Maintainer
 D: All areas of encoder analysis and algorithms
-D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc.
+D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
 D: Multithreading
-D: x86 and x86_64 asm (mmx/mmx2/sse2)
+D: x86 asm
 S: USA

 N: Mans Rullgard
-E: mru AT inprovide DOT com
+E: mru AT mansr DOT com
 C: mru
 D: Rate control
 S: Southampton, UK
@@ -91,10 +93,6 @@
 D: gcc asm to nasm conversion
 S: China

-N: Phil Jensen
-E: philj AT csufresno DOT edu
-D: SPARC asm
-
 N: Radek Czyz
 E: radoslaw AT syskin DOT cjb DOT net
 D: Cached motion compensation
x264-snapshot-20140321-2245.tar.bz2/Makefile -> x264-snapshot-20141104-2245.tar.bz2/Makefile
Changed
@@ -88,17 +88,14 @@
 ifeq ($(ARCH),X86)
 ARCH_X86 = yes
 ASMSRC = $(X86SRC) common/x86/pixel-32.asm
-ASFLAGS += -DARCH_X86_64=0
 endif

 ifeq ($(ARCH),X86_64)
 ARCH_X86 = yes
 ASMSRC = $(X86SRC:-32.asm=-64.asm) common/x86/trellis-64.asm
-ASFLAGS += -DARCH_X86_64=1
 endif

 ifdef ARCH_X86
-ASFLAGS += -I$(SRCPATH)/common/x86/
 SRCS += common/x86/mc-c.c common/x86/predict-c.c
 OBJASM = $(ASMSRC:%.asm=%.o)
 $(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm
@@ -126,11 +123,18 @@
 endif
 endif

-# VIS optims
-ifeq ($(ARCH),UltraSPARC)
-ifeq ($(findstring HIGH_BIT_DEPTH, $(CONFIG)),)
-ASMSRC += common/sparc/pixel.asm
-OBJASM = $(ASMSRC:%.asm=%.o)
+# AArch64 NEON optims
+ifeq ($(ARCH),AARCH64)
+ifneq ($(AS),)
+ASMSRC += common/aarch64/dct-a.S \
+          common/aarch64/deblock-a.S \
+          common/aarch64/mc-a.S \
+          common/aarch64/pixel-a.S \
+          common/aarch64/predict-a.S \
+          common/aarch64/quant-a.S
+SRCS += common/aarch64/mc-c.c \
+        common/aarch64/predict-c.c
+OBJASM = $(ASMSRC:%.S=%.o)
 endif
 endif

@@ -148,7 +152,7 @@

 ifeq ($(HAVE_OPENCL),yes)
 common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
-	cat $^ | perl $(SRCPATH)/tools/cltostr.pl x264_opencl_source > $@
+	cat $^ | $(SRCPATH)/tools/cltostr.sh $@
 GENERATED += common/oclobj.h
 SRCS += common/opencl.c encoder/slicetype-cl.c
 endif

@@ -157,7 +161,7 @@
 OBJCLI += $(SRCCLI:%.c=%.o)
 OBJSO += $(SRCSO:%.c=%.o)

-.PHONY: all default fprofiled clean distclean install uninstall lib-static lib-shared cli install-lib-dev install-lib-static install-lib-shared install-cli
+.PHONY: all default fprofiled clean distclean install install-* uninstall cli lib-* etags

 cli: x264$(EXE)
 lib-static: $(LIBX264)

@@ -185,7 +189,7 @@

 $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend

-%.o: %.asm
+%.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
	$(AS) $(ASFLAGS) -o $@ $<
	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile

@@ -201,7 +205,12 @@

 .depend: config.mak
	@rm -f .depend
+	@echo 'dependency file generation...'
+ifeq ($(COMPILER),CL)
+	@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(SRCPATH)/tools/msvsdepend.sh "$(CC)" "$(CFLAGS)" "$(SRC)" "$(SRC:$(SRCPATH)/%.c=%.o)" 1>> .depend;)
+else
	@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
+endif

 config.mak:
	./configure

@@ -232,15 +241,20 @@
	$(MAKE) clean
	$(MAKE) x264$(EXE) CFLAGS="$(CFLAGS) $(PROF_GEN_CC)" LDFLAGS="$(LDFLAGS) $(PROF_GEN_LD)"
	$(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;))
+ifeq ($(COMPILER),CL)
+# Because Visual Studio timestamps the object files within the PGD, it fails to build if they change - only the executable should be deleted
+	rm -f x264$(EXE)
+else
	rm -f $(SRC2:%.c=%.o)
+endif
	$(MAKE) CFLAGS="$(CFLAGS) $(PROF_USE_CC)" LDFLAGS="$(LDFLAGS) $(PROF_USE_LD)"
-	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
+	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
 endif

 clean:
	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
	rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
-	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
+	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc

 distclean: clean
	rm -f config.mak x264_config.h config.h config.log x264.pc x264.def
x264-snapshot-20141104-2245.tar.bz2/common/aarch64
Added
+(directory)
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/asm.S
Added
@@ -0,0 +1,221 @@
+/*****************************************************************************
+ * asm.S: AArch64 utility macros
+ *****************************************************************************
+ * Copyright (C) 2008-2014 x264 project
+ *
+ * Authors: Mans Rullgard <mans@mansr.com>
+ *          David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "config.h"
+
+#ifdef PREFIX
+# define EXTERN_ASM _
+#else
+# define EXTERN_ASM
+#endif
+
+#ifdef __ELF__
+# define ELF
+#else
+# define ELF #
+#endif
+
+#ifdef __MACH__
+# define MACH
+#else
+# define MACH #
+#endif
+
+#if HAVE_AS_FUNC
+# define FUNC
+#else
+# define FUNC #
+#endif
+
+.macro function name, export=0, align=2
+    .macro endfunc
+ELF .size \name, . - \name
+FUNC .endfunc
+    .purgem endfunc
+    .endm
+    .text
+    .align \align
+    .if \export
+    .global EXTERN_ASM\name
+ELF .type EXTERN_ASM\name, %function
+FUNC .func EXTERN_ASM\name
+EXTERN_ASM\name:
+    .else
+ELF .type \name, %function
+FUNC .func \name
+\name:
+    .endif
+.endm
+
+.macro const name, align=2
+    .macro endconst
+ELF .size \name, . - \name
+    .purgem endconst
+    .endm
+ELF .section .rodata
+MACH .const_data
+    .align \align
+\name:
+.endm
+
+.macro movrel rd, val
+#if defined(PIC) && defined(__APPLE__)
+    adrp \rd, \val@PAGE
+    add \rd, \rd, \val@PAGEOFF
+#elif defined(PIC)
+    adrp \rd, \val
+    add \rd, \rd, :lo12:\val
+#else
+    ldr \rd, =\val
+#endif
+.endm
+
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
+
+#define FDEC_STRIDE 32
+#define FENC_STRIDE 16
+
+
+.macro SUMSUB_AB sum, sub, a, b
+    add \sum, \a, \b
+    sub \sub, \a, \b
+.endm
+
+.macro unzip t1, t2, s1, s2
+    uzp1 \t1, \s1, \s2
+    uzp2 \t2, \s1, \s2
+.endm
+
+.macro transpose t1, t2, s1, s2
+    trn1 \t1, \s1, \s2
+    trn2 \t2, \s1, \s2
+.endm
+
+.macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
+    transpose \t0\().2s, \t2\().2s, \v0\().2s, \v2\().2s
+    transpose \t1\().2s, \t3\().2s, \v1\().2s, \v3\().2s
+    transpose \v0\().4h, \v1\().4h, \t0\().4h, \t1\().4h
+    transpose \v2\().4h, \v3\().4h, \t2\().4h, \t3\().4h
+.endm
+
+.macro transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
+    transpose \t0\().4s, \t2\().4s, \v0\().4s, \v2\().4s
+    transpose \t1\().4s, \t3\().4s, \v1\().4s, \v3\().4s
+    transpose \v0\().8h, \v1\().8h, \t0\().8h, \t1\().8h
+    transpose \v2\().8h, \v3\().8h, \t2\().8h, \t3\().8h
+.endm
+
+
+.macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+    trn1 \r8\().8H, \r0\().8H, \r1\().8H
+    trn2 \r9\().8H, \r0\().8H, \r1\().8H
+    trn1 \r1\().8H, \r2\().8H, \r3\().8H
+    trn2 \r3\().8H, \r2\().8H, \r3\().8H
+    trn1 \r0\().8H, \r4\().8H, \r5\().8H
+    trn2 \r5\().8H, \r4\().8H, \r5\().8H
+    trn1 \r2\().8H, \r6\().8H, \r7\().8H
+    trn2 \r7\().8H, \r6\().8H, \r7\().8H
+
+    trn1 \r4\().4S, \r0\().4S, \r2\().4S
+    trn2 \r2\().4S, \r0\().4S, \r2\().4S
+    trn1 \r6\().4S, \r5\().4S, \r7\().4S
+    trn2 \r7\().4S, \r5\().4S, \r7\().4S
+    trn1 \r5\().4S, \r9\().4S, \r3\().4S
+    trn2 \r9\().4S, \r9\().4S, \r3\().4S
+    trn1 \r3\().4S, \r8\().4S, \r1\().4S
+    trn2 \r8\().4S, \r8\().4S, \r1\().4S
+
+    trn1 \r0\().2D, \r3\().2D, \r4\().2D
+    trn2 \r4\().2D, \r3\().2D, \r4\().2D
+
+    trn1 \r1\().2D, \r5\().2D, \r6\().2D
+    trn2 \r5\().2D, \r5\().2D, \r6\().2D
+
+    trn2 \r6\().2D, \r8\().2D, \r2\().2D
+    trn1 \r2\().2D, \r8\().2D, \r2\().2D
+
+    trn1 \r3\().2D, \r9\().2D, \r7\().2D
+    trn2 \r7\().2D, \r9\().2D, \r7\().2D
+.endm
+
+.macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+    trn1 \t0\().16b, \r0\().16b, \r1\().16b
+    trn2 \t1\().16b, \r0\().16b, \r1\().16b
+    trn1 \r1\().16b, \r2\().16b, \r3\().16b
+    trn2 \r3\().16b, \r2\().16b, \r3\().16b
+    trn1 \r0\().16b, \r4\().16b, \r5\().16b
+    trn2 \r5\().16b, \r4\().16b, \r5\().16b
+    trn1 \r2\().16b, \r6\().16b, \r7\().16b
+    trn2 \r7\().16b, \r6\().16b, \r7\().16b
+
+    trn1 \r4\().8h, \r0\().8h, \r2\().8h
+    trn2 \r2\().8h, \r0\().8h, \r2\().8h
+    trn1 \r6\().8h, \r5\().8h, \r7\().8h
+    trn2 \r7\().8h, \r5\().8h, \r7\().8h
+    trn1 \r5\().8h, \t1\().8h, \r3\().8h
+    trn2 \t1\().8h, \t1\().8h, \r3\().8h
+    trn1 \r3\().8h, \t0\().8h, \r1\().8h
+    trn2 \t0\().8h, \t0\().8h, \r1\().8h
+
+    trn1 \r0\().4s, \r3\().4s, \r4\().4s
+    trn2 \r4\().4s, \r3\().4s, \r4\().4s
+
+    trn1 \r1\().4s, \r5\().4s, \r6\().4s
+    trn2 \r5\().4s, \r5\().4s, \r6\().4s
+
+    trn2 \r6\().4s, \t0\().4s, \r2\().4s
+    trn1 \r2\().4s, \t0\().4s, \r2\().4s
+
+    trn1 \r3\().4s, \t1\().4s, \r7\().4s
+    trn2 \r7\().4s, \t1\().4s, \r7\().4s
+.endm
+
+.macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/dct-a.S
Added
@@ -0,0 +1,666 @@
+/*****************************************************************************
+ * dct-a.S: AArch64 transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+const scan4x4_frame, align=4
+.byte 0,1, 8,9, 2,3, 4,5
+.byte 10,11, 16,17, 24,25, 18,19
+.byte 12,13, 6,7, 14,15, 20,21
+.byte 26,27, 28,29, 22,23, 30,31
+endconst
+
+// sum = a + (b>>shift)   sub = (a>>shift) - b
+.macro SUMSUB_SHR shift sum sub a b t0 t1
+    sshr \t0, \b, #\shift
+    sshr \t1, \a, #\shift
+    add \sum, \a, \t0
+    sub \sub, \t1, \b
+.endm
+
+// sum = (a>>shift) + b   sub = a - (b>>shift)
+.macro SUMSUB_SHR2 shift sum sub a b t0 t1
+    sshr \t0, \a, #\shift
+    sshr \t1, \b, #\shift
+    add \sum, \t0, \b
+    sub \sub, \a, \t1
+.endm
+
+// a += 1.5*ma   b -= 1.5*mb
+.macro SUMSUB_15 a b ma mb t0 t1
+    sshr \t0, \ma, #1
+    sshr \t1, \mb, #1
+    add \t0, \t0, \ma
+    add \t1, \t1, \mb
+    add \a, \a, \t0
+    sub \b, \b, \t1
+.endm
+
+
+function x264_dct4x4dc_neon, export=1
+    ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
+    movi v31.4h, #1
+    SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
+    SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
+    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
+    SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
+    transpose v4.4h, v6.4h, v0.4h, v2.4h
+    transpose v5.4h, v7.4h, v1.4h, v3.4h
+    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
+    SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
+    transpose v4.2s, v5.2s, v0.2s, v1.2s
+    transpose v6.2s, v7.2s, v2.2s, v3.2s
+    add v16.4h, v4.4h, v31.4h
+    add v17.4h, v6.4h, v31.4h
+    srhadd v0.4h, v4.4h, v5.4h
+    shsub v1.4h, v16.4h, v5.4h
+    shsub v2.4h, v17.4h, v7.4h
+    srhadd v3.4h, v6.4h, v7.4h
+    st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
+    ret
+endfunc
+
+function x264_idct4x4dc_neon, export=1
+    ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
+    SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
+    SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
+    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
+    SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
+    transpose v4.4h, v6.4h, v0.4h, v2.4h
+    transpose v5.4h, v7.4h, v1.4h, v3.4h
+    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
+    SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
+    transpose v4.2s, v5.2s, v0.2s, v1.2s
+    transpose v6.2s, v7.2s, v2.2s, v3.2s
+    SUMSUB_AB v0.4h, v1.4h, v4.4h, v5.4h
+    SUMSUB_AB v3.4h, v2.4h, v6.4h, v7.4h
+    st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
+    ret
+endfunc
+
+.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
+    SUMSUB_AB \v1, \v6, \v5, \v6
+    SUMSUB_AB \v3, \v7, \v4, \v7
+    add \v0, \v3, \v1
+    add \v4, \v7, \v7
+    add \v5, \v6, \v6
+    sub \v2, \v3, \v1
+    add \v1, \v4, \v6
+    sub \v3, \v7, \v5
+.endm
+
+function x264_sub4x4_dct_neon, export=1
+    mov x3, #FENC_STRIDE
+    mov x4, #FDEC_STRIDE
+    ld1 {v0.s}[0], [x1], x3
+    ld1 {v1.s}[0], [x2], x4
+    ld1 {v2.s}[0], [x1], x3
+    usubl v16.8h, v0.8b, v1.8b
+    ld1 {v3.s}[0], [x2], x4
+    ld1 {v4.s}[0], [x1], x3
+    usubl v17.8h, v2.8b, v3.8b
+    ld1 {v5.s}[0], [x2], x4
+    ld1 {v6.s}[0], [x1], x3
+    usubl v18.8h, v4.8b, v5.8b
+    ld1 {v7.s}[0], [x2], x4
+    usubl v19.8h, v6.8b, v7.8b
+
+    DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
+    transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
+    DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
+    st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
+    ret
+endfunc
+
+function x264_sub8x4_dct_neon
+    ld1 {v0.8b}, [x1], x3
+    ld1 {v1.8b}, [x2], x4
+    usubl v16.8h, v0.8b, v1.8b
+    ld1 {v2.8b}, [x1], x3
+    ld1 {v3.8b}, [x2], x4
+    usubl v17.8h, v2.8b, v3.8b
+    ld1 {v4.8b}, [x1], x3
+    ld1 {v5.8b}, [x2], x4
+    usubl v18.8h, v4.8b, v5.8b
+    ld1 {v6.8b}, [x1], x3
+    ld1 {v7.8b}, [x2], x4
+    usubl v19.8h, v6.8b, v7.8b
+
+    DCT_1D v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
+    transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
+
+    SUMSUB_AB v16.8h, v19.8h, v0.8h, v3.8h
+    SUMSUB_AB v17.8h, v18.8h, v1.8h, v2.8h
+    add v22.8h, v19.8h, v19.8h
+    add v21.8h, v18.8h, v18.8h
+    add v0.8h, v16.8h, v17.8h
+    sub v1.8h, v16.8h, v17.8h
+
+    add v2.8h, v22.8h, v18.8h
+    sub v3.8h, v19.8h, v21.8h
+
+    zip1 v4.2d, v0.2d, v2.2d
+    zip2 v6.2d, v0.2d, v2.2d
+    zip1 v5.2d, v1.2d, v3.2d
+    zip2 v7.2d, v1.2d, v3.2d
+
+    st1 {v4.8h}, [x0], #16
+    st1 {v5.8h}, [x0], #16
+    st1 {v6.8h}, [x0], #16
+    st1 {v7.8h}, [x0], #16
+    ret
+endfunc
+
+function x264_sub8x8_dct_neon, export=1
+    mov x5, x30
+    mov x3, #FENC_STRIDE
+    mov x4, #FDEC_STRIDE
+    bl x264_sub8x4_dct_neon
+    mov x30, x5
+    b x264_sub8x4_dct_neon
+endfunc
+
+function x264_sub16x16_dct_neon, export=1
+    mov x5, x30
+    mov x3, #FENC_STRIDE
+    mov x4, #FDEC_STRIDE
+    bl x264_sub8x4_dct_neon
+    bl x264_sub8x4_dct_neon
+    sub x1, x1, #8*FENC_STRIDE-8
+    sub x2, x2, #8*FDEC_STRIDE-8
+    bl x264_sub8x4_dct_neon
+    bl x264_sub8x4_dct_neon
+    sub x1, x1, #8
+    sub x2, x2, #8
+    bl x264_sub8x4_dct_neon
+    bl x264_sub8x4_dct_neon
+    sub x1, x1, #8*FENC_STRIDE-8
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/dct.h
Added
@@ -0,0 +1,52 @@
+/*****************************************************************************
+ * dct.h: AArch64 transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_DCT_H
+#define X264_AARCH64_DCT_H
+
+void x264_dct4x4dc_neon( int16_t d[16] );
+void x264_idct4x4dc_neon( int16_t d[16] );
+
+void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
+void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
+void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
+
+void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
+void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
+void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
+void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
+
+void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
+
+#endif
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/deblock-a.S
Added
@@ -0,0 +1,392 @@
+/*****************************************************************************
+ * deblock.S: aarch64 deblocking
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: Mans Rullgard <mans@mansr.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.macro h264_loop_filter_start
+    cmp w2, #0
+    ldr w6, [x4]
+    ccmp w3, #0, #0, ne
+    mov v24.s[0], w6
+    and w6, w6, w6, lsl #16
+    b.eq 1f
+    ands w6, w6, w6, lsl #8
+    b.ge 2f
+1:
+    ret
+2:
+.endm
+
+.macro h264_loop_filter_luma
+    dup v22.16b, w2 // alpha
+    uxtl v24.8h, v24.8b
+    uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0)
+    uxtl v24.4s, v24.4h
+    uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
+    sli v24.8h, v24.8h, #8
+    uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
+    sli v24.4s, v24.4s, #16
+    cmhi v21.16b, v22.16b, v21.16b // < alpha
+    dup v22.16b, w3 // beta
+    cmlt v23.16b, v24.16b, #0
+    cmhi v28.16b, v22.16b, v28.16b // < beta
+    cmhi v30.16b, v22.16b, v30.16b // < beta
+    bic v21.16b, v21.16b, v23.16b
+    uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0)
+    and v21.16b, v21.16b, v28.16b
+    uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0)
+    cmhi v17.16b, v22.16b, v17.16b // < beta
+    and v21.16b, v21.16b, v30.16b
+    cmhi v19.16b, v22.16b, v19.16b // < beta
+    and v17.16b, v17.16b, v21.16b
+    and v19.16b, v19.16b, v21.16b
+    and v24.16b, v24.16b, v21.16b
+    urhadd v28.16b, v16.16b, v0.16b
+    sub v21.16b, v24.16b, v17.16b
+    uqadd v23.16b, v18.16b, v24.16b
+    uhadd v20.16b, v20.16b, v28.16b
+    sub v21.16b, v21.16b, v19.16b
+    uhadd v28.16b, v4.16b, v28.16b
+    umin v23.16b, v23.16b, v20.16b
+    uqsub v22.16b, v18.16b, v24.16b
+    uqadd v4.16b, v2.16b, v24.16b
+    umax v23.16b, v23.16b, v22.16b
+    uqsub v22.16b, v2.16b, v24.16b
+    umin v28.16b, v4.16b, v28.16b
+    uxtl v4.8h, v0.8b
+    umax v28.16b, v28.16b, v22.16b
+    uxtl2 v20.8h, v0.16b
+    usubw v4.8h, v4.8h, v16.8b
+    usubw2 v20.8h, v20.8h, v16.16b
+    shl v4.8h, v4.8h, #2
+    shl v20.8h, v20.8h, #2
+    uaddw v4.8h, v4.8h, v18.8b
+    uaddw2 v20.8h, v20.8h, v18.16b
+    usubw v4.8h, v4.8h, v2.8b
+    usubw2 v20.8h, v20.8h, v2.16b
+    rshrn v4.8b, v4.8h, #3
+    rshrn2 v4.16b, v20.8h, #3
+    bsl v17.16b, v23.16b, v18.16b
+    bsl v19.16b, v28.16b, v2.16b
+    neg v23.16b, v21.16b
+    uxtl v28.8h, v16.8b
+    smin v4.16b, v4.16b, v21.16b
+    uxtl2 v21.8h, v16.16b
+    smax v4.16b, v4.16b, v23.16b
+    uxtl v22.8h, v0.8b
+    uxtl2 v24.8h, v0.16b
+    saddw v28.8h, v28.8h, v4.8b
+    saddw2 v21.8h, v21.8h, v4.16b
+    ssubw v22.8h, v22.8h, v4.8b
+    ssubw2 v24.8h, v24.8h, v4.16b
+    sqxtun v16.8b, v28.8h
+    sqxtun2 v16.16b, v21.8h
+    sqxtun v0.8b, v22.8h
+    sqxtun2 v0.16b, v24.8h
+.endm
+
+function x264_deblock_v_luma_neon, export=1
+    h264_loop_filter_start
+
+    ld1 {v0.16b}, [x0], x1
+    ld1 {v2.16b}, [x0], x1
+    ld1 {v4.16b}, [x0], x1
+    sub x0, x0, x1, lsl #2
+    sub x0, x0, x1, lsl #1
+    ld1 {v20.16b}, [x0], x1
+    ld1 {v18.16b}, [x0], x1
+    ld1 {v16.16b}, [x0], x1
+
+    h264_loop_filter_luma
+
+    sub x0, x0, x1, lsl #1
+    st1 {v17.16b}, [x0], x1
+    st1 {v16.16b}, [x0], x1
+    st1 {v0.16b}, [x0], x1
+    st1 {v19.16b}, [x0]
+
+    ret
+endfunc
+
+function x264_deblock_h_luma_neon, export=1
+    h264_loop_filter_start
+
+    sub x0, x0, #4
+    ld1 {v6.8b}, [x0], x1
+    ld1 {v20.8b}, [x0], x1
+    ld1 {v18.8b}, [x0], x1
+    ld1 {v16.8b}, [x0], x1
+    ld1 {v0.8b}, [x0], x1
+    ld1 {v2.8b}, [x0], x1
+    ld1 {v4.8b}, [x0], x1
+    ld1 {v26.8b}, [x0], x1
+    ld1 {v6.d}[1], [x0], x1
+    ld1 {v20.d}[1], [x0], x1
+    ld1 {v18.d}[1], [x0], x1
+    ld1 {v16.d}[1], [x0], x1
+    ld1 {v0.d}[1], [x0], x1
+    ld1 {v2.d}[1], [x0], x1
+    ld1 {v4.d}[1], [x0], x1
+    ld1 {v26.d}[1], [x0], x1
+
+    transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
+
+    h264_loop_filter_luma
+
+    transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27
+
+    sub x0, x0, x1, lsl #4
+    add x0, x0, #2
+    st1 {v17.s}[0], [x0], x1
+    st1 {v16.s}[0], [x0], x1
+    st1 {v0.s}[0], [x0], x1
+    st1 {v19.s}[0], [x0], x1
+    st1 {v17.s}[1], [x0], x1
+    st1 {v16.s}[1], [x0], x1
+    st1 {v0.s}[1], [x0], x1
+    st1 {v19.s}[1], [x0], x1
+    st1 {v17.s}[2], [x0], x1
+    st1 {v16.s}[2], [x0], x1
+    st1 {v0.s}[2], [x0], x1
+    st1 {v19.s}[2], [x0], x1
+    st1 {v17.s}[3], [x0], x1
+    st1 {v16.s}[3], [x0], x1
+    st1 {v0.s}[3], [x0], x1
+    st1 {v19.s}[3], [x0], x1
+
+    ret
+endfunc
+
+.macro h264_loop_filter_chroma
+    dup v22.16b, w2 // alpha
+    uxtl v24.8h, v24.8b
+    uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
+    uxtl v4.8h, v0.8b
+    uxtl2 v5.8h, v0.16b
+    uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
+    usubw v4.8h, v4.8h, v16.8b
+    usubw2 v5.8h, v5.8h, v16.16b
+    sli v24.8h, v24.8h, #8
+    shl v4.8h, v4.8h, #2
+    shl v5.8h, v5.8h, #2
+    uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
+    uxtl v24.4s, v24.4h
+    uaddw v4.8h, v4.8h, v18.8b
+    uaddw2 v5.8h, v5.8h, v18.16b
+    cmhi v26.16b, v22.16b, v26.16b // < alpha
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/mc-a.S
Added
@@ -0,0 +1,1365 @@
+/*****************************************************************************
+ * mc.S: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *          Mans Rullgard <mans@mansr.com>
+ *          Stefan Groenroos <stefan.gronroos@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+// note: prefetch stuff assumes 64-byte cacheline
+
+// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
+function x264_prefetch_ref_aarch64, export=1
+    cmp w2, #1
+    csel x2, xzr, x1, eq
+    add x0, x0, #64
+    add x0, x0, x2, lsl #3
+
+    lsl x2, x1, #1
+    add x3, x1, x1, lsl #1
+    add x4, x0, x1, lsl #2
+
+    prfm pldl1strm, [x0]
+    prfm pldl1strm, [x0, x1]
+    prfm pldl1strm, [x0, x2]
+    prfm pldl1strm, [x0, x3]
+    prfm pldl1strm, [x4]
+    prfm pldl1strm, [x4, x1]
+    prfm pldl1strm, [x4, x2]
+    prfm pldl1strm, [x4, x3]
+    ret
+endfunc
+
+// void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y,
+//                     uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
+.macro x264_prefetch_fenc sub
+function x264_prefetch_fenc_\sub\()_aarch64, export=1
+    and w6, w5, #3
+    and w7, w5, #3
+    mul x6, x6, x1
+    mul x7, x7, x3
+    add x0, x0, #64
+    add x2, x2, #64
+
+    add x0, x0, x6, lsl #2
+    add x6, x0, x1, lsl #1
+    prfm pldl1strm, [x0]
+    prfm pldl1strm, [x0, x1]
+    prfm pldl1strm, [x6]
+    prfm pldl1strm, [x6, x1]
+
+    add x2, x2, x7, lsl #1
+    prfm pldl1strm, [x2]
+    prfm pldl1strm, [x2, x3]
+.ifc \sub, 422
+    add x7, x2, x3, lsl #1
+    prfm pldl1strm, [x7]
+    prfm pldl1strm, [x7, x3]
+.endif
+    ret
+endfunc
+.endm
+
+x264_prefetch_fenc 420
+x264_prefetch_fenc 422
+
+// void pixel_avg( uint8_t *dst,  intptr_t dst_stride,
+//                 uint8_t *src1, intptr_t src1_stride,
+//                 uint8_t *src2, intptr_t src2_stride, int weight );
+.macro AVGH w h
+function x264_pixel_avg_\w\()x\h\()_neon, export=1
+    mov w10, #64
+    cmp w6, #32
+    mov w9, #\h
+    b.eq pixel_avg_w\w\()_neon
+    subs w7, w10, w6
+    b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64
+    cmp w6, #0
+    b.ge pixel_avg_weight_w\w\()_add_add_neon
+    b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
+endfunc
+.endm
+
+AVGH 4, 2
+AVGH 4, 4
+AVGH 4, 8
+AVGH 4, 16
+AVGH 8, 4
+AVGH 8, 8
+AVGH 8, 16
+AVGH 16, 8
+AVGH 16, 16
+
+// 0 < weight < 64
+.macro load_weights_add_add
+    mov w6, w6
+.endm
+.macro weight_add_add dst, s1, s2, h=
+.ifc \h, 2
+    umull2 \dst, \s1, v30.16b
+    umlal2 \dst, \s2, v31.16b
+.else
+    umull \dst, \s1, v30.8b
+    umlal \dst, \s2, v31.8b
+.endif
+.endm
+
+// weight > 64
+.macro load_weights_add_sub
+    neg w7, w7
+.endm
+.macro weight_add_sub dst, s1, s2, h=
+.ifc \h, 2
+    umull2 \dst, \s1, v30.16b
+    umlsl2 \dst, \s2, v31.16b
+.else
+    umull \dst, \s1, v30.8b
+    umlsl \dst, \s2, v31.8b
+.endif
+.endm
+
+// weight < 0
+.macro load_weights_sub_add
+    neg w6, w6
+.endm
+.macro weight_sub_add dst, s1, s2, h=
+.ifc \h, 2
+    umull2 \dst, \s2, v31.16b
+    umlsl2 \dst, \s1, v30.16b
+.else
+    umull \dst, \s2, v31.8b
+    umlsl \dst, \s1, v30.8b
+.endif
+.endm
+
+.macro AVG_WEIGHT ext
+function pixel_avg_weight_w4_\ext\()_neon
+    load_weights_\ext
+    dup v30.8b, w6
+    dup v31.8b, w7
+1:  // height loop
+    subs w9, w9, #2
+    ld1 {v0.s}[0], [x2], x3
+    ld1 {v1.s}[0], [x4], x5
+    weight_\ext v4.8h, v0.8b, v1.8b
+    ld1 {v2.s}[0], [x2], x3
+    ld1 {v3.s}[0], [x4], x5
+    sqrshrun v0.8b, v4.8h, #6
+    weight_\ext v5.8h, v2.8b, v3.8b
+    st1 {v0.s}[0], [x0], x1
+    sqrshrun v1.8b, v5.8h, #6
+    st1 {v1.s}[0], [x0], x1
+    b.gt 1b
+    ret
+endfunc
+
+function pixel_avg_weight_w8_\ext\()_neon
+    load_weights_\ext
+    dup v30.8b, w6
+    dup v31.8b, w7
+1:  // height loop
+    subs w9, w9, #4
+    ld1 {v0.8b}, [x2], x3
+    ld1 {v1.8b}, [x4], x5
+    weight_\ext v16.8h, v0.8b, v1.8b
+    ld1 {v2.8b}, [x2], x3
+    ld1 {v3.8b}, [x4], x5
+    weight_\ext v17.8h, v2.8b, v3.8b
+    ld1 {v4.8b}, [x2], x3
+    ld1 {v5.8b}, [x4], x5
+    weight_\ext v18.8h, v4.8b, v5.8b
+    ld1 {v6.8b}, [x2], x3
+    ld1 {v7.8b}, [x4], x5
+    weight_\ext v19.8h, v6.8b, v7.8b
+    sqrshrun v0.8b, v16.8h, #6
+    sqrshrun v1.8b, v17.8h, #6
+    sqrshrun v2.8b, v18.8h, #6
+    sqrshrun v3.8b, v19.8h, #6
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/mc-c.c
Added
@@ -0,0 +1,249 @@
+/*****************************************************************************
+ * mc-c.c: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "mc.h"
+
+void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int );
+void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_prefetch_fenc_422_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
+void x264_memzero_aligned_neon( void *dst, size_t n );
+
+void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x2_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+
+void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
+                                        pixel *dstv, intptr_t i_dstv,
+                                        pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
+                                            pixel *dstb, intptr_t i_dstb,
+                                            pixel *dstc, intptr_t i_dstc,
+                                            pixel *src, intptr_t i_src, int pw, int w, int h );
+void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst,
+                                      pixel *srcu, intptr_t i_srcu,
+                                      pixel *srcv, intptr_t i_srcv, int w, int h );
+
+void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+
+#define MC_WEIGHT(func)\
+void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+\
+static void (* x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\
+{\
+    x264_mc_weight_w4##func##_neon,\
+    x264_mc_weight_w4##func##_neon,\
+    x264_mc_weight_w8##func##_neon,\
+    x264_mc_weight_w16##func##_neon,\
+    x264_mc_weight_w16##func##_neon,\
+    x264_mc_weight_w20##func##_neon,\
+};
+
+MC_WEIGHT()
+MC_WEIGHT(_nodenom)
+MC_WEIGHT(_offsetadd)
+MC_WEIGHT(_offsetsub)
+
+void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
+
+#if !HIGH_BIT_DEPTH
+static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
+{
+    if( w->i_scale == 1<<w->i_denom )
+    {
+        if( w->i_offset < 0 )
+        {
+            w->weightfn = x264_mc_offsetsub_wtab_neon;
+            w->cachea[0] = -w->i_offset;
+        }
+        else
+        {
+            w->weightfn = x264_mc_offsetadd_wtab_neon;
+            w->cachea[0] = w->i_offset;
+        }
+    }
+    else if( !w->i_denom )
+        w->weightfn = x264_mc_nodenom_wtab_neon;
+    else
+        w->weightfn = x264_mc_wtab_neon;
+}
+
+static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
+{
+    NULL,
+    x264_pixel_avg2_w4_neon,
+    x264_pixel_avg2_w8_neon,
+    x264_pixel_avg2_w16_neon,   // no slower than w12, so no point in a separate function
+    x264_pixel_avg2_w16_neon,
+    x264_pixel_avg2_w20_neon,
+};
+
+static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
+{
+    NULL,
+    x264_mc_copy_w4_neon,
+    x264_mc_copy_w8_neon,
+    NULL,
+    x264_mc_copy_w16_neon,
+};
+
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+
+static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
+                          uint8_t *src[4], intptr_t i_src_stride,
+                          int mvx, int mvy,
+                          int i_width, int i_height, const x264_weight_t *weight )
+{
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+    if ( (mvy&3) == 3 ) // explict if() to force conditional add
+        src1 += i_src_stride;
+
+    if( qpel_idx & 5 ) /* qpel interpolation needed */
+    {
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+        x264_pixel_avg_wtab_neon[i_width>>2](
+                dst, i_dst_stride, src1, i_src_stride,
+                src2, i_height );
+        if( weight->weightfn )
+            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
+    }
+    else if( weight->weightfn )
+        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
+    else
+        x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
+}
+
+static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
+                              uint8_t *src[4], intptr_t i_src_stride,
+                              int mvx, int mvy,
+                              int i_width, int i_height, const x264_weight_t *weight )
+{
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+    if ( (mvy&3) == 3 ) // explict if() to force conditional add
+        src1 += i_src_stride;
+
+    if( qpel_idx & 5 ) /* qpel interpolation needed */
+    {
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+        x264_pixel_avg_wtab_neon[i_width>>2](
+                dst, *i_dst_stride, src1, i_src_stride,
+                src2, i_height );
+        if( weight->weightfn )
+            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
+        return dst;
+    }
+    else if( weight->weightfn )
+    {
+        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
+        return dst;
+    }
+    else
+    {
+        *i_dst_stride = i_src_stride;
+        return src1;
+    }
+}
+
+void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+                            uint8_t *src, intptr_t stride, int width,
+                            int height, int16_t *buf );
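The qpel_idx dispatch in mc_luma_neon and get_ref_neon above packs the quarter-pel fractions of the motion vector into a four-bit code. A minimal standalone sketch, with hypothetical example values, of how that code selects between a plain plane copy and two-plane averaging:

    #include <stdio.h>

    int main(void)
    {
        int mvx = 1, mvy = 2; /* hypothetical quarter-pel MV fractions */
        /* Same packing as above: two low bits of mvy, then two low bits of mvx. */
        int qpel_idx = ((mvy & 3) << 2) + (mvx & 3);
        /* Bits 0 and 2 flag a horizontal/vertical quarter-pel offset, so
         * qpel_idx & 5 selects the interpolation (averaging) path. */
        printf("qpel_idx = %d -> %s\n", qpel_idx,
               (qpel_idx & 5) ? "average two half-pel planes"
                              : "copy a single plane");
        return 0;
    }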
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/mc.h
Added
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * mc.h: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_MC_H
+#define X264_AARCH64_MC_H
+
+void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf );
+
+#endif
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/pixel-a.S
Added
@@ -0,0 +1,1153 @@
+/*****************************************************************************
+ * pixel.S: aarch64 pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+const mask
+.rept 16
+.byte 0xff
+.endr
+.rept 16
+.byte 0x00
+.endr
+endconst
+
+const mask_ac_4_8
+.short 0, -1, -1, -1, 0, -1, -1, -1
+.short 0, -1, -1, -1, -1, -1, -1, -1
+endconst
+
+.macro SAD_START_4
+    ld1 {v1.s}[0], [x2], x3
+    ld1 {v0.s}[0], [x0], x1
+    ld1 {v1.s}[1], [x2], x3
+    ld1 {v0.s}[1], [x0], x1
+    uabdl v16.8h, v0.8b, v1.8b
+.endm
+
+.macro SAD_4
+    ld1 {v1.s}[0], [x2], x3
+    ld1 {v0.s}[0], [x0], x1
+    ld1 {v1.s}[1], [x2], x3
+    ld1 {v0.s}[1], [x0], x1
+    uabal v16.8h, v0.8b, v1.8b
+.endm
+
+.macro SAD_START_8
+    ld1 {v1.8b}, [x2], x3
+    ld1 {v0.8b}, [x0], x1
+    ld1 {v3.8b}, [x2], x3
+    ld1 {v2.8b}, [x0], x1
+    uabdl v16.8h, v0.8b, v1.8b
+    uabdl v17.8h, v2.8b, v3.8b
+.endm
+
+.macro SAD_8
+    ld1 {v1.8b}, [x2], x3
+    ld1 {v0.8b}, [x0], x1
+    ld1 {v3.8b}, [x2], x3
+    ld1 {v2.8b}, [x0], x1
+    uabal v16.8h, v0.8b, v1.8b
+    uabal v17.8h, v2.8b, v3.8b
+.endm
+
+.macro SAD_START_16
+    ld1 {v1.16b}, [x2], x3
+    ld1 {v0.16b}, [x0], x1
+    ld1 {v3.16b}, [x2], x3
+    ld1 {v2.16b}, [x0], x1
+    uabdl v16.8h, v0.8b, v1.8b
+    uabdl2 v17.8h, v0.16b, v1.16b
+    uabal v16.8h, v2.8b, v3.8b
+    uabal2 v17.8h, v2.16b, v3.16b
+.endm
+
+.macro SAD_16
+    ld1 {v1.16b}, [x2], x3
+    ld1 {v0.16b}, [x0], x1
+    ld1 {v3.16b}, [x2], x3
+    ld1 {v2.16b}, [x0], x1
+    uabal v16.8h, v0.8b, v1.8b
+    uabal2 v17.8h, v0.16b, v1.16b
+    uabal v16.8h, v2.8b, v3.8b
+    uabal2 v17.8h, v2.16b, v3.16b
+.endm
+
+.macro SAD_FUNC w, h, name
+function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
+    SAD_START_\w
+
+.rept \h / 2 - 1
+    SAD_\w
+.endr
+.if \w > 4
+    add v16.8h, v16.8h, v17.8h
+.endif
+    uaddlv s0, v16.8h
+    fmov w0, s0
+    ret
+endfunc
+.endm
+
+SAD_FUNC 4, 4
+SAD_FUNC 4, 8
+SAD_FUNC 8, 4
+SAD_FUNC 8, 8
+SAD_FUNC 8, 16
+SAD_FUNC 16, 8
+SAD_FUNC 16, 16
+
+.macro SAD_X_4 x, first=uabal
+    ld1 {v0.s}[0], [x0], x7
+    ld1 {v1.s}[0], [x1], x5
+    ld1 {v0.s}[1], [x0], x7
+    ld1 {v1.s}[1], [x1], x5
+    \first v16.8h, v1.8b, v0.8b
+    ld1 {v2.s}[0], [x2], x5
+    ld1 {v2.s}[1], [x2], x5
+    \first v17.8h, v2.8b, v0.8b
+    ld1 {v3.s}[0], [x3], x5
+    ld1 {v3.s}[1], [x3], x5
+    \first v18.8h, v3.8b, v0.8b
+.if \x == 4
+    ld1 {v4.s}[0], [x4], x5
+    ld1 {v4.s}[1], [x4], x5
+    \first v19.8h, v4.8b, v0.8b
+.endif
+.endm
+
+.macro SAD_X_8 x, first=uabal
+    ld1 {v0.8b}, [x0], x7
+    ld1 {v1.8b}, [x1], x5
+    \first v16.8h, v1.8b, v0.8b
+    ld1 {v2.8b}, [x2], x5
+    ld1 {v5.8b}, [x0], x7
+    \first v17.8h, v2.8b, v0.8b
+    ld1 {v3.8b}, [x3], x5
+    ld1 {v1.8b}, [x1], x5
+    \first v18.8h, v3.8b, v0.8b
+    uabal v16.8h, v1.8b, v5.8b
+    ld1 {v2.8b}, [x2], x5
+    ld1 {v3.8b}, [x3], x5
+    uabal v17.8h, v2.8b, v5.8b
+    uabal v18.8h, v3.8b, v5.8b
+.if \x == 4
+    ld1 {v4.8b}, [x4], x5
+    \first v19.8h, v4.8b, v0.8b
+    ld1 {v4.8b}, [x4], x5
+    uabal v19.8h, v4.8b, v5.8b
+.endif
+.endm
+
+.macro SAD_X_16 x, first=uabal
+    ld1 {v0.16b}, [x0], x7
+    ld1 {v1.16b}, [x1], x5
+    \first v16.8h, v1.8b, v0.8b
+    \first\()2 v20.8h, v1.16b, v0.16b
+    ld1 {v2.16b}, [x2], x5
+    ld1 {v5.16b}, [x0], x7
+    \first v17.8h, v2.8b, v0.8b
+    \first\()2 v21.8h, v2.16b, v0.16b
+    ld1 {v3.16b}, [x3], x5
+    ld1 {v1.16b}, [x1], x5
+    \first v18.8h, v3.8b, v0.8b
+    \first\()2 v22.8h, v3.16b, v0.16b
+    uabal v16.8h, v1.8b, v5.8b
+    uabal2 v20.8h, v1.16b, v5.16b
+    ld1 {v2.16b}, [x2], x5
+    ld1 {v3.16b}, [x3], x5
+    uabal v17.8h, v2.8b, v5.8b
+    uabal2 v21.8h, v2.16b, v5.16b
+    uabal v18.8h, v3.8b, v5.8b
+    uabal2 v22.8h, v3.16b, v5.16b
+.if \x == 4
+    ld1 {v4.16b}, [x4], x5
+    \first v19.8h, v4.8b, v0.8b
+    \first\()2 v23.8h, v4.16b, v0.16b
+    ld1 {v4.16b}, [x4], x5
+    uabal v19.8h, v4.8b, v5.8b
+    uabal2 v23.8h, v4.16b, v5.16b
+.endif
+.endm
+
+.macro SAD_X_FUNC x, w, h
+function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
+.if \x == 3
+    mov x6, x5
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/pixel.h
Added
@@ -0,0 +1,69 @@
+/*****************************************************************************
+ * pixel.h: aarch64 pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_PIXEL_H
+#define X264_AARCH64_PIXEL_H
+
+#define DECL_PIXELS( ret, name, suffix, args ) \
+    ret x264_pixel_##name##_16x16_##suffix args;\
+    ret x264_pixel_##name##_16x8_##suffix args;\
+    ret x264_pixel_##name##_8x16_##suffix args;\
+    ret x264_pixel_##name##_8x8_##suffix args;\
+    ret x264_pixel_##name##_8x4_##suffix args;\
+    ret x264_pixel_##name##_4x8_##suffix args;\
+    ret x264_pixel_##name##_4x4_##suffix args;\
+
+#define DECL_X1( name, suffix ) \
+    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
+
+#define DECL_X4( name, suffix ) \
+    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
+    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
+
+DECL_X1( sad, neon )
+DECL_X4( sad, neon )
+DECL_X1( satd, neon )
+DECL_X1( ssd, neon )
+
+int x264_pixel_sa8d_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t );
+int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
+
+uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
+uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
+int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+
+uint64_t x264_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t );
+uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
+
+void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
+                                      const uint8_t *, intptr_t,
+                                      int sums[2][4] );
+float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
+
+#endif
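Each DECL_PIXELS use above stamps out one prototype per partition size; for instance, DECL_X1( sad, neon ) expands (in part) to declarations like the following:

    /* Two of the seven prototypes generated by DECL_X1( sad, neon ): */
    int x264_pixel_sad_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
    int x264_pixel_sad_4x4_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );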
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/predict-a.S
Added
@@ -0,0 +1,661 @@
+/*****************************************************************************
+ * predict.S: aarch64 intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *          Mans Rullgard <mans@mansr.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+const p8weight, align=4
+    .short 1, 2, 3, 4, 1, 2, 3, 4
+endconst
+const p16weight, align=4
+    .short 1, 2, 3, 4, 5, 6, 7, 8
+endconst
+
+.macro ldcol.8 vd, xn, xm, n=8, hi=0
+.if \n == 8 || \hi == 0
+    ld1 {\vd\().b}[0], [\xn], \xm
+    ld1 {\vd\().b}[1], [\xn], \xm
+    ld1 {\vd\().b}[2], [\xn], \xm
+    ld1 {\vd\().b}[3], [\xn], \xm
+.endif
+.if \n == 8 || \hi == 1
+    ld1 {\vd\().b}[4], [\xn], \xm
+    ld1 {\vd\().b}[5], [\xn], \xm
+    ld1 {\vd\().b}[6], [\xn], \xm
+    ld1 {\vd\().b}[7], [\xn], \xm
+.endif
+.endm
+
+.macro ldcol.16 vd, xn, xm
+    ldcol.8 \vd, \xn, \xm
+    ld1 {\vd\().b}[ 8], [\xn], \xm
+    ld1 {\vd\().b}[ 9], [\xn], \xm
+    ld1 {\vd\().b}[10], [\xn], \xm
+    ld1 {\vd\().b}[11], [\xn], \xm
+    ld1 {\vd\().b}[12], [\xn], \xm
+    ld1 {\vd\().b}[13], [\xn], \xm
+    ld1 {\vd\().b}[14], [\xn], \xm
+    ld1 {\vd\().b}[15], [\xn], \xm
+.endm
+
+
+function x264_predict_4x4_h_aarch64, export=1
+    ldrb w1, [x0, #0*FDEC_STRIDE-1]
+    ldrb w2, [x0, #1*FDEC_STRIDE-1]
+    ldrb w3, [x0, #2*FDEC_STRIDE-1]
+    ldrb w4, [x0, #3*FDEC_STRIDE-1]
+    add w1, w1, w1, lsl #8
+    add w2, w2, w2, lsl #8
+    add w3, w3, w3, lsl #8
+    add w4, w4, w4, lsl #8
+    add w1, w1, w1, lsl #16
+    str w1, [x0, #0*FDEC_STRIDE]
+    add w2, w2, w2, lsl #16
+    str w2, [x0, #1*FDEC_STRIDE]
+    add w3, w3, w3, lsl #16
+    str w3, [x0, #2*FDEC_STRIDE]
+    add w4, w4, w4, lsl #16
+    str w4, [x0, #3*FDEC_STRIDE]
+    ret
+endfunc
+
+function x264_predict_4x4_v_aarch64, export=1
+    ldr w1, [x0, #0 - 1 * FDEC_STRIDE]
+    str w1, [x0, #0 + 0 * FDEC_STRIDE]
+    str w1, [x0, #0 + 1 * FDEC_STRIDE]
+    str w1, [x0, #0 + 2 * FDEC_STRIDE]
+    str w1, [x0, #0 + 3 * FDEC_STRIDE]
+    ret
+endfunc
+
+function x264_predict_4x4_dc_neon, export=1
+    sub x1, x0, #FDEC_STRIDE
+    sub x2, x0, #1
+    mov x7, #FDEC_STRIDE
+    ld1 {v0.8b}, [x1]
+    ld1r {v1.8b}, [x2], x7
+    ld1r {v2.8b}, [x2], x7
+    ld1r {v3.8b}, [x2], x7
+    ld1r {v4.8b}, [x2], x7
+    uaddlp v0.4h, v0.8b
+    uaddl v1.8h, v1.8b, v2.8b
+    uaddl v2.8h, v3.8b, v4.8b
+    addp v0.4h, v0.4h, v0.4h
+    add v1.4h, v1.4h, v2.4h
+    dup v0.4h, v0.h[0]
+    add v0.4h, v0.4h, v1.4h
+    rshrn v0.8b, v0.8h, #3
+    str s0, [x0], #FDEC_STRIDE
+    str s0, [x0], #FDEC_STRIDE
+    str s0, [x0], #FDEC_STRIDE
+    str s0, [x0]
+    ret
+endfunc
+
+function x264_predict_4x4_dc_top_neon, export=1
+    sub x1, x0, #FDEC_STRIDE
+    mov x7, #FDEC_STRIDE
+    ld1 {v0.8b}, [x1]
+    uaddlp v0.4h, v0.8b
+    addp v0.4h, v0.4h, v0.4h
+    dup v0.4h, v0.h[0]
+    rshrn v0.8b, v0.8h, #2
+    str s0, [x0], #FDEC_STRIDE
+    str s0, [x0], #FDEC_STRIDE
+    str s0, [x0], #FDEC_STRIDE
+    str s0, [x0]
+    ret
+endfunc
+
+function x264_predict_4x4_ddr_neon, export=1
+    sub x1, x0, #FDEC_STRIDE+1
+    mov x7, #FDEC_STRIDE
+    ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1
+    ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1
+    ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1
+    ext v0.8b, v1.8b, v0.8b, #7
+    ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1
+    ext v0.8b, v2.8b, v0.8b, #7 // a
+    ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1
+    ext v1.8b, v3.8b, v0.8b, #7 // b
+    ext v2.8b, v4.8b, v1.8b, #7 // c
+    uaddl v0.8h, v0.8b, v1.8b
+    uaddl v1.8h, v1.8b, v2.8b
+    add v0.8h, v0.8h, v1.8h
+    rshrn v0.8b, v0.8h, #2
+
+    ext v3.8b, v0.8b, v0.8b, #3
+    ext v2.8b, v0.8b, v0.8b, #2
+    ext v1.8b, v0.8b, v0.8b, #1
+
+    str s3, [x0], #FDEC_STRIDE
+    str s2, [x0], #FDEC_STRIDE
+    str s1, [x0], #FDEC_STRIDE
+    str s0, [x0]
+    ret
+endfunc
+
+function x264_predict_4x4_ddl_neon, export=1
+    sub x0, x0, #FDEC_STRIDE
+    mov x7, #FDEC_STRIDE
+    ld1 {v0.8b}, [x0], x7
+    dup v3.8b, v0.b[7]
+    ext v1.8b, v0.8b, v0.8b, #1
+    ext v2.8b, v0.8b, v3.8b, #2
+    uhadd v0.8b, v0.8b, v2.8b
+    urhadd v0.8b, v0.8b, v1.8b
+    str s0, [x0], #FDEC_STRIDE
+    ext v1.8b, v0.8b, v0.8b, #1
+    ext v2.8b, v0.8b, v0.8b, #2
+    str s1, [x0], #FDEC_STRIDE
+    ext v3.8b, v0.8b, v0.8b, #3
+    str s2, [x0], #FDEC_STRIDE
+    str s3, [x0]
+    ret
+endfunc
+
+function x264_predict_8x8_dc_neon, export=1
+    mov x7, #FDEC_STRIDE
+    ld1 {v0.16b}, [x1], #16
+    ld1 {v1.8b}, [x1]
+    ext v0.16b, v0.16b, v0.16b, #7
+    uaddlv h1, v1.8b
+    uaddlv h0, v0.8b
+    add v0.8h, v0.8h, v1.8h
+    dup v0.8h, v0.h[0]
+    rshrn v0.8b, v0.8h, #4
+.rept 8
+    st1 {v0.8b}, [x0], x7
+.endr
+    ret
+endfunc
+
+function x264_predict_8x8_h_neon, export=1
+    mov x7, #FDEC_STRIDE
+    ld1 {v16.16b}, [x1]
+    dup v0.8b, v16.b[14]
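The scalar 4x4 predictors earlier in this file are easiest to check against a C reference: vertical prediction copies the row above the block into all four rows, horizontal broadcasts each left-neighbour pixel across its row, with FDEC_STRIDE (32) as the decoded-frame pitch. A sketch equivalent to the aarch64 versions:

    #include <stdint.h>
    #include <string.h>

    #define FDEC_STRIDE 32  /* decoded-block pitch, as defined in asm.S */

    static void predict_4x4_v_c( uint8_t *src )  /* cf. x264_predict_4x4_v_aarch64 */
    {
        for( int y = 0; y < 4; y++ )
            memcpy( &src[y*FDEC_STRIDE], &src[-FDEC_STRIDE], 4 );
    }

    static void predict_4x4_h_c( uint8_t *src )  /* cf. x264_predict_4x4_h_aarch64 */
    {
        for( int y = 0; y < 4; y++ )
            memset( &src[y*FDEC_STRIDE], src[y*FDEC_STRIDE - 1], 4 );
    }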
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/predict-c.c
Added
@@ -0,0 +1,114 @@
+/*****************************************************************************
+ * predict.c: aarch64 intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "predict.h"
+#include "pixel.h"
+
+void x264_predict_4x4_dc_top_neon( uint8_t *src );
+void x264_predict_4x4_ddr_neon( uint8_t *src );
+void x264_predict_4x4_ddl_neon( uint8_t *src );
+
+void x264_predict_8x8c_dc_top_neon( uint8_t *src );
+void x264_predict_8x8c_dc_left_neon( uint8_t *src );
+void x264_predict_8x8c_p_neon( uint8_t *src );
+
+void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
+
+void x264_predict_16x16_dc_top_neon( uint8_t *src );
+void x264_predict_16x16_dc_left_neon( uint8_t *src );
+void x264_predict_16x16_p_neon( uint8_t *src );
+
+void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] )
+{
+#if !HIGH_BIT_DEPTH
+    if (cpu&X264_CPU_ARMV8)
+    {
+        pf[I_PRED_4x4_H] = x264_predict_4x4_h_aarch64;
+        pf[I_PRED_4x4_V] = x264_predict_4x4_v_aarch64;
+    }
+
+    if (cpu&X264_CPU_NEON)
+    {
+        pf[I_PRED_4x4_DC]     = x264_predict_4x4_dc_neon;
+        pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
+        pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
+        pf[I_PRED_4x4_DDR]    = x264_predict_4x4_ddr_neon;
+    }
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] )
+{
+    if (!(cpu&X264_CPU_NEON))
+        return;
+
+#if !HIGH_BIT_DEPTH
+    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
+    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
+    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
+    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
+    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_neon;
+    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
+{
+    if (!(cpu&X264_CPU_NEON))
+        return;
+
+#if !HIGH_BIT_DEPTH
+    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
+    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
+    pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
+    pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
+    pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
+    pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
+    pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
+    pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
+    pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] )
+{
+    if (!(cpu&X264_CPU_NEON))
+        return;
+
+#if !HIGH_BIT_DEPTH
+    pf[I_PRED_16x16_DC ]    = x264_predict_16x16_dc_neon;
+    pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon;
+    pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon;
+    pf[I_PRED_16x16_H ]     = x264_predict_16x16_h_neon;
+    pf[I_PRED_16x16_V ]     = x264_predict_16x16_v_neon;
+    pf[I_PRED_16x16_P ]     = x264_predict_16x16_p_neon;
+#endif // !HIGH_BIT_DEPTH
+}
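For context, these init hooks are meant to run once at startup after the portable C defaults have been installed; each table entry they overwrite is then used for every later prediction call. A hedged usage sketch (the x264_predict_t shape and the driver below are illustrative, not the encoder's actual wiring):

    typedef void (*x264_predict_t)( uint8_t *src );  /* shape of one predictor */

    void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] );

    static void init_predictors( int cpu, x264_predict_t pf[12] )
    {
        /* ...fill pf[] with the C fallbacks first (omitted here)... */
        x264_predict_4x4_init_aarch64( cpu, pf );  /* overrides when flags allow */
    }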
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/predict.h
Added
@@ -0,0 +1,52 @@
+/*****************************************************************************
+ * predict.h: aarch64 intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_PREDICT_H
+#define X264_AARCH64_PREDICT_H
+
+void x264_predict_4x4_h_aarch64( uint8_t *src );
+void x264_predict_4x4_v_aarch64( uint8_t *src );
+
+// for the merged 4x4 intra sad/satd which expects unified suffix
+#define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64
+#define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
+
+void x264_predict_4x4_dc_neon( uint8_t *src );
+void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8c_dc_neon( uint8_t *src );
+void x264_predict_8x8c_h_neon( uint8_t *src );
+void x264_predict_8x8c_v_neon( uint8_t *src );
+void x264_predict_16x16_v_neon( uint8_t *src );
+void x264_predict_16x16_h_neon( uint8_t *src );
+void x264_predict_16x16_dc_neon( uint8_t *src );
+
+void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] );
+void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
+void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] );
+void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] );
+
+#endif /* X264_AARCH64_PREDICT_H */
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/quant-a.S
Added
@@ -0,0 +1,386 @@
+/****************************************************************************
+ * quant.S: arm quantization and level-run
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
+    add v18.8h, v18.8h, \bias0
+    add v19.8h, v19.8h, \bias1
+    umull v20.4s, v18.4h, \mf0_1\().4h
+    umull2 v21.4s, v18.8h, \mf0_1\().8h
+    umull v22.4s, v19.4h, \mf2_3\().4h
+    umull2 v23.4s, v19.8h, \mf2_3\().8h
+    sshr v16.8h, v16.8h, #15
+    sshr v17.8h, v17.8h, #15
+    shrn v18.4h, v20.4s, #16
+    shrn2 v18.8h, v21.4s, #16
+    shrn v19.4h, v22.4s, #16
+    shrn2 v19.8h, v23.4s, #16
+    eor v18.16b, v18.16b, v16.16b
+    eor v19.16b, v19.16b, v17.16b
+    sub v18.8h, v18.8h, v16.8h
+    sub v19.8h, v19.8h, v17.8h
+    orr \mask, v18.16b, v19.16b
+    st1 {v18.8h,v19.8h}, [x0], #32
+.endm
+
+.macro QUANT_END d
+    fmov x2, \d
+    mov w0, #0
+    tst x2, x2
+    cinc w0, w0, ne
+    ret
+.endm
+
+// quant_2x2_dc( int16_t dct[4], int mf, int bias )
+function x264_quant_2x2_dc_neon, export=1
+    ld1 {v0.4h}, [x0]
+    dup v2.4h, w2
+    dup v1.4h, w1
+    abs v3.4h, v0.4h
+    add v3.4h, v3.4h, v2.4h
+    umull v3.4s, v3.4h, v1.4h
+    sshr v0.4h, v0.4h, #15
+    shrn v3.4h, v3.4s, #16
+    eor v3.8b, v3.8b, v0.8b
+    sub v3.4h, v3.4h, v0.4h
+    st1 {v3.4h}, [x0]
+    QUANT_END d3
+endfunc
+
+// quant_4x4_dc( int16_t dct[16], int mf, int bias )
+function x264_quant_4x4_dc_neon, export=1
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    dup v0.8h, w2
+    dup v2.8h, w1
+    QUANT_TWO v0.8h, v0.8h, v2, v2, v0.16b
+    uqxtn v0.8b, v0.8h
+    QUANT_END d0
+endfunc
+
+// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4_neon, export=1
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    ld1 {v0.8h,v1.8h}, [x2]
+    ld1 {v2.8h,v3.8h}, [x1]
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v0.16b
+    uqxtn v0.8b, v0.8h
+    QUANT_END d0
+endfunc
+
+// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4x4_neon, export=1
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    ld1 {v0.8h,v1.8h}, [x2]
+    ld1 {v2.8h,v3.8h}, [x1]
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v6.16b
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v7.16b
+    uqxtn v4.8b, v4.8h
+    uqxtn v7.8b, v7.8h
+    uqxtn v6.8b, v6.8h
+    uqxtn v5.8b, v5.8h
+    fmov x7, d7
+    fmov x6, d6
+    fmov x5, d5
+    fmov x4, d4
+    mov w0, #0
+    tst x7, x7
+    cinc w0, w0, ne
+    lsl w0, w0, #1
+    tst x6, x6
+    cinc w0, w0, ne
+    lsl w0, w0, #1
+    tst x5, x5
+    cinc w0, w0, ne
+    lsl w0, w0, #1
+    tst x4, x4
+    cinc w0, w0, ne
+    ret
+endfunc
+
+// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
+function x264_quant_8x8_neon, export=1
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    ld1 {v0.8h,v1.8h}, [x2], #32
+    ld1 {v2.8h,v3.8h}, [x1], #32
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b
+.rept 3
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    ld1 {v0.8h,v1.8h}, [x2], #32
+    ld1 {v2.8h,v3.8h}, [x1], #32
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b
+    orr v4.16b, v4.16b, v5.16b
+.endr
+    uqxtn v0.8b, v4.8h
+    QUANT_END d0
+endfunc
+
+.macro DEQUANT_START mf_size offset dc=no
+    mov w3, #0x2b
+    mul w3, w3, w2
+    lsr w3, w3, #8 // i_qbits = i_qp / 6
+    add w5, w3, w3, lsl #1
+    sub w2, w2, w5, lsl #1 // i_mf = i_qp % 6
+    lsl w2, w2, #\mf_size
+.ifc \dc,no
+    add x1, x1, w2, sxtw // dequant_mf[i_mf]
+.else
+    ldr x1, [x1, w2, sxtw] // dequant_mf[i_mf][0][0]
+.endif
+    subs w3, w3, #\offset // 6 for 8x8
+.endm
+
+// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
+.macro DEQUANT size bits
+function x264_dequant_\size\()_neon, export=1
+    DEQUANT_START \bits+2, \bits
+.ifc \size, 8x8
+    mov w2, #4
+.endif
+    b.lt dequant_\size\()_rshift
+
+    dup v31.8h, w3
+dequant_\size\()_lshift_loop:
+.ifc \size, 8x8
+    subs w2, w2, #1
+.endif
+    ld1 {v16.4s}, [x1], #16
+    ld1 {v17.4s}, [x1], #16
+    sqxtn v2.4h, v16.4s
+    ld1 {v18.4s}, [x1], #16
+    sqxtn2 v2.8h, v17.4s
+    ld1 {v19.4s}, [x1], #16
+    sqxtn v3.4h, v18.4s
+    ld1 {v0.8h,v1.8h}, [x0]
+    sqxtn2 v3.8h, v19.4s
+    mul v0.8h, v0.8h, v2.8h
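QUANT_TWO above is the vectorized form of x264's scalar quantization rule: take |coef|, add the rounding bias, multiply by the scale factor mf, shift down 16, then restore the sign with the XOR/subtract trick (sshr #15 produces a 0/-1 sign mask). Per coefficient, in C (a sketch mirroring the assembly's bit manipulation, not the encoder's exact source):

    #include <stdint.h>

    /* Quantize one coefficient in place; returns nonzero if the level is nonzero. */
    static inline int quant_one( int16_t *coef, uint16_t mf, uint16_t bias )
    {
        int sign = *coef >> 15;                        /* 0 or -1, like sshr #15 */
        int abs_coef = (*coef ^ sign) - sign;          /* |coef| */
        int level = (uint32_t)(abs_coef + bias) * mf >> 16;
        *coef = (int16_t)((level ^ sign) - sign);      /* eor + sub re-applies sign */
        return *coef != 0;
    }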
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/quant.h
Added
@@ -0,0 +1,47 @@
+/*****************************************************************************
+ * quant.h: arm quantization and level-run
+ *****************************************************************************
+ * Copyright (C) 2005-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_QUANT_H
+#define X264_AARCH64_QUANT_H
+
+int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias );
+
+int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
+int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+
+void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+
+int x264_coeff_last4_aarch64( int16_t * );
+int x264_coeff_last8_aarch64( int16_t * );
+int x264_coeff_last15_neon( int16_t * );
+int x264_coeff_last16_neon( int16_t * );
+int x264_coeff_last64_neon( int16_t * );
+
+#endif
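DEQUANT_START in quant-a.S splits the quantizer as i_qbits = i_qp/6 (the 0x2b multiply and #8 shift is a divide-by-6 trick) and i_mf = i_qp%6, then branches to a left-shift or rounded right-shift loop. The scalar rule it implements is, per the H.264 dequant definition (a hedged sketch; offset is 4 for 4x4 and 6 for 8x8 blocks):

    #include <stdint.h>

    static inline int16_t dequant_one( int16_t coef, int mf, int i_qp, int offset )
    {
        int i_qbits = i_qp/6 - offset;
        if( i_qbits >= 0 )
            return (int16_t)( coef * mf << i_qbits );
        /* negative qbits: scale down with rounding */
        return (int16_t)( ( coef * mf + (1 << (-i_qbits - 1)) ) >> -i_qbits );
    }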
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/asm.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/asm.S
Changed
@@ -28,6 +28,16 @@ .syntax unified +#if HAVE_NEON + .arch armv7-a +#elif HAVE_ARMV6T2 + .arch armv6t2 +#elif HAVE_ARMV6 + .arch armv6 +#endif + +.fpu neon + #ifdef PREFIX # define EXTERN_ASM _ #else @@ -40,32 +50,49 @@ # define ELF @ #endif - .macro require8, val=1 +#if HAVE_AS_FUNC +# define FUNC +#else +# define FUNC @ +#endif + +.macro require8, val=1 ELF .eabi_attribute 24, \val - .endm +.endm - .macro preserve8, val=1 +.macro preserve8, val=1 ELF .eabi_attribute 25, \val - .endm +.endm - .macro function name - .global EXTERN_ASM\name +.macro function name, export=1 + .macro endfunc +ELF .size \name, . - \name +FUNC .endfunc + .purgem endfunc + .endm .align 2 +.if \export == 1 + .global EXTERN_ASM\name +ELF .hidden EXTERN_ASM\name +ELF .type EXTERN_ASM\name, %function +FUNC .func EXTERN_ASM\name EXTERN_ASM\name: +.else ELF .hidden \name ELF .type \name, %function - .func \name +FUNC .func \name \name: - .endm +.endif +.endm - .macro movrel rd, val +.macro movrel rd, val #if HAVE_ARMV6T2 && !defined(PIC) movw \rd, #:lower16:\val movt \rd, #:upper16:\val #else ldr \rd, =\val #endif - .endm +.endm .macro movconst rd, val #if HAVE_ARMV6T2 @@ -78,6 +105,10 @@ #endif .endm +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) + #define FENC_STRIDE 16 #define FDEC_STRIDE 32
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/cpu-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/cpu-a.S
Changed
@@ -25,7 +25,6 @@ #include "asm.S" -.fpu neon .align 2 // done in gas because .fpu neon overrides the refusal to assemble @@ -33,12 +32,12 @@ function x264_cpu_neon_test vadd.i16 q0, q0, q0 bx lr -.endfunc +endfunc // return: 0 on success // 1 if counters were already enabled // 9 if lo-res counters were already enabled -function x264_cpu_enable_armv7_counter +function x264_cpu_enable_armv7_counter, export=0 mrc p15, 0, r2, c9, c12, 0 // read PMNC ands r0, r2, #1 andne r0, r2, #9 @@ -49,14 +48,14 @@ mov r2, #1 << 31 // enable cycle counter mcr p15, 0, r2, c9, c12, 1 // write CNTENS bx lr -.endfunc +endfunc -function x264_cpu_disable_armv7_counter +function x264_cpu_disable_armv7_counter, export=0 mrc p15, 0, r0, c9, c12, 0 // read PMNC bic r0, r0, #1 // disable counters mcr p15, 0, r0, c9, c12, 0 // write PMNC bx lr -.endfunc +endfunc .macro READ_TIME r @@ -106,4 +105,4 @@ cmp r0, #10 movgt r0, #0 pop {r4-r6,pc} -.endfunc +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/dct-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/dct-a.S
Changed
@@ -25,8 +25,6 @@ #include "asm.S" -.fpu neon - .section .rodata .align 4 @@ -82,7 +80,7 @@ vrhadd.s16 d3, d6, d7 vst1.64 {d0-d3}, [r0,:128] bx lr -.endfunc +endfunc function x264_idct4x4dc_neon vld1.64 {d0-d3}, [r0,:128] @@ -94,7 +92,7 @@ HADAMARD 2, sumsub, d3, d2, d6, d7 vst1.64 {d0-d3}, [r0,:128] bx lr -.endfunc +endfunc .macro DCT_1D d0 d1 d2 d3 d4 d5 d6 d7 @@ -129,9 +127,9 @@ DCT_1D d4, d5, d6, d7, d0, d1, d2, d3 vst1.64 {d4-d7}, [r0,:128] bx lr -.endfunc +endfunc -function x264_sub8x4_dct_neon +function x264_sub8x4_dct_neon, export=0 vld1.64 {d0}, [r1,:64], r3 vld1.64 {d1}, [r2,:64], ip vsubl.u8 q8, d0, d1 @@ -165,7 +163,7 @@ vst1.64 {d4-d5}, [r0,:128]! vst1.64 {d6-d7}, [r0,:128]! bx lr -.endfunc +endfunc function x264_sub8x8_dct_neon push {lr} @@ -174,7 +172,7 @@ bl x264_sub8x4_dct_neon pop {lr} b x264_sub8x4_dct_neon -.endfunc +endfunc function x264_sub16x16_dct_neon push {lr} @@ -195,7 +193,7 @@ bl x264_sub8x4_dct_neon pop {lr} b x264_sub8x4_dct_neon -.endfunc +endfunc .macro DCT8_1D type @@ -279,22 +277,22 @@ vst1.64 {d24-d27}, [r0,:128]! vst1.64 {d28-d31}, [r0,:128]! bx lr -.endfunc +endfunc function x264_sub16x16_dct8_neon push {lr} - bl x264_sub8x8_dct8_neon + bl X(x264_sub8x8_dct8_neon) sub r1, r1, #FENC_STRIDE*8 - 8 sub r2, r2, #FDEC_STRIDE*8 - 8 - bl x264_sub8x8_dct8_neon + bl X(x264_sub8x8_dct8_neon) sub r1, r1, #8 sub r2, r2, #8 - bl x264_sub8x8_dct8_neon + bl X(x264_sub8x8_dct8_neon) pop {lr} sub r1, r1, #FENC_STRIDE*8 - 8 sub r2, r2, #FDEC_STRIDE*8 - 8 - b x264_sub8x8_dct8_neon -.endfunc + b X(x264_sub8x8_dct8_neon) +endfunc // First part of IDCT (minus final SUMSUB_BA) @@ -336,9 +334,9 @@ vst1.32 {d2[1]}, [r0,:32], r2 vst1.32 {d2[0]}, [r0,:32], r2 bx lr -.endfunc +endfunc -function x264_add8x4_idct_neon +function x264_add8x4_idct_neon, export=0 vld1.64 {d0-d3}, [r1,:128]! IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3 vld1.64 {d4-d7}, [r1,:128]! @@ -376,7 +374,7 @@ vst1.32 {d2}, [r0,:64], r2 vst1.32 {d3}, [r0,:64], r2 bx lr -.endfunc +endfunc function x264_add8x8_idct_neon mov r2, #FDEC_STRIDE @@ -384,7 +382,7 @@ bl x264_add8x4_idct_neon mov lr, ip b x264_add8x4_idct_neon -.endfunc +endfunc function x264_add16x16_idct_neon mov r2, #FDEC_STRIDE @@ -401,7 +399,7 @@ bl x264_add8x4_idct_neon mov lr, ip b x264_add8x4_idct_neon -.endfunc +endfunc .macro IDCT8_1D type @@ -498,19 +496,19 @@ vst1.64 {d6}, [r0,:64], r2 vst1.64 {d7}, [r0,:64], r2 bx lr -.endfunc +endfunc function x264_add16x16_idct8_neon mov ip, lr - bl x264_add8x8_idct8_neon + bl X(x264_add8x8_idct8_neon) sub r0, r0, #8*FDEC_STRIDE-8 - bl x264_add8x8_idct8_neon + bl X(x264_add8x8_idct8_neon) sub r0, r0, #8 - bl x264_add8x8_idct8_neon + bl X(x264_add8x8_idct8_neon) sub r0, r0, #8*FDEC_STRIDE-8 mov lr, ip - b x264_add8x8_idct8_neon -.endfunc + b X(x264_add8x8_idct8_neon) +endfunc function x264_add8x8_idct_dc_neon @@ -562,7 +560,7 @@ vst1.64 {d6}, [r0,:64], r2 vst1.64 {d7}, [r0,:64], r2 bx lr -.endfunc +endfunc .macro ADD16x4_IDCT_DC dc vld1.64 {d16-d17}, [r0,:128], r3 @@ -610,7 +608,7 @@ ADD16x4_IDCT_DC d2 ADD16x4_IDCT_DC d3 bx lr -.endfunc +endfunc function x264_sub8x8_dct_dc_neon mov r3, #FENC_STRIDE @@ -658,7 +656,7 @@ vpadd.s16 d0, d0, d1 vst1.64 {d0}, [r0,:64] bx lr -.endfunc +endfunc function x264_zigzag_scan_4x4_frame_neon @@ -671,4 +669,4 @@ vtbl.8 d7, {d2-d3}, d19 vst1.64 {d4-d7}, [r0,:128] bx lr -.endfunc +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/deblock-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/deblock-a.S
Changed
@@ -25,8 +25,6 @@ #include "asm.S" -.fpu neon - .macro h264_loop_filter_start ldr ip, [sp] ldr ip, [ip] @@ -142,7 +140,7 @@ align_pop_regs bx lr -.endfunc +endfunc function x264_deblock_h_luma_neon h264_loop_filter_start @@ -194,7 +192,7 @@ align_pop_regs bx lr -.endfunc +endfunc .macro h264_loop_filter_chroma vdup.8 q11, r2 // alpha @@ -255,7 +253,7 @@ vst2.8 {d0, d1}, [r0,:128], r1 bx lr -.endfunc +endfunc function x264_deblock_h_chroma_neon h264_loop_filter_start @@ -303,4 +301,110 @@ vst1.8 {d3}, [r0], r1 bx lr -.endfunc +endfunc + +function x264_deblock_strength_neon + ldr ip, [sp] + vmov.i8 q8, #0 + lsl ip, ip, #8 + add r3, r3, #32 + sub ip, ip, #(1<<8)-3 + vmov.i8 q9, #0 + vdup.16 q10, ip + ldr ip, [sp, #4] + +lists: + @ load bytes ref + vld1.8 {d31}, [r1]! + add r2, r2, #16 + vld1.8 {q1}, [r1]! + vmov.i8 q0, #0 + vld1.8 {q2}, [r1]! + vext.8 q3, q0, q1, #15 + vext.8 q0, q0, q2, #15 + vuzp.32 q1, q2 + vuzp.32 q3, q0 + vext.8 q1, q15, q2, #12 + + veor q0, q0, q2 + veor q1, q1, q2 + vorr q8, q8, q0 + vorr q9, q9, q1 + + vld1.16 {q11}, [r2,:128]! @ mv + 0x10 + vld1.16 {q3}, [r2,:128]! @ mv + 0x20 + vld1.16 {q12}, [r2,:128]! @ mv + 0x30 + vld1.16 {q2}, [r2,:128]! @ mv + 0x40 + vld1.16 {q13}, [r2,:128]! @ mv + 0x50 + vext.8 q3, q3, q12, #12 + vext.8 q2, q2, q13, #12 + vabd.s16 q0, q12, q3 + vld1.16 {q3}, [r2,:128]! @ mv + 0x60 + vabd.s16 q1, q13, q2 + vld1.16 {q14}, [r2,:128]! @ mv + 0x70 + vqmovn.u16 d0, q0 + vld1.16 {q2}, [r2,:128]! @ mv + 0x80 + vld1.16 {q15}, [r2,:128]! @ mv + 0x90 + vqmovn.u16 d1, q1 + vext.8 q3, q3, q14, #12 + vext.8 q2, q2, q15, #12 + vabd.s16 q3, q14, q3 + vabd.s16 q2, q15, q2 + vqmovn.u16 d2, q3 + vqmovn.u16 d3, q2 + + vqsub.u8 q0, q0, q10 + vqsub.u8 q1, q1, q10 + vqmovn.u16 d0, q0 + vqmovn.u16 d1, q1 + + vabd.s16 q1, q12, q13 + vorr q8, q8, q0 + + vabd.s16 q0, q11, q12 + vabd.s16 q2, q13, q14 + vabd.s16 q3, q14, q15 + vqmovn.u16 d0, q0 + vqmovn.u16 d1, q1 + vqmovn.u16 d2, q2 + vqmovn.u16 d3, q3 + + vqsub.u8 q0, q0, q10 + vqsub.u8 q1, q1, q10 + vqmovn.u16 d0, q0 + vqmovn.u16 d1, q1 + subs ip, ip, #1 + vorr q9, q9, q0 + beq lists + + mov ip, #-32 + @ load bytes nnz + vld1.8 {d31}, [r0]! + vld1.8 {q1}, [r0]! + vmov.i8 q0, #0 + vld1.8 {q2}, [r0] + vext.8 q3, q0, q1, #15 + vext.8 q0, q0, q2, #15 + vuzp.32 q1, q2 + vuzp.32 q3, q0 + vext.8 q1, q15, q2, #12 + + vorr q0, q0, q2 + vorr q1, q1, q2 + vmov.u8 q10, #1 + vmin.u8 q0, q0, q10 + vmin.u8 q1, q1, q10 + vmin.u8 q8, q8, q10 @ mv ? 1 : 0 + vmin.u8 q9, q9, q10 + vadd.u8 q0, q0, q0 @ nnz ? 2 : 0 + vadd.u8 q1, q1, q1 + vmax.u8 q8, q8, q0 + vmax.u8 q9, q9, q1 + vzip.16 d16, d17 + vst1.8 {q9}, [r3,:128], ip @ bs[1] + vtrn.8 d16, d17 + vtrn.32 d16, d17 + + vst1.8 {q8}, [r3,:128] @ bs[0] + bx lr +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/mc-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/mc-a.S
Changed
@@ -27,7 +27,6 @@ #include "asm.S" -.fpu neon .text // note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8 @@ -50,7 +49,7 @@ pld [r3, r1, lsl #1] pld [r3, r2] bx lr -.endfunc +endfunc // void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y, // uint8_t *pix_uv, intptr_t stride_uv, int mb_x ) @@ -76,7 +75,7 @@ pld [ip] pld [ip, r3] pop {pc} -.endfunc +endfunc // void *x264_memcpy_aligned( void *dst, const void *src, size_t n ) @@ -85,10 +84,10 @@ movrel ip, memcpy_table and r3, r3, #0xc ldr pc, [ip, r3] -.endfunc +endfunc .macro MEMCPY_ALIGNED srcalign dstalign -function memcpy_aligned_\dstalign\()_\srcalign\()_neon +function memcpy_aligned_\dstalign\()_\srcalign\()_neon, export=0 mov r3, r0 .if \srcalign == 8 && \dstalign == 8 sub r2, #16 @@ -127,7 +126,7 @@ vst1.64 {d0}, [r3,:64]! .endif bx lr -.endfunc +endfunc .endm MEMCPY_ALIGNED 16, 16 @@ -156,7 +155,7 @@ .endr bgt memzero_loop bx lr -.endfunc +endfunc // void pixel_avg( uint8_t *dst, intptr_t dst_stride, @@ -175,12 +174,13 @@ cmp ip, #0 bge x264_pixel_avg_weight_w\w\()_add_add_neon b x264_pixel_avg_weight_w\w\()_sub_add_neon // weight < 0 -.endfunc +endfunc .endm AVGH 4, 2 AVGH 4, 4 AVGH 4, 8 +AVGH 4, 16 AVGH 8, 4 AVGH 8, 8 AVGH 8, 16 @@ -238,7 +238,7 @@ .endm .macro AVG_WEIGHT ext -function x264_pixel_avg_weight_w4_\ext\()_neon +function x264_pixel_avg_weight_w4_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #2 @@ -252,9 +252,9 @@ vst1.32 {d1[0]}, [r0,:32], r1 bgt 1b pop {r4-r6,pc} -.endfunc +endfunc -function x264_pixel_avg_weight_w8_\ext\()_neon +function x264_pixel_avg_weight_w8_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #4 @@ -276,9 +276,9 @@ vst1.64 {d3}, [r0,:64], r1 bgt 1b pop {r4-r6,pc} -.endfunc +endfunc -function x264_pixel_avg_weight_w16_\ext\()_neon +function x264_pixel_avg_weight_w16_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #2 @@ -296,14 +296,14 @@ vst1.64 {d2-d3}, [r0,:128], r1 bgt 1b pop {r4-r6,pc} -.endfunc +endfunc .endm AVG_WEIGHT add_add AVG_WEIGHT add_sub AVG_WEIGHT sub_add -function x264_pixel_avg_w4_neon +function x264_pixel_avg_w4_neon, export=0 subs lr, lr, #2 vld1.32 {d0[]}, [r2], r3 vld1.32 {d2[]}, [r4], r5 @@ -315,9 +315,9 @@ vst1.32 {d1[0]}, [r0,:32], r1 bgt x264_pixel_avg_w4_neon pop {r4-r6,pc} -.endfunc +endfunc -function x264_pixel_avg_w8_neon +function x264_pixel_avg_w8_neon, export=0 subs lr, lr, #4 vld1.64 {d0}, [r2], r3 vld1.64 {d2}, [r4], r5 @@ -337,9 +337,9 @@ vst1.64 {d3}, [r0,:64], r1 bgt x264_pixel_avg_w8_neon pop {r4-r6,pc} -.endfunc +endfunc -function x264_pixel_avg_w16_neon +function x264_pixel_avg_w16_neon, export=0 subs lr, lr, #4 vld1.64 {d0-d1}, [r2], r3 vld1.64 {d2-d3}, [r4], r5 @@ -359,7 +359,7 @@ vst1.64 {d6-d7}, [r0,:128], r1 bgt x264_pixel_avg_w16_neon pop {r4-r6,pc} -.endfunc +endfunc function x264_pixel_avg2_w4_neon @@ -378,7 +378,7 @@ vst1.32 {d1[0]}, [r0,:32], r1 bgt avg2_w4_loop pop {pc} -.endfunc +endfunc function x264_pixel_avg2_w8_neon ldr ip, [sp, #4] @@ -396,7 +396,7 @@ vst1.64 {d1}, [r0,:64], r1 bgt avg2_w8_loop pop {pc} -.endfunc +endfunc function x264_pixel_avg2_w16_neon ldr ip, [sp, #4] @@ -414,7 +414,7 @@ vst1.64 {d4-d5}, [r0,:128], r1 bgt avg2_w16_loop pop {pc} -.endfunc +endfunc function x264_pixel_avg2_w20_neon ldr ip, [sp, #4] @@ -437,7 +437,7 @@ vst1.32 {d6[0]}, [r0,:32], r1 bgt avg2_w20_loop pop {pc} -.endfunc +endfunc .macro weight_prologue type @@ -448,7 +448,7 @@ ldr lr, [r4, #32] // denom .endif ldrd r4, r5, [r4, #32+4] // scale, offset - vdup.16 q0, r4 + 
vdup.8 d0, r4 vdup.16 q1, r5 .ifc \type, full rsb lr, lr, #0
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/mc-c.c -> x264-snapshot-20141104-2245.tar.bz2/common/arm/mc-c.c
Changed
@@ -37,6 +37,7 @@
void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_8x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_4x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_4x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_4x2_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
@@ -46,13 +47,28 @@
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );

+void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
+                                        pixel *dstv, intptr_t i_dstv,
+                                        pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
+                                            pixel *dstb, intptr_t i_dstb,
+                                            pixel *dstc, intptr_t i_dstc,
+                                            pixel *src, intptr_t i_src, int pw, int w, int h );
+void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst,
+                                      pixel *srcu, intptr_t i_srcu,
+                                      pixel *srcv, intptr_t i_srcv, int w, int h );
+
+void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+
#define MC_WEIGHT(func)\
void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
\
-static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\
+static weight_fn_t x264_mc##func##_wtab_neon[6] =\
{\
    x264_mc_weight_w4##func##_neon,\
    x264_mc_weight_w4##func##_neon,\
@@ -72,7 +88,7 @@
void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );

-void x264_mc_chroma_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );

void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
@@ -224,11 +240,20 @@
    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_neon;

+    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
+    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
+    pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
+
+    pf->store_interleave_chroma = x264_store_interleave_chroma_neon;
+    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
+    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
+
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
+    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/pixel-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/pixel-a.S
Changed
@@ -25,14 +25,15 @@ #include "asm.S" -.fpu neon .section .rodata .align 4 -.rept 16 .byte 0xff +.rept 16 + .byte 0xff .endr mask_ff: -.rept 16 .byte 0 +.rept 16 + .byte 0 .endr mask_ac4: @@ -60,7 +61,7 @@ .endr usada8 r0, r6, lr, ip pop {r4-r6,pc} -.endfunc +endfunc .endm SAD4_ARMV6 4 @@ -137,7 +138,7 @@ vpaddl.u16 d0, d0 vmov.u32 r0, d0[0] bx lr -.endfunc +endfunc .endm SAD_FUNC 4, 4 @@ -222,7 +223,7 @@ vpaddl.u16 d0, d0 vmov.u32 r0, d0[0] bx lr -.endfunc +endfunc .endm SAD_FUNC_DUAL 8, 4 @@ -368,7 +369,7 @@ vst1.32 {d0-d1}, [r7] .endif pop {r6-r7,pc} -.endfunc +endfunc .endm SAD_X_FUNC 3, 4, 4 @@ -477,7 +478,7 @@ vpadd.s32 d0, d0, d0 vmov.32 r0, d0[0] bx lr -.endfunc +endfunc .endm SSD_FUNC 4, 4 @@ -517,7 +518,7 @@ vld1.64 {d26}, [r0,:64], r1 VAR_SQR_SUM q2, q10, q15, d26 b x264_var_end -.endfunc +endfunc function x264_pixel_var_8x16_neon vld1.64 {d16}, [r0,:64], r1 @@ -549,7 +550,7 @@ 2: VAR_SQR_SUM q2, q13, q15, d22 b x264_var_end -.endfunc +endfunc function x264_pixel_var_16x16_neon vld1.64 {d16-d17}, [r0,:128], r1 @@ -573,9 +574,9 @@ VAR_SQR_SUM q1, q12, q14, d18 VAR_SQR_SUM q2, q13, q15, d19 bgt var16_loop -.endfunc +endfunc -function x264_var_end +function x264_var_end, export=0 vpaddl.u16 q8, q14 vpaddl.u16 q9, q15 vadd.u32 q1, q1, q8 @@ -588,7 +589,7 @@ vmov r0, r1, d0 bx lr -.endfunc +endfunc .macro DIFF_SUM diff da db lastdiff vld1.64 {\da}, [r0,:64], r1 @@ -633,7 +634,7 @@ mul r0, r0, r0 sub r0, r1, r0, lsr #6 bx lr -.endfunc +endfunc function x264_pixel_var2_8x16_neon vld1.64 {d16}, [r0,:64], r1 @@ -677,7 +678,7 @@ mul r0, r0, r0 sub r0, r1, r0, lsr #7 bx lr -.endfunc +endfunc .macro LOAD_DIFF_8x4 q0 q1 q2 q3 vld1.32 {d1}, [r2], r3 @@ -714,7 +715,7 @@ HORIZ_ADD d0, d0, d1 vmov.32 r0, d0[0] bx lr -.endfunc +endfunc function x264_pixel_satd_4x8_neon vld1.32 {d1[]}, [r2], r3 @@ -741,7 +742,7 @@ vsubl.u8 q3, d6, d7 SUMSUB_AB q10, q11, q2, q3 b x264_satd_4x8_8x4_end_neon -.endfunc +endfunc function x264_pixel_satd_8x4_neon vld1.64 {d1}, [r2], r3 @@ -758,9 +759,9 @@ vld1.64 {d6}, [r0,:64], r1 vsubl.u8 q3, d6, d7 SUMSUB_AB q10, q11, q2, q3 -.endfunc +endfunc -function x264_satd_4x8_8x4_end_neon +function x264_satd_4x8_8x4_end_neon, export=0 vadd.s16 q0, q8, q10 vadd.s16 q1, q9, q11 vsub.s16 q2, q8, q10 @@ -785,7 +786,7 @@ HORIZ_ADD d0, d0, d1 vmov.32 r0, d0[0] bx lr -.endfunc +endfunc function x264_pixel_satd_8x8_neon mov ip, lr @@ -799,7 +800,7 @@ mov lr, ip vmov.32 r0, d0[0] bx lr -.endfunc +endfunc function x264_pixel_satd_8x16_neon vpush {d8-d11} @@ -821,9 +822,9 @@ mov lr, ip vmov.32 r0, d0[0] bx lr -.endfunc +endfunc -function x264_satd_8x8_neon +function x264_satd_8x8_neon, export=0 LOAD_DIFF_8x4 q8, q9, q10, q11 vld1.64 {d7}, [r2], r3 SUMSUB_AB q0, q1, q8, q9 @@ -841,10 +842,10 @@ SUMSUB_AB q9, q11, q1, q3 vld1.64 {d0}, [r0,:64], r1 vsubl.u8 q15, d0, d1 -.endfunc +endfunc // one vertical hadamard pass and two horizontal -function x264_satd_8x4v_8x8h_neon +function x264_satd_8x4v_8x8h_neon, export=0 SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15 vtrn.16 q8, q9 SUMSUB_AB q12, q14, q0, q2 @@ -870,7 +871,7 @@ vmax.s16 q14, q8, q10 vmax.s16 q15, q9, q11 bx lr -.endfunc +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/predict-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/predict-a.S
Changed
@@ -26,8 +26,6 @@ #include "asm.S" -.fpu neon - .section .rodata .align 4 @@ -77,7 +75,16 @@ add ip, ip, ip, lsl #16 str ip, [r0, #3*FDEC_STRIDE] bx lr -.endfunc +endfunc + +function x264_predict_4x4_v_armv6 + ldr r1, [r0, #0 - 1 * FDEC_STRIDE] + str r1, [r0, #0 + 0 * FDEC_STRIDE] + str r1, [r0, #0 + 1 * FDEC_STRIDE] + str r1, [r0, #0 + 2 * FDEC_STRIDE] + str r1, [r0, #0 + 3 * FDEC_STRIDE] + bx lr +endfunc function x264_predict_4x4_dc_armv6 mov ip, #0 @@ -100,7 +107,7 @@ str r1, [r0, #2*FDEC_STRIDE] str r1, [r0, #3*FDEC_STRIDE] bx lr -.endfunc +endfunc function x264_predict_4x4_dc_top_neon mov r12, #FDEC_STRIDE @@ -115,7 +122,7 @@ vst1.32 d1[0], [r0,:32], r12 vst1.32 d1[0], [r0,:32], r12 bx lr -.endfunc +endfunc // return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1 @@ -158,7 +165,7 @@ add r5, r5, r4, lsr #8 str r5, [r0, #3*FDEC_STRIDE] pop {r4-r6,pc} -.endfunc +endfunc function x264_predict_4x4_ddl_neon sub r0, #FDEC_STRIDE @@ -177,7 +184,7 @@ vst1.32 {d2[0]}, [r0,:32], ip vst1.32 {d3[0]}, [r0,:32], ip bx lr -.endfunc +endfunc function x264_predict_8x8_dc_neon mov ip, #0 @@ -201,7 +208,7 @@ vst1.64 {d0}, [r0,:64], ip .endr pop {r4-r5,pc} -.endfunc +endfunc function x264_predict_8x8_h_neon add r1, r1, #7 @@ -224,7 +231,7 @@ vst1.64 {d6}, [r0,:64], ip vst1.64 {d7}, [r0,:64], ip bx lr -.endfunc +endfunc function x264_predict_8x8_v_neon add r1, r1, #16 @@ -234,7 +241,7 @@ vst1.8 {d0}, [r0,:64], r12 .endr bx lr -.endfunc +endfunc function x264_predict_8x8_ddl_neon add r1, #16 @@ -262,7 +269,7 @@ vst1.8 d2, [r0,:64], r12 vst1.8 d1, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_ddr_neon vld1.8 {d0-d3}, [r1,:128] @@ -292,7 +299,7 @@ vst1.8 {d4}, [r0,:64], r12 vst1.8 {d5}, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_vl_neon add r1, #16 @@ -323,7 +330,7 @@ vst1.8 {d3}, [r0,:64], r12 vst1.8 {d2}, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_vr_neon add r1, #8 @@ -355,7 +362,7 @@ vst1.8 {d6}, [r0,:64], r12 vst1.8 {d3}, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_hd_neon mov r12, #FDEC_STRIDE @@ -388,7 +395,7 @@ vst1.8 {d16}, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_hu_neon mov r12, #FDEC_STRIDE @@ -421,7 +428,7 @@ vst1.8 {d7}, [r0,:64], r12 vst1.8 {d17}, [r0,:64] bx lr -.endfunc +endfunc function x264_predict_8x8c_dc_top_neon sub r2, r0, #FDEC_STRIDE @@ -434,7 +441,7 @@ vdup.8 d0, d0[0] vtrn.32 d0, d1 b pred8x8_dc_end -.endfunc +endfunc function x264_predict_8x8c_dc_left_neon mov r1, #FDEC_STRIDE @@ -446,7 +453,7 @@ vdup.8 d1, d0[1] vdup.8 d0, d0[0] b pred8x8_dc_end -.endfunc +endfunc function x264_predict_8x8c_dc_neon sub r2, r0, #FDEC_STRIDE @@ -472,7 +479,7 @@ vst1.8 {d1}, [r2,:64], r1 .endr bx lr -.endfunc +endfunc function x264_predict_8x8c_h_neon sub r1, r0, #1 @@ -484,7 +491,7 @@ vst1.64 {d2}, [r0,:64], ip .endr bx lr -.endfunc +endfunc function x264_predict_8x8c_v_neon sub r0, r0, #FDEC_STRIDE @@ -494,7 +501,7 @@ vst1.64 {d0}, [r0,:64], ip .endr bx lr -.endfunc +endfunc function x264_predict_8x8c_p_neon sub r3, r0, #FDEC_STRIDE @@ -547,7 +554,7 @@ subs r3, r3, #1 bne 1b bx lr -.endfunc +endfunc function x264_predict_16x16_dc_top_neon @@ -558,7 +565,7 @@ vrshrn.u16 d0, q0, #4
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/predict-c.c -> x264-snapshot-20141104-2245.tar.bz2/common/arm/predict-c.c
Changed
@@ -27,36 +27,6 @@
#include "predict.h"
#include "pixel.h"

-void x264_predict_4x4_dc_armv6( uint8_t *src );
-void x264_predict_4x4_dc_top_neon( uint8_t *src );
-void x264_predict_4x4_h_armv6( uint8_t *src );
-void x264_predict_4x4_ddr_armv6( uint8_t *src );
-void x264_predict_4x4_ddl_neon( uint8_t *src );
-
-void x264_predict_8x8c_dc_neon( uint8_t *src );
-void x264_predict_8x8c_dc_top_neon( uint8_t *src );
-void x264_predict_8x8c_dc_left_neon( uint8_t *src );
-void x264_predict_8x8c_h_neon( uint8_t *src );
-void x264_predict_8x8c_v_neon( uint8_t *src );
-void x264_predict_8x8c_p_neon( uint8_t *src );
-
-void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
-
-void x264_predict_16x16_dc_neon( uint8_t *src );
-void x264_predict_16x16_dc_top_neon( uint8_t *src );
-void x264_predict_16x16_dc_left_neon( uint8_t *src );
-void x264_predict_16x16_h_neon( uint8_t *src );
-void x264_predict_16x16_v_neon( uint8_t *src );
-void x264_predict_16x16_p_neon( uint8_t *src );
-
void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
{
    if (!(cpu&X264_CPU_ARMV6))
@@ -64,6 +34,7 @@

#if !HIGH_BIT_DEPTH
    pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
+    pf[I_PRED_4x4_V]   = x264_predict_4x4_v_armv6;
    pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/predict.h -> x264-snapshot-20141104-2245.tar.bz2/common/arm/predict.h
Changed
@@ -26,15 +26,36 @@
#ifndef X264_ARM_PREDICT_H
#define X264_ARM_PREDICT_H

-void x264_predict_8x8_v_neon( pixel *src, pixel edge[36] );
-void x264_predict_8x8_h_neon( pixel *src, pixel edge[36] );
-void x264_predict_8x8_dc_neon( pixel *src, pixel edge[36] );
-void x264_predict_8x8c_dc_neon( pixel *src );
-void x264_predict_8x8c_h_neon( pixel *src );
-void x264_predict_8x8c_v_neon( pixel *src );
-void x264_predict_16x16_v_neon( pixel *src );
-void x264_predict_16x16_h_neon( pixel *src );
-void x264_predict_16x16_dc_neon( pixel *src );
+void x264_predict_4x4_dc_armv6( uint8_t *src );
+void x264_predict_4x4_dc_top_neon( uint8_t *src );
+void x264_predict_4x4_v_armv6( uint8_t *src );
+void x264_predict_4x4_h_armv6( uint8_t *src );
+void x264_predict_4x4_ddr_armv6( uint8_t *src );
+void x264_predict_4x4_ddl_neon( uint8_t *src );
+
+void x264_predict_8x8c_dc_neon( uint8_t *src );
+void x264_predict_8x8c_dc_top_neon( uint8_t *src );
+void x264_predict_8x8c_dc_left_neon( uint8_t *src );
+void x264_predict_8x8c_h_neon( uint8_t *src );
+void x264_predict_8x8c_v_neon( uint8_t *src );
+void x264_predict_8x8c_p_neon( uint8_t *src );
+
+void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
+
+void x264_predict_16x16_dc_neon( uint8_t *src );
+void x264_predict_16x16_dc_top_neon( uint8_t *src );
+void x264_predict_16x16_dc_left_neon( uint8_t *src );
+void x264_predict_16x16_h_neon( uint8_t *src );
+void x264_predict_16x16_v_neon( uint8_t *src );
+void x264_predict_16x16_p_neon( uint8_t *src );

void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/quant-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/quant-a.S
Changed
@@ -25,8 +25,6 @@ #include "asm.S" -.fpu neon - .section .rodata .align 4 pmovmskb_byte: @@ -80,7 +78,7 @@ vsub.s16 d3, d3, d0 vst1.64 {d3}, [r0,:64] QUANT_END d3 -.endfunc +endfunc // quant_4x4_dc( int16_t dct[16], int mf, int bias ) function x264_quant_4x4_dc_neon @@ -92,7 +90,7 @@ QUANT_TWO q0, q0, d4, d5, d4, d5, q0 vorr d0, d0, d1 QUANT_END d0 -.endfunc +endfunc // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) function x264_quant_4x4_neon @@ -104,7 +102,7 @@ QUANT_TWO q0, q1, d4, d5, d6, d7, q0 vorr d0, d0, d1 QUANT_END d0 -.endfunc +endfunc // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ) function x264_quant_4x4x4_neon @@ -145,7 +143,7 @@ orrne r0, #8 vpop {d8-d15} bx lr -.endfunc +endfunc // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) function x264_quant_8x8_neon @@ -165,7 +163,7 @@ .endr vorr d0, d0, d1 QUANT_END d0 -.endfunc +endfunc .macro DEQUANT_START mf_size offset dc=no mov r3, #0x2b @@ -257,7 +255,7 @@ bgt dequant_\size\()_rshift_loop .endif bx lr -.endfunc +endfunc .endm DEQUANT 4x4, 4 @@ -307,7 +305,7 @@ vmovn.s32 d3, q13 vst1.16 {d0-d3}, [r0,:128] bx lr -.endfunc +endfunc // int coeff_last( int16_t *l ) @@ -319,7 +317,21 @@ lsrs r2, r2, #16 addne r0, r0, #1 bx lr -.endfunc +endfunc + +function x264_coeff_last8_arm + ldrd r2, r3, [r0, #8] + orrs ip, r2, r3 + movne r0, #4 + ldrdeq r2, r3, [r0] + moveq r0, #0 + tst r3, r3 + addne r0, #2 + movne r2, r3 + lsrs r2, r2, #16 + addne r0, r0, #1 + bx lr +endfunc .macro COEFF_LAST_1x size function x264_coeff_last\size\()_neon @@ -344,7 +356,7 @@ subslt r0, r3, r0, lsr #2 movlt r0, #0 bx lr -.endfunc +endfunc .endm COEFF_LAST_1x 15 @@ -393,4 +405,4 @@ subslt r0, ip, r0 movlt r0, #0 bx lr -.endfunc +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/quant.h -> x264-snapshot-20141104-2245.tar.bz2/common/arm/quant.h
Changed
@@ -39,6 +39,7 @@
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );

int x264_coeff_last4_arm( int16_t * );
+int x264_coeff_last8_arm( int16_t * );
int x264_coeff_last15_neon( int16_t * );
int x264_coeff_last16_neon( int16_t * );
int x264_coeff_last64_neon( int16_t * );
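coeff_last returns the position of the last nonzero coefficient in a scan-ordered block, which lets the entropy coders skip trailing zeros; the new coeff_last8_arm covers the 8-coefficient chroma DC case. A scalar reference (a sketch; note x264 reports index 0 for an all-zero block):

    #include <stdint.h>

    static int coeff_last( const int16_t *l, int count )
    {
        int i = count - 1;
        while( i > 0 && !l[i] )
            i--;
        return i;
    }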
View file
x264-snapshot-20140321-2245.tar.bz2/common/bitstream.c -> x264-snapshot-20141104-2245.tar.bz2/common/bitstream.c
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2003-2014 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/bitstream.h -> x264-snapshot-20141104-2245.tar.bz2/common/bitstream.h
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2003-2014 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/cabac.c -> x264-snapshot-20141104-2245.tar.bz2/common/cabac.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/common.h -> x264-snapshot-20141104-2245.tar.bz2/common/common.h
Changed
@@ -316,8 +316,8 @@
static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
{
-    int amvd0 = abs(mvdleft[0]) + abs(mvdtop[0]);
-    int amvd1 = abs(mvdleft[1]) + abs(mvdtop[1]);
+    int amvd0 = mvdleft[0] + mvdtop[0];
+    int amvd1 = mvdleft[1] + mvdtop[1];
    amvd0 = (amvd0 > 2) + (amvd0 > 32);
    amvd1 = (amvd1 > 2) + (amvd1 > 32);
    return amvd0 + (amvd1<<8);
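The abs() calls could be dropped here because the mvd values cached for CABAC context selection are stored as already-clamped absolute values in uint8_t, so they are never negative; all the function needs to do is bucket each component sum against the 2 and 32 thresholds. The bucketing, isolated (taken directly from the expression above):

    /* Map an absolute mvd sum to a CABAC context increment of 0, 1 or 2. */
    static inline int mvd_ctx( int amvd )
    {
        return (amvd > 2) + (amvd > 32);
    }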
View file
x264-snapshot-20140321-2245.tar.bz2/common/cpu.c -> x264-snapshot-20141104-2245.tar.bz2/common/cpu.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -89,6 +89,9 @@
    {"ARMv6",           X264_CPU_ARMV6},
    {"NEON",            X264_CPU_NEON},
    {"FastNeonMRC",     X264_CPU_FAST_NEON_MRC},
+#elif ARCH_AARCH64
+    {"ARMv8",           X264_CPU_ARMV8},
+    {"NEON",            X264_CPU_NEON},
#endif
    {"", 0},
};
@@ -338,6 +341,9 @@

uint32_t x264_cpu_detect( void )
{
+#ifdef __NO_FPRS__
+    return 0;
+#else
    static void (*oldsig)( int );

    oldsig = signal( SIGILL, sigill_handler );
@@ -357,6 +363,7 @@

    signal( SIGILL, oldsig );
    return X264_CPU_ALTIVEC;
+#endif
}
#endif

@@ -405,6 +412,13 @@
    return flags;
}

+#elif ARCH_AARCH64
+
+uint32_t x264_cpu_detect( void )
+{
+    return X264_CPU_ARMV8 | X264_CPU_NEON;
+}
+
#else

uint32_t x264_cpu_detect( void )
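On AArch64 the detection is unconditional because ARMv8-A mandates AdvSIMD (NEON), so callers simply test the returned mask. For example:

    #include <stdint.h>

    uint32_t x264_cpu_detect( void );  /* from common/cpu.c above */

    static int have_neon( void )
    {
        /* X264_CPU_NEON is the flag constant from x264's public header */
        return !!( x264_cpu_detect() & X264_CPU_NEON );
    }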
View file
x264-snapshot-20140321-2245.tar.bz2/common/dct.c -> x264-snapshot-20141104-2245.tar.bz2/common/dct.c
Changed
@@ -35,6 +35,9 @@
#if ARCH_ARM
#   include "arm/dct.h"
#endif
+#if ARCH_AARCH64
+#   include "aarch64/dct.h"
+#endif

/* the inverse of the scaling factors introduced by 8x8 fdct */
/* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
@@ -723,7 +726,7 @@
}
#endif

-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
@@ -999,10 +1002,10 @@
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
    }
#endif
-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
-#endif
+#endif // HAVE_ARMV6 || ARCH_AARCH64
#endif // HIGH_BIT_DEPTH

    pf_interlaced->interleave_8x8_cavlc =
View file
x264-snapshot-20140321-2245.tar.bz2/common/deblock.c -> x264-snapshot-20141104-2245.tar.bz2/common/deblock.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Henrik Gramner <henrik@gramner.com>
 *
 * This program is free software; you can redistribute it and/or modify
@@ -729,11 +729,14 @@
void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#endif // ARCH_PPC

-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                 int mvy_limit, int bframe );
#endif

void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
@@ -835,13 +838,14 @@
}
#endif // HAVE_ALTIVEC

-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf->deblock_luma[1] = x264_deblock_v_luma_neon;
        pf->deblock_luma[0] = x264_deblock_h_luma_neon;
        pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
+        pf->deblock_strength = x264_deblock_strength_neon;
    }
#endif
#endif // !HIGH_BIT_DEPTH
View file
x264-snapshot-20140321-2245.tar.bz2/common/frame.c -> x264-snapshot-20141104-2245.tar.bz2/common/frame.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/frame.h -> x264-snapshot-20141104-2245.tar.bz2/common/frame.h
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/macroblock.c -> x264-snapshot-20141104-2245.tar.bz2/common/macroblock.c
Changed
@@ -3,7 +3,7 @@
  *****************************************************************************
  * Copyright (C) 2003-2014 x264 project
  *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
  *          Henrik Gramner <henrik@gramner.com>
View file
x264-snapshot-20140321-2245.tar.bz2/common/macroblock.h -> x264-snapshot-20141104-2245.tar.bz2/common/macroblock.h
Changed
@@ -5,7 +5,7 @@
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/mc.c -> x264-snapshot-20141104-2245.tar.bz2/common/mc.c
Changed
@@ -35,6 +35,9 @@
 #if ARCH_ARM
 #include "arm/mc.h"
 #endif
+#if ARCH_AARCH64
+#include "aarch64/mc.h"
+#endif
 
 static inline void pixel_avg( pixel *dst,  intptr_t i_dst_stride,
@@ -641,6 +644,9 @@
 #if HAVE_ARMV6
     x264_mc_init_arm( cpu, pf );
 #endif
+#if ARCH_AARCH64
+    x264_mc_init_aarch64( cpu, pf );
+#endif
 
     if( cpu_independent )
     {
View file
x264-snapshot-20140321-2245.tar.bz2/common/mvpred.c -> x264-snapshot-20141104-2245.tar.bz2/common/mvpred.c
Changed
@@ -4,7 +4,7 @@
  * Copyright (C) 2003-2014 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *
  * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/opencl.c -> x264-snapshot-20141104-2245.tar.bz2/common/opencl.c
Changed
@@ -135,7 +135,8 @@
     rewind( fp );
     CHECKED_MALLOC( binary, size );
 
-    fread( binary, 1, size, fp );
+    if ( fread( binary, 1, size, fp ) != size )
+        goto fail;
 
     const uint8_t *ptr = (const uint8_t*)binary;
 
 #define CHECK_STRING( STR )\
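The opencl.c change is a short-read fix: fread() can return fewer bytes than requested on I/O error or truncation, and the old code went on to parse whatever happened to be in the buffer. A self-contained sketch of the checked pattern (the helper name is illustrative, not from x264):

#include <stdio.h>
#include <stdlib.h>

/* Read exactly 'size' bytes into a new buffer; fail cleanly on a short read. */
static void *read_exact( FILE *fp, size_t size )
{
    void *buf = malloc( size );
    if( !buf )
        return NULL;
    if( fread( buf, 1, size, fp ) != size )
    {
        free( buf );  /* short read: error or unexpected EOF */
        return NULL;
    }
    return buf;
}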
View file
x264-snapshot-20140321-2245.tar.bz2/common/osdep.h -> x264-snapshot-20141104-2245.tar.bz2/common/osdep.h
Changed
@@ -48,7 +48,7 @@
 #define log2(x) (log(x)/0.693147180559945)
 #endif
 
-#ifdef __ICL
+#ifdef _MSC_VER
 #define inline __inline
 #define strcasecmp _stricmp
 #define strncasecmp _strnicmp
@@ -57,10 +57,6 @@
 #define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
 #endif
 
-#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && (ARCH_X86 || ARCH_X86_64)
-#define HAVE_X86_INLINE_ASM 1
-#endif
-
 #if !defined(isfinite) && (SYS_OPENBSD || SYS_SunOS)
 #define isfinite finite
 #endif
@@ -89,7 +85,7 @@
 #define x264_is_pipe(x) 0
 #endif
 
-#ifdef __ICL
+#ifdef _MSC_VER
 #define DECLARE_ALIGNED( var, n ) __declspec(align(n)) var
 #else
 #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
@@ -156,7 +152,7 @@
 #define x264_constant_p(x) __builtin_constant_p(x)
 #define x264_nonconstant_p(x) (!__builtin_constant_p(x))
 #else
-#ifdef __ICL
+#ifdef _MSC_VER
 #define ALWAYS_INLINE __forceinline
 #define NOINLINE __declspec(noinline)
 #else
View file
x264-snapshot-20140321-2245.tar.bz2/common/pixel.c -> x264-snapshot-20141104-2245.tar.bz2/common/pixel.c
Changed
@@ -5,7 +5,7 @@
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -38,8 +38,9 @@
 #   include "arm/pixel.h"
 #   include "arm/predict.h"
 #endif
-#if ARCH_UltraSPARC
-#   include "sparc/pixel.h"
+#if ARCH_AARCH64
+#   include "aarch64/pixel.h"
+#   include "aarch64/predict.h"
 #endif
@@ -200,7 +201,7 @@
 #define PIXEL_VAR2_C( name, w, h, shift ) \
 static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \
 { \
-    uint32_t var = 0, sum = 0, sqr = 0; \
+    int var = 0, sum = 0, sqr = 0; \
     for( int y = 0; y < h; y++ ) \
     { \
         for( int x = 0; x < w; x++ ) \
@@ -212,8 +213,7 @@
         pix1 += i_stride1; \
         pix2 += i_stride2; \
     } \
-    sum = abs(sum); \
-    var = sqr - ((uint64_t)sum * sum >> shift); \
+    var = sqr - ((int64_t)sum * sum >> shift); \
     *ssd = sqr; \
     return var; \
 }
@@ -454,15 +454,6 @@
 SAD_X( 4x8 )
 SAD_X( 4x4 )
 
-#if !HIGH_BIT_DEPTH
-#if ARCH_UltraSPARC
-SAD_X( 16x16_vis )
-SAD_X( 16x8_vis )
-SAD_X( 8x16_vis )
-SAD_X( 8x8_vis )
-#endif
-#endif // !HIGH_BIT_DEPTH
-
 /****************************************************************************
  * pixel_satd_x4
  * no faster than single satd, but needed for satd to be a drop-in replacement for sad
@@ -509,7 +500,7 @@
 #endif
 
 #if !HIGH_BIT_DEPTH
-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
 SATD_X_DECL7( _neon )
 #endif
 #endif // !HIGH_BIT_DEPTH
@@ -533,7 +524,7 @@
 INTRA_MBCMP_8x8( sad, _mmx2,  _c )
 INTRA_MBCMP_8x8(sa8d, _sse2,  _sse2 )
 #endif
-#if !HIGH_BIT_DEPTH && HAVE_ARMV6
+#if !HIGH_BIT_DEPTH && (HAVE_ARMV6 || ARCH_AARCH64)
 INTRA_MBCMP_8x8( sad, _neon, _neon )
 INTRA_MBCMP_8x8(sa8d, _neon, _neon )
 #endif
@@ -593,8 +584,18 @@
 #endif
 #endif
 #if !HIGH_BIT_DEPTH && HAVE_ARMV6
-INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _neon, _c )
-INTRA_MBCMP(satd,  4x4,   v, h, dc,  , _neon, _c )
+INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _neon, _armv6 )
+INTRA_MBCMP(satd,  4x4,   v, h, dc,  , _neon, _armv6 )
+INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _neon, _neon )
+INTRA_MBCMP(satd,  8x8,  dc, h,  v, c, _neon, _neon )
+INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _neon, _c )
+INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _neon, _c )
+INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _neon, _neon )
+INTRA_MBCMP(satd, 16x16,  v, h, dc,  , _neon, _neon )
+#endif
+#if !HIGH_BIT_DEPTH && ARCH_AARCH64
+INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _neon, _neon )
+INTRA_MBCMP(satd,  4x4,   v, h, dc,  , _neon, _neon )
 INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _neon, _neon )
 INTRA_MBCMP(satd,  8x8,  dc, h,  v, c, _neon, _neon )
 INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _neon, _c )
@@ -1021,8 +1022,16 @@
     }
     if( cpu&X264_CPU_XOP )
     {
+        INIT5( sad_x3, _xop );
+        INIT5( sad_x4, _xop );
+        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_xop;
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
+        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
         pixf->vsad = x264_pixel_vsad_xop;
         pixf->asd8 = x264_pixel_asd8_xop;
+#if ARCH_X86_64
+        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
+#endif
     }
     if( cpu&X264_CPU_AVX2 )
     {
@@ -1308,6 +1317,7 @@
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
         pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_xop;
         pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
+        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_xop;
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
         pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_xop;
         pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
@@ -1394,6 +1404,46 @@
         }
     }
 #endif
+
+#if ARCH_AARCH64
+    if( cpu&X264_CPU_NEON )
+    {
+        INIT7( sad, _neon );
+        // AArch64 has no distinct instructions for aligned load/store
+        INIT7_NAME( sad_aligned, sad, _neon );
+        INIT7( sad_x3, _neon );
+        INIT7( sad_x4, _neon );
+        INIT7( ssd, _neon );
+        INIT7( satd, _neon );
+        INIT7( satd_x3, _neon );
+        INIT7( satd_x4, _neon );
+        INIT4( hadamard_ac, _neon );
+
+        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
+        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
+
+        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_neon;
+        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_neon;
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
+        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_neon;
+        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
+
+        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_neon;
+        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_neon;
+        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_neon;
+        pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8_neon;
+        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_neon;
+        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_neon;
+        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_neon;
+        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon;
+        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_neon;
+        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;
+
+        pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
+        pixf->ssim_end4       = x264_pixel_ssim_end4_neon;
+    }
+#endif // ARCH_AARCH64
+
 #endif // HIGH_BIT_DEPTH
@@ -1401,13 +1451,6 @@
 #if HAVE_ALTIVEC
     if( cpu&X264_CPU_ALTIVEC )
     {
         x264_pixel_altivec_init( pixf );
     }
 #endif
-#if !HIGH_BIT_DEPTH
-#if ARCH_UltraSPARC
-    INIT4( sad, _vis );
-    INIT4( sad_x3, _vis );
-    INIT4( sad_x4, _vis );
-#endif
-#endif // !HIGH_BIT_DEPTH
 
     pixf->ads[PIXEL_8x16] =
     pixf->ads[PIXEL_8x4] =
View file
x264-snapshot-20140321-2245.tar.bz2/common/pixel.h -> x264-snapshot-20141104-2245.tar.bz2/common/pixel.h
Changed
@@ -4,7 +4,7 @@
  * Copyright (C) 2004-2014 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Henrik Gramner <henrik@gramner.com>
  *
  * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/predict.c -> x264-snapshot-20141104-2245.tar.bz2/common/predict.c
Changed
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Henrik Gramner <henrik@gramner.com>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -40,6 +40,9 @@
 #if ARCH_ARM
 #   include "arm/predict.h"
 #endif
+#if ARCH_AARCH64
+#   include "aarch64/predict.h"
+#endif
 
 /****************************************************************************
  * 16x16 prediction for intra luma block
@@ -899,6 +902,10 @@
 #if HAVE_ARMV6
     x264_predict_16x16_init_arm( cpu, pf );
 #endif
+
+#if ARCH_AARCH64
+    x264_predict_16x16_init_aarch64( cpu, pf );
+#endif
 }
 
 void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
@@ -923,6 +930,10 @@
 #if HAVE_ARMV6
     x264_predict_8x8c_init_arm( cpu, pf );
 #endif
+
+#if ARCH_AARCH64
+    x264_predict_8x8c_init_aarch64( cpu, pf );
+#endif
 }
 
 void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
@@ -963,6 +974,10 @@
 #if HAVE_ARMV6
     x264_predict_8x8_init_arm( cpu, pf, predict_filter );
 #endif
+
+#if ARCH_AARCH64
+    x264_predict_8x8_init_aarch64( cpu, pf, predict_filter );
+#endif
 }
 
 void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
@@ -987,5 +1002,9 @@
 #if HAVE_ARMV6
     x264_predict_4x4_init_arm( cpu, pf );
 #endif
+
+#if ARCH_AARCH64
+    x264_predict_4x4_init_aarch64( cpu, pf );
+#endif
 }
View file
x264-snapshot-20140321-2245.tar.bz2/common/quant.c -> x264-snapshot-20141104-2245.tar.bz2/common/quant.c
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2005-2014 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Christian Heine <sennindemokrit@gmx.net>
 *          Henrik Gramner <henrik@gramner.com>
 *
@@ -37,6 +37,9 @@
 #if ARCH_ARM
 #   include "arm/quant.h"
 #endif
+#if ARCH_AARCH64
+#   include "aarch64/quant.h"
+#endif
 
 #define QUANT_ONE( coef, mf, f ) \
 { \
@@ -556,7 +559,6 @@
     {
 #if ARCH_X86
         pf->quant_4x4 = x264_quant_4x4_mmx;
-        pf->quant_4x4x4 = x264_quant_4x4x4_mmx;
         pf->quant_8x8 = x264_quant_8x8_mmx;
         pf->dequant_4x4 = x264_dequant_4x4_mmx;
         pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
@@ -725,8 +727,12 @@
 #if HAVE_ARMV6
     if( cpu&X264_CPU_ARMV6 )
+    {
         pf->coeff_last4 = x264_coeff_last4_arm;
-
+        pf->coeff_last8 = x264_coeff_last8_arm;
+    }
+#endif
+#if HAVE_ARMV6 || ARCH_AARCH64
     if( cpu&X264_CPU_NEON )
     {
         pf->quant_2x2_dc = x264_quant_2x2_dc_neon;
@@ -742,6 +748,13 @@
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
     }
 #endif
+#if ARCH_AARCH64
+    if( cpu&X264_CPU_ARMV8 )
+    {
+        pf->coeff_last4 = x264_coeff_last4_aarch64;
+        pf->coeff_last8 = x264_coeff_last8_aarch64;
+    }
+#endif
 #endif // HIGH_BIT_DEPTH
     pf->coeff_last[DCT_LUMA_DC]     = pf->coeff_last[DCT_CHROMAU_DC]  = pf->coeff_last[DCT_CHROMAV_DC] =
     pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4];
View file
x264-snapshot-20140321-2245.tar.bz2/common/quant.h -> x264-snapshot-20141104-2245.tar.bz2/common/quant.h
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2005-2014 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Christian Heine <sennindemokrit@gmx.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/rectangle.c -> x264-snapshot-20141104-2245.tar.bz2/common/rectangle.c
Changed
@@ -3,7 +3,7 @@
 *****************************************************************************
 * Copyright (C) 2010-2014 x264 project
 *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/rectangle.h -> x264-snapshot-20141104-2245.tar.bz2/common/rectangle.h
Changed
@@ -3,7 +3,7 @@
 *****************************************************************************
 * Copyright (C) 2003-2014 x264 project
 *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/vlc.c -> x264-snapshot-20141104-2245.tar.bz2/common/vlc.c
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2003-2014 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Henrik Gramner <henrik@gramner.com>
 *
 * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/bitstream-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/bitstream-a.asm
Changed
@@ -3,7 +3,7 @@
 ;*****************************************************************************
 ;* Copyright (C) 2010-2014 x264 project
 ;*
-;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+;* Authors: Fiona Glaser <fiona@x264.com>
 ;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/cabac-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/cabac-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2008-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Holger Lubitz <holger@lubitz.org>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/const-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/const-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2010-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/cpu-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/cpu-a.asm
Changed
@@ -5,7 +5,7 @@
 ;*
 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/dct-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/dct-a.asm
Changed
@@ -7,7 +7,7 @@
 ;*          Loren Merritt <lorenm@u.washington.edu>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Min Chen <chenm001.163.com>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/dct.h -> x264-snapshot-20141104-2245.tar.bz2/common/x86/dct.h
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/deblock-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/deblock-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2005-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/mc-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/mc-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2003-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Dylan Yudaken <dyudaken@gmail.com>
 ;*          Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/mc-a2.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/mc-a2.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2005-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Holger Lubitz <holger@lubitz.org>
 ;*          Mathieu Monnier <manao@melix.net>
 ;*          Oskar Arvidsson <oskar@irock.se>
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/mc-c.c -> x264-snapshot-20141104-2245.tar.bz2/common/x86/mc-c.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/pixel-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/pixel-a.asm
Changed
@@ -7,7 +7,7 @@
 ;*          Holger Lubitz <holger@lubitz.org>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Alex Izvorski <aizvorksi@gmail.com>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
@@ -561,10 +561,15 @@
     pshufhw m0, m0, q3120
     pshufhw m1, m1, q3120
 %endif
+%if cpuflag(xop)
+    pmadcswd m2, m0, m0, m2
+    pmadcswd m3, m1, m1, m3
+%else
     pmaddwd m0, m0
     pmaddwd m1, m1
     paddd   m2, m0
     paddd   m3, m1
+%endif
     add     r6, 2*mmsize
     jl .loopx
 %if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
@@ -657,10 +662,15 @@
     por     m0, m1
    psrlw   m2, m0, 8
     pand    m0, m5
+%if cpuflag(xop)
+    pmadcswd m4, m2, m2, m4
+    pmadcswd m3, m0, m0, m3
+%else
     pmaddwd m2, m2
     pmaddwd m0, m0
-    paddd   m3, m0
     paddd   m4, m2
+    paddd   m3, m0
+%endif
     add     r6, mmsize
     jl .loopx
 %if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
@@ -695,6 +705,8 @@
 SSD_NV12
 INIT_XMM avx
 SSD_NV12
+INIT_XMM xop
+SSD_NV12
 INIT_YMM avx2
 SSD_NV12
@@ -4677,12 +4689,13 @@
 ;-----------------------------------------------------------------------------
 ; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
 ;-----------------------------------------------------------------------------
-cglobal pixel_ssim_end4, 3,3,7
-    movdqa  m0, [r0+ 0]
-    movdqa  m1, [r0+16]
-    movdqa  m2, [r0+32]
-    movdqa  m3, [r0+48]
-    movdqa  m4, [r0+64]
+cglobal pixel_ssim_end4, 2,3
+    mov     r2d, r2m
+    mova    m0, [r0+ 0]
+    mova    m1, [r0+16]
+    mova    m2, [r0+32]
+    mova    m3, [r0+48]
+    mova    m4, [r0+64]
     paddd   m0, [r1+ 0]
     paddd   m1, [r1+16]
     paddd   m2, [r1+32]
@@ -4692,8 +4705,6 @@
     paddd   m1, m2
     paddd   m2, m3
     paddd   m3, m4
-    movdqa  m5, [ssim_c1]
-    movdqa  m6, [ssim_c2]
     TRANSPOSE4x4D 0, 1, 2, 3, 4
 
 ;   s1=m0, s2=m1, ss=m2, s12=m3
@@ -4702,20 +4713,21 @@
     cvtdq2ps m1, m1
     cvtdq2ps m2, m2
     cvtdq2ps m3, m3
+    mulps    m4, m0, m1   ; s1*s2
+    mulps    m0, m0       ; s1*s1
+    mulps    m1, m1       ; s2*s2
     mulps    m2, [pf_64]  ; ss*64
     mulps    m3, [pf_128] ; s12*128
-    movdqa   m4, m1
-    mulps    m4, m0       ; s1*s2
-    mulps    m1, m1       ; s2*s2
-    mulps    m0, m0       ; s1*s1
     addps    m4, m4       ; s1*s2*2
     addps    m0, m1       ; s1*s1 + s2*s2
     subps    m2, m0       ; vars
     subps    m3, m4       ; covar*2
-    addps    m4, m5       ; s1*s2*2 + ssim_c1
-    addps    m0, m5       ; s1*s1 + s2*s2 + ssim_c1
-    addps    m2, m6       ; vars + ssim_c2
-    addps    m3, m6       ; covar*2 + ssim_c2
+    movaps   m1, [ssim_c1]
+    addps    m4, m1       ; s1*s2*2 + ssim_c1
+    addps    m0, m1       ; s1*s1 + s2*s2 + ssim_c1
+    movaps   m1, [ssim_c2]
+    addps    m2, m1       ; vars + ssim_c2
+    addps    m3, m1       ; covar*2 + ssim_c2
 %else
     pmaddwd  m4, m1, m0   ; s1*s2
     pslld    m1, 16
@@ -4726,10 +4738,12 @@
     pslld    m2, 6
     psubd    m3, m4       ; covar*2
     psubd    m2, m0       ; vars
-    paddd    m0, m5
-    paddd    m4, m5
-    paddd    m3, m6
-    paddd    m2, m6
+    mova     m1, [ssim_c1]
+    paddd    m0, m1
+    paddd    m4, m1
+    mova     m1, [ssim_c2]
+    paddd    m3, m1
+    paddd    m2, m1
     cvtdq2ps m0, m0       ; (float)(s1*s1 + s2*s2 + ssim_c1)
     cvtdq2ps m4, m4       ; (float)(s1*s2*2 + ssim_c1)
     cvtdq2ps m3, m3       ; (float)(covar*2 + ssim_c2)
@@ -4742,20 +4756,31 @@
     cmp r2d, 4
     je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
     neg r2
+
 %ifdef PIC
     lea r3, [mask_ff + 16]
-    movdqu m1, [r3 + r2*4]
+    %xdefine %%mask r3
 %else
-    movdqu m1, [mask_ff + r2*4 + 16]
+    %xdefine %%mask mask_ff + 16
 %endif
-    pand m4, m1
+%if cpuflag(avx)
+    andps m4, [%%mask + r2*4]
+%else
+    movups m0, [%%mask + r2*4]
+    andps m4, m0
+%endif
+
 .skip:
     movhlps m0, m4
     addps   m0, m4
+%if cpuflag(ssse3)
    movshdup m4, m0
+%else
     pshuflw m4, m0, q0032
+%endif
     addss   m0, m4
 %if ARCH_X86_64 == 0
-    movd r0m, m0
+    movss r0m, m0
     fld dword r0m
 %endif
    RET
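The cpuflag(xop) paths above lean on pmadcswd, an XOP instruction that fuses pmaddwd with the accumulate that used to follow it, saving one paddd per iteration of the ssd_nv12 loops. A scalar model of what one 32-bit lane computes (saturation details elided; an illustration, not the ISA reference):

#include <stdint.h>

/* pmadcswd dst, a, b, acc per dword lane:
 * dst = a.lo*b.lo + a.hi*b.hi + acc  -- pmaddwd fused with paddd. */
static int32_t pmadcswd_lane( int16_t a_lo, int16_t a_hi,
                              int16_t b_lo, int16_t b_hi, int32_t acc )
{
    return (int32_t)a_lo * b_lo + (int32_t)a_hi * b_hi + acc;
}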
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/pixel.h -> x264-snapshot-20141104-2245.tar.bz2/common/x86/pixel.h
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -56,6 +56,7 @@
 DECL_X4( sad, sse2 )
 DECL_X4( sad, sse3 )
 DECL_X4( sad, ssse3 )
+DECL_X4( sad, xop )
 DECL_X4( sad, avx )
 DECL_X4( sad, avx2 )
 DECL_X1( ssd, mmx )
@@ -153,6 +154,9 @@
 void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1,
                                     pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );
+void x264_pixel_ssd_nv12_core_xop ( pixel *pixuv1, intptr_t stride1,
+                                    pixel *pixuv2, intptr_t stride2, int width,
+                                    int height, uint64_t *ssd_u, uint64_t *ssd_v );
 void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1,
                                     pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/predict-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/predict-a.asm
Changed
@@ -5,7 +5,7 @@
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Holger Lubitz <holger@lubitz.org>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/predict-c.c -> x264-snapshot-20141104-2245.tar.bz2/common/x86/predict-c.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/quant-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/quant-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2005-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Christian Heine <sennindemokrit@gmx.net>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*          Henrik Gramner <henrik@gramner.com>
@@ -292,14 +292,11 @@
     QUANT_4x4 0, 6
     QUANT_4x4 64, 7
     packssdw m6, m7
-    packssdw m5, m6
-    packssdw m5, m5 ; AA BB CC DD
-    packsswb m5, m5 ; A B C D
+    packssdw m5, m6 ; AAAA BBBB CCCC DDDD
     pxor     m4, m4
-    pcmpeqb  m5, m4
-    pmovmskb eax, m5
-    not      eax
-    and      eax, 0xf
+    pcmpeqd  m5, m4
+    movmskps eax, m5
+    xor      eax, 0xf
     RET
 %endmacro
@@ -444,16 +441,11 @@
     QUANT_4x4 64, 5
     QUANT_4x4 96, 6
     packssdw m5, m6
-    packssdw m4, m5
-%if mmsize == 16
-    packssdw m4, m4 ; AA BB CC DD
-%endif
-    packsswb m4, m4 ; A B C D
+    packssdw m4, m5 ; AAAA BBBB CCCC DDDD
     pxor     m3, m3
-    pcmpeqb  m4, m3
-    pmovmskb eax, m4
-    not      eax
-    and      eax, 0xf
+    pcmpeqd  m4, m3
+    movmskps eax, m4
+    xor      eax, 0xf
     RET
 %endmacro
@@ -464,7 +456,6 @@
 INIT_MMX mmx
 QUANT_AC quant_4x4, 4
 QUANT_AC quant_8x8, 16
-QUANT_4x4x4
 %endif
 
 INIT_XMM sse2
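The rewritten quant_4x4x4 epilogue returns a 4-bit mask, one bit per 4x4 block: each packed dword summarizes one block, pcmpeqd marks the all-zero ones, movmskps collects the four sign bits, and xor eax, 0xf inverts them so a set bit means "block has nonzero coefficients". The scalar equivalent, as a sketch:

#include <stdint.h>

/* Bit b of the result is set iff block b contains a nonzero coefficient,
 * mirroring the pcmpeqd/movmskps/xor 0xf sequence. */
static int quant_nonzero_mask( const int16_t dct[4][16] )
{
    int mask = 0;
    for( int b = 0; b < 4; b++ )
    {
        int nz = 0;
        for( int i = 0; i < 16; i++ )
            nz |= dct[b][i];
        mask |= (nz != 0) << b;
    }
    return mask;
}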
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/quant.h -> x264-snapshot-20141104-2245.tar.bz2/common/x86/quant.h
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2005-2014 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Christian Heine <sennindemokrit@gmx.net>
 *
 * This program is free software; you can redistribute it and/or modify
@@ -31,7 +31,6 @@
 int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias );
 int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_4x4x4_mmx( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
 int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
 int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/sad-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/sad-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2003-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Alex Izvorski <aizvorksi@gmail.com>
 ;*
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/sad16-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/sad16-a.asm
Changed
@@ -519,6 +519,19 @@
 SAD_X 4,  8, 16
 SAD_X 4,  8,  8
 SAD_X 4,  8,  4
+INIT_XMM xop
+%define XMM_REGS 7
+SAD_X 3, 16, 16
+SAD_X 3, 16,  8
+SAD_X 3,  8, 16
+SAD_X 3,  8,  8
+SAD_X 3,  8,  4
+%define XMM_REGS 9
+SAD_X 4, 16, 16
+SAD_X 4, 16,  8
+SAD_X 4,  8, 16
+SAD_X 4,  8,  8
+SAD_X 4,  8,  4
 INIT_YMM avx2
 %define XMM_REGS 7
 SAD_X 3, 16, 16
@@ -533,7 +546,12 @@
 
 %macro INTRA_SAD_X3_4x4 0
 cglobal intra_sad_x3_4x4, 3,3,7
+%if cpuflag(ssse3)
     movddup   m0, [r1-1*FDEC_STRIDEB]
+%else
+    movq      m0, [r1-1*FDEC_STRIDEB]
+    punpcklqdq m0, m0
+%endif
     movq      m1, [r0+0*FENC_STRIDEB]
     movq      m2, [r0+2*FENC_STRIDEB]
     pshuflw   m6, m0, q1032
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/util.h -> x264-snapshot-20141104-2245.tar.bz2/common/x86/util.h
Changed
@@ -3,7 +3,7 @@
 *****************************************************************************
 * Copyright (C) 2008-2014 x264 project
 *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/x86inc.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/x86inc.asm
Changed
@@ -5,7 +5,7 @@
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Anton Mitrofanov <BugMaster@narod.ru>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* Permission to use, copy, modify, and/or distribute this software for any
@@ -90,9 +90,6 @@
     default rel
 %endif
 
-; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
-CPU amdnop
-
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
 ; into registers at the start, and make no other use of the stack. Luckily that
@@ -756,19 +753,26 @@
 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
 %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
 
-; Takes up to 2 cpuflags from the above list.
+; Takes an arbitrary number of cpuflags from the above list.
 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
-%macro INIT_CPUFLAGS 0-2
-    CPU amdnop
+%macro INIT_CPUFLAGS 0-*
+    %xdefine SUFFIX
+    %undef cpuname
+    %assign cpuflags 0
+
     %if %0 >= 1
-        %xdefine cpuname %1
-        %assign cpuflags cpuflags_%1
-        %if %0 >= 2
-            %xdefine cpuname %1_%2
-            %assign cpuflags cpuflags | cpuflags_%2
-        %endif
+        %rep %0
+            %ifdef cpuname
+                %xdefine cpuname cpuname %+ _%1
+            %else
+                %xdefine cpuname %1
+            %endif
+            %assign cpuflags cpuflags | cpuflags_%1
+            %rotate 1
+        %endrep
         %xdefine SUFFIX _ %+ cpuname
+
         %if cpuflag(avx)
             %assign avx_enabled 1
         %endif
@@ -779,16 +783,15 @@
         %endif
         %if cpuflag(aligned)
             %define movu mova
-        %elifidn %1, sse3
+        %elif cpuflag(sse3) && notcpuflag(ssse3)
             %define movu lddqu
         %endif
-        %if ARCH_X86_64 == 0 && notcpuflag(sse2)
-            CPU basicnop
-        %endif
+    %endif
+
+    %if ARCH_X86_64 || cpuflag(sse2)
+        CPU amdnop
     %else
-        %xdefine SUFFIX
-        %undef cpuname
-        %undef cpuflags
+        CPU basicnop
     %endif
 %endmacro
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/x86util.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/x86util.asm
Changed
@@ -298,11 +298,16 @@
     paddd   %1, %2
 %endif
 %if mmsize >= 16
+%if cpuflag(xop) && sizeof%1 == 16
+    vphadddq %1, %1
+%endif
     movhlps %2, %1
     paddd   %1, %2
 %endif
+%if notcpuflag(xop) || sizeof%1 != 16
     PSHUFLW %2, %1, q0032
     paddd   %1, %2
+%endif
     %undef %1
     %undef %2
 %endmacro
View file
x264-snapshot-20140321-2245.tar.bz2/configure -> x264-snapshot-20141104-2245.tar.bz2/configure
Changed
@@ -73,32 +73,36 @@
     echo "$1" >> config.log
 }
 
-intel_cflags() {
-    # Intel Compiler issues an incredibly large number of warnings on any warning level,
+cc_cflags() {
+    # several non gcc compilers issue an incredibly large number of warnings on any warning level,
     # suppress them by disabling all warnings rather than having to use #pragmas to disable most of them
     for arg in $*; do
         [ $arg = -ffast-math ] && arg=
         [[ "$arg" = -falign-loops* ]] && arg=
         [ "$arg" = -fno-tree-vectorize ] && arg=
         [ "$arg" = -Wshadow ] && arg=
+        [ "$arg" = -Wno-maybe-uninitialized ] && arg=
         [[ "$arg" = -mpreferred-stack-boundary* ]] && arg=
         [[ "$arg" = -l* ]] && arg=
        [[ "$arg" = -L* ]] && arg=
-        if [ $compiler = ICL ]; then
+        if [ $compiler_style = MS ]; then
             [ "$arg" = -Wall ] && arg=-W0
+            [ "$arg" = -Werror ] && arg="-W3 -WX"
             [ "$arg" = -g ] && arg=-Z7
             [ "$arg" = -fomit-frame-pointer ] && arg=
             [ "$arg" = -s ] && arg=
             [ "$arg" = -fPIC ] && arg=
         else
             [ "$arg" = -Wall ] && arg=-w0
+            [ "$arg" = -Werror ] && arg="-w3 -Werror"
         fi
+        [ $compiler = CL -a "$arg" = -O3 ] && arg=-O2
 
         [ -n "$arg" ] && echo -n "$arg "
     done
 }
 
-icl_ldflags() {
+cl_ldflags() {
     for arg in $*; do
         arg=${arg/LIBPATH/libpath}
         [ ${arg#-libpath:} == $arg -a ${arg#-l} != $arg ] && arg=${arg#-l}.lib
@@ -106,6 +110,11 @@
         [ $arg = -Wl,--large-address-aware ] && arg=-largeaddressaware
         [ $arg = -s ] && arg=
         [ "$arg" = -Wl,-Bsymbolic ] && arg=
+        [ "$arg" = -fno-tree-vectorize ] && arg=
+        [ "$arg" = -Werror ] && arg=
+        [ "$arg" = -Wshadow ] && arg=
+        [ "$arg" = -Wmaybe-uninitialized ] && arg=
+        [[ "$arg" = -Qdiag-error* ]] && arg=
 
         arg=${arg/pthreadGC/pthreadVC}
         [ "$arg" = avifil32.lib ] && arg=vfw32.lib
@@ -135,11 +144,11 @@
     fi
     rm -f conftest.c
     [ -n "$1" ] && echo "#include <$1>" > conftest.c
-    echo "int main () { $3 return 0; }" >> conftest.c
-    if [ $compiler = ICL ]; then
-        cc_cmd="$CC conftest.c $(intel_cflags $CFLAGS $2) -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)"
+    echo "int main (void) { $3 return 0; }" >> conftest.c
+    if [ $compiler_style = MS ]; then
+        cc_cmd="$CC conftest.c $(cc_cflags $CFLAGS $CHECK_CFLAGS $2) -link $(cl_ldflags $2 $LDFLAGSCLI $LDFLAGS)"
     else
-        cc_cmd="$CC conftest.c $CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest"
+        cc_cmd="$CC conftest.c $CFLAGS $CHECK_CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest"
     fi
     if $cc_cmd >conftest.log 2>&1; then
         res=$?
@@ -165,8 +174,12 @@
     rm -f conftest.c
     [ -n "$1" ] && echo "#include <$1>" > conftest.c
     echo -e "#if !($3) \n#error $4 \n#endif " >> conftest.c
-
-    if $CC conftest.c $CFLAGS $2 -E -o conftest >conftest.log 2>&1; then
+    if [ $compiler_style = MS ]; then
+        cpp_cmd="$CC conftest.c $(cc_cflags $CFLAGS $2) -P"
+    else
+        cpp_cmd="$CC conftest.c $CFLAGS $2 -E -o conftest"
+    fi
+    if $cpp_cmd >conftest.log 2>&1; then
        res=$?
        log_ok
     else
@@ -185,8 +198,9 @@
 
 as_check() {
     log_check "whether $AS supports $1"
-    echo "$1" > conftest.asm
-    if $AS conftest.asm $ASFLAGS $2 -o conftest.o >conftest.log 2>&1; then
+    echo "$1" > conftest$AS_EXT
+    as_cmd="$AS conftest$AS_EXT $ASFLAGS $2 -o conftest.o"
+    if $as_cmd >conftest.log 2>&1; then
        res=$?
        log_ok
     else
@@ -194,12 +208,12 @@
        log_fail
        log_msg "Failed commandline was:"
        log_msg "--------------------------------------------------"
-        log_msg "$AS conftest.asm $ASFLAGS $2 -o conftest.o"
+        log_msg "$as_cmd"
        cat conftest.log >> config.log
        log_msg "--------------------------------------------------"
        log_msg "Failed program was:"
        log_msg "--------------------------------------------------"
-        cat conftest.asm >> config.log
+        cat conftest$AS_EXT >> config.log
        log_msg "--------------------------------------------------"
     fi
     return $res
@@ -208,10 +222,10 @@
 rc_check() {
     log_check "whether $RC works"
     echo "$1" > conftest.rc
-    if [ $compiler = ICL ]; then
-        rc_cmd="$RC $RCFLAGS -foconftest.o conftest.rc"
-    else
+    if [ $compiler = GNU ]; then
         rc_cmd="$RC $RCFLAGS -o conftest.o conftest.rc"
+    else
+        rc_cmd="$RC $RCFLAGS -foconftest.o conftest.rc"
     fi
     if $rc_cmd >conftest.log 2>&1; then
        res=$?
@@ -278,21 +292,26 @@
 bit_depth="8"
 chroma_format="all"
 compiler="GNU"
+compiler_style="GNU"
 opencl="yes"
 
 CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)"
 LDFLAGS="$LDFLAGS"
 LDFLAGSCLI="$LDFLAGSCLI"
-ASFLAGS="$ASFLAGS"
+ASFLAGS="$ASFLAGS -I. -I\$(SRCPATH)"
 RCFLAGS="$RCFLAGS"
+CHECK_CFLAGS=""
 HAVE_GETOPT_LONG=1
 cross_prefix=""
 
 EXE=""
+AS_EXT=".S"
+NL="
+"
 
 # list of all preprocessor HAVE values we can define
 CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \
-             LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH"
+             LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC"
 
 # parse options
@@ -439,23 +458,44 @@
 host_vendor="${host%%-*}"
 host_os="${host#*-}"
 
-# test for use of Intel Compiler
+# test for use of compilers that require specific handling
+cc_base=`basename "$CC"`
+QPRE="-"
 if [[ $host_os = mingw* || $host_os = cygwin* ]]; then
-    if [[ `basename "$CC"` = icl* ]]; then
+    if [[ "$cc_base" = icl || "$cc_base" = icl[\ .]* ]]; then
         # Windows Intel Compiler creates dependency generation with absolute Windows paths, Cygwin's make does not support Windows paths.
        [[ $host_os = cygwin* ]] && die "Windows Intel Compiler support requires MSYS"
        compiler=ICL
+        compiler_style=MS
        CFLAGS="$CFLAGS -Qstd=c99 -nologo -Qms0 -DHAVE_STRING_H -I\$(SRCPATH)/extras"
        QPRE="-Q"
        `$CC 2>&1 | grep -q IA-32` && host_cpu=i486
        `$CC 2>&1 | grep -q "Intel(R) 64"` && host_cpu=x86_64
        cpp_check "" "" "_MSC_VER >= 1400" || die "Windows Intel Compiler support requires Visual Studio 2005 or newer"
+        if cc_check '' -Qdiag-error:10006,10157 ; then
+            CHECK_CFLAGS="$CHECK_CFLAGS -Qdiag-error:10006,10157"
+        fi
+    elif [[ "$cc_base" = cl || "$cc_base" = cl[\ .]* ]]; then
+        # Standard Microsoft Visual Studio
+        # Dependency creation includes absolute windows paths, Cygwin's make does not support Windows paths.
+        [[ $host_os = cygwin* ]] && die "Microsoft Visual Studio support requires MSYS"
+        compiler=CL
+        compiler_style=MS
+        CFLAGS="$CFLAGS -nologo -DHAVE_STRING_H -I\$(SRCPATH)/extras"
+        `$CC 2>&1 | grep -q 'for x86'` && host_cpu=i486
+        `$CC 2>&1 | grep -q 'for x64'` && host_cpu=x86_64
+        cpp_check '' '' '_MSC_VER > 1800 || (_MSC_VER == 1800 && _MSC_FULL_VER >= 180030324)' || die "Microsoft Visual Studio support requires Visual Studio 2013 Update 2 or newer"
     fi
 else
-    if [[ `basename "$CC"` = icc* ]]; then
+    if [[ "$cc_base" = icc || "$cc_base" = icc[\ .]* ]]; then
        AR="xiar"
        compiler=ICC
-        QPRE="-"
     fi
 fi
+
+if [[ "$cc_base" = clang || "$cc_base" = clang[\ .]* ]]; then
+    if cc_check '' -Werror=unknown-warning-option ; then
+        CHECK_CFLAGS="$CHECK_CFLAGS -Werror=unknown-warning-option"
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/analyse.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/analyse.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/cabac.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/cabac.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/cavlc.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/cavlc.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/encoder.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/encoder.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -97,11 +97,14 @@
         int cw = h->param.i_width>>1;
         int ch = h->param.i_height>>CHROMA_V_SHIFT;
         pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) );
-        pixel *planev = planeu + cw*ch + 16;
-        h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
-        fwrite( planeu, 1, cw*ch*sizeof(pixel), f );
-        fwrite( planev, 1, cw*ch*sizeof(pixel), f );
-        x264_free( planeu );
+        if( planeu )
+        {
+            pixel *planev = planeu + cw*ch + 16;
+            h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
+            fwrite( planeu, 1, cw*ch*sizeof(pixel), f );
+            fwrite( planev, 1, cw*ch*sizeof(pixel), f );
+            x264_free( planeu );
+        }
     }
     fclose( f );
 }
@@ -412,6 +415,12 @@
 static int x264_validate_parameters( x264_t *h, int b_open )
 {
+    if( !h->param.pf_log )
+    {
+        x264_log( NULL, X264_LOG_ERROR, "pf_log not set! did you forget to call x264_param_default?\n" );
+        return -1;
+    }
+
 #if HAVE_MMX
     if( b_open )
     {
@@ -818,6 +827,8 @@
         /* 8x8dct is not useful without RD in CAVLC lossless */
         if( !h->param.b_cabac && h->param.analyse.i_subpel_refine < 6 )
             h->param.analyse.b_transform_8x8 = 0;
+        h->param.analyse.inter &= ~X264_ANALYSE_I8x8;
+        h->param.analyse.intra &= ~X264_ANALYSE_I8x8;
     }
     if( h->param.rc.i_rc_method == X264_RC_CQP )
     {
@@ -1403,7 +1414,11 @@
     /* Init x264_t */
     h->i_frame = -1;
     h->i_frame_num = 0;
-    h->i_idr_pic_id = 0;
+
+    if( h->param.i_avcintra_class )
+        h->i_idr_pic_id = 5;
+    else
+        h->i_idr_pic_id = 0;
 
     if( (uint64_t)h->param.i_timebase_den * 2 > UINT32_MAX )
     {
@@ -2154,6 +2169,31 @@
             h->fref[1][h->i_ref[1]++] = h->frames.reference[i];
     }
 
+    if( h->sh.i_mmco_remove_from_end )
+    {
+        /* Order ref0 for MMCO remove */
+        do
+        {
+            b_ok = 1;
+            for( int i = 0; i < h->i_ref[0] - 1; i++ )
+            {
+                if( h->fref[0][i]->i_frame < h->fref[0][i+1]->i_frame )
+                {
+                    XCHG( x264_frame_t*, h->fref[0][i], h->fref[0][i+1] );
+                    b_ok = 0;
+                    break;
+                }
+            }
+        } while( !b_ok );
+
+        for( int i = h->i_ref[0]-1; i >= h->i_ref[0] - h->sh.i_mmco_remove_from_end; i-- )
+        {
+            int diff = h->i_frame_num - h->fref[0][i]->i_frame_num;
+            h->sh.mmco[h->sh.i_mmco_command_count].i_poc = h->fref[0][i]->i_poc;
+            h->sh.mmco[h->sh.i_mmco_command_count++].i_difference_of_pic_nums = diff;
+        }
+    }
+
     /* Order reference lists by distance from the current frame. */
     for( int list = 0; list < 2; list++ )
     {
@@ -2176,14 +2216,6 @@
         } while( !b_ok );
     }
 
-    if( h->sh.i_mmco_remove_from_end )
-        for( int i = h->i_ref[0]-1; i >= h->i_ref[0] - h->sh.i_mmco_remove_from_end; i-- )
-        {
-            int diff = h->i_frame_num - h->fref[0][i]->i_frame_num;
-            h->sh.mmco[h->sh.i_mmco_command_count].i_poc = h->fref[0][i]->i_poc;
-            h->sh.mmco[h->sh.i_mmco_command_count++].i_difference_of_pic_nums = diff;
-        }
-
     x264_reference_check_reorder( h );
 
     h->i_ref[1] = X264_MIN( h->i_ref[1], h->frames.i_max_ref1 );
@@ -2438,7 +2470,24 @@
         x264_slice_header_init( h, &h->sh, h->sps, h->pps, h->i_idr_pic_id, h->i_frame_num, i_global_qp );
 
         /* alternate id */
-        h->i_idr_pic_id ^= 1;
+        if( h->param.i_avcintra_class )
+        {
+            switch( h->i_idr_pic_id )
+            {
+                case 5:
+                    h->i_idr_pic_id = 3;
+                    break;
+                case 3:
+                    h->i_idr_pic_id = 4;
+                    break;
+                case 4:
+                default:
+                    h->i_idr_pic_id = 5;
+                    break;
+            }
+        }
+        else
+            h->i_idr_pic_id ^= 1;
     }
     else
     {
@@ -3539,15 +3588,15 @@
                 return -1;
             overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
         }
+    }
 
-        if( h->param.i_frame_packing >= 0 )
-        {
-            x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-            x264_sei_frame_packing_write( h, &h->out.bs );
-            if( x264_nal_end( h ) )
-                return -1;
-            overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
-        }
+    if( h->param.i_frame_packing >= 0 && (h->fenc->b_keyframe || h->param.i_frame_packing == 5) )
+    {
+        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
+        x264_sei_frame_packing_write( h, &h->out.bs );
+        if( x264_nal_end( h ) )
+            return -1;
+        overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
     }
 
     /* generate sei pic timing */
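For AVC-Intra output the encoder.c hunks pin idr_pic_id to the fixed 5 -> 3 -> 4 -> 5 rotation used by the format's reference streams, instead of the usual 0/1 toggle. The cycle, extracted into a small sketch (values taken directly from the hunk; the helper name is illustrative):

/* AVC-Intra walks idr_pic_id through 5, 3, 4, 5, ...;
 * ordinary streams just alternate 0 and 1. */
static int next_idr_pic_id( int cur, int avcintra_class )
{
    if( !avcintra_class )
        return cur ^ 1;
    switch( cur )
    {
        case 5:  return 3;
        case 3:  return 4;
        default: return 5; /* case 4, or any reset */
    }
}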
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/macroblock.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/macroblock.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Henrik Gramner <henrik@gramner.com>
 *
 * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/me.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/me.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/ratecontrol.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/ratecontrol.c
Changed
@@ -6,7 +6,7 @@
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Michael Niedermayer <michaelni@gmx.at>
 *          Gabriel Bouvigne <gabriel.bouvigne@joost.com>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Måns Rullgård <mru@mru.ath.cx>
 *
 * This program is free software; you can redistribute it and/or modify
@@ -158,7 +158,7 @@
     double frame_size_maximum;  /* Maximum frame size due to MinCR */
     double frame_size_planned;
     double slice_size_planned;
-    predictor_t (*row_pred)[2];
+    predictor_t *row_pred;
     predictor_t row_preds[3][2];
     predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
     int bframes;                /* # consecutive B-frames before this P-frame */
@@ -1418,7 +1418,7 @@
         memset( h->fdec->i_row_bits, 0, h->mb.i_mb_height * sizeof(int) );
         memset( h->fdec->f_row_qp, 0, h->mb.i_mb_height * sizeof(float) );
         memset( h->fdec->f_row_qscale, 0, h->mb.i_mb_height * sizeof(float) );
-        rc->row_pred = &rc->row_preds[h->sh.i_type];
+        rc->row_pred = rc->row_preds[h->sh.i_type];
         rc->buffer_rate = h->fenc->i_cpb_duration * rc->vbv_max_rate * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
         update_vbv_plan( h, overhead );
@@ -1504,7 +1504,7 @@
     /* average between two predictors:
      * absolute SATD, and scaled bit cost of the colocated row in the previous frame */
     x264_ratecontrol_t *rc = h->rc;
-    float pred_s = predict_size( rc->row_pred[0], qscale, h->fdec->i_row_satd[y] );
+    float pred_s = predict_size( &rc->row_pred[0], qscale, h->fdec->i_row_satd[y] );
     if( h->sh.i_type == SLICE_TYPE_I || qscale >= h->fref[0][0]->f_row_qscale[y] )
     {
         if( h->sh.i_type == SLICE_TYPE_P
@@ -1522,7 +1522,7 @@
     /* Our QP is lower than the reference! */
     else
     {
-        float pred_intra = predict_size( rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y] );
+        float pred_intra = predict_size( &rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y] );
         /* Sum: better to overestimate than underestimate by using only one of the two predictors. */
         return pred_intra + pred_s;
     }
@@ -1570,9 +1570,9 @@
     h->fdec->f_row_qp[y] = rc->qpm;
     h->fdec->f_row_qscale[y] = qscale;
 
-    update_predictor( rc->row_pred[0], qscale, h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
+    update_predictor( &rc->row_pred[0], qscale, h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
     if( h->sh.i_type == SLICE_TYPE_P && rc->qpm < h->fref[0][0]->f_row_qp[y] )
-        update_predictor( rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] );
+        update_predictor( &rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] );
 
     /* update ratecontrol per-mbpair in MBAFF */
     if( SLICE_MBAFF && !(y&1) )
@@ -2612,7 +2612,7 @@
         x264_t *t = h->thread[i];
         if( t != h )
             memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) );
-        t->rc->row_pred = &t->rc->row_preds[h->sh.i_type];
+        t->rc->row_pred = t->rc->row_preds[h->sh.i_type];
         /* Calculate the planned slice size. */
         if( rc->b_vbv && rc->frame_size_planned )
         {
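The row_pred retyping is a pointer-arithmetic fix. With the old predictor_t (*row_pred)[2], indexing row_pred[1] strides by a whole pair and decays to the first predictor of the next slice type's pair; with a plain predictor_t * aimed at one pair, &row_pred[1] is the second predictor of that same pair, which is what the intra-predictor path intends. A compilable illustration of the difference (types simplified):

#include <stdio.h>

typedef struct { float coeff, count; } predictor_t;

int main( void )
{
    predictor_t preds[3][2];
    int type = 0;

    predictor_t (*as_pair)[2] = &preds[type]; /* old declaration */
    predictor_t *as_elem      = preds[type];  /* new declaration */

    /* pointer-to-array indexing strides by sizeof(predictor_t[2]) */
    printf( "as_pair[1] is preds[%d][0]: %d\n", type+1,
            as_pair[1] == &preds[type+1][0] );
    /* element-pointer indexing stays inside the selected pair */
    printf( "&as_elem[1] is preds[%d][1]: %d\n", type,
            &as_elem[1] == &preds[type][1] );
    return 0;
}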
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/rdo.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/rdo.c
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2005-2014 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/set.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/set.c
Changed
@@ -675,7 +675,9 @@
         bs_write( &q, 4, 0 ); // frame1_grid_position_y
     }
     bs_write( &q, 8, 0 ); // frame_packing_arrangement_reserved_byte
-    bs_write_ue( &q, 1 ); // frame_packing_arrangement_repetition_period
+    // "frame_packing_arrangement_repetition_period equal to 1 specifies that the frame packing arrangement SEI message persists in output"
+    // for (i_frame_packing == 5) this will undermine current_frame_is_frame0_flag which must alternate every view sequence
+    bs_write_ue( &q, h->param.i_frame_packing != 5 ); // frame_packing_arrangement_repetition_period
     bs_write1( &q, 0 ); // frame_packing_arrangement_extension_flag
 
     bs_align_10( &q );
@@ -740,11 +742,15 @@
         data[20] = 0x13;
         /* These bytes appear to be some sort of frame/seconds counter in certain applications,
          * but others jump around, so leave them as zero for now */
-        data[21] = data[22] = 0;
-
+        data[22] = data[23] = data[25] = data[26] = 0;
         data[28] = 0x14;
+        data[30] = data[31] = data[33] = data[34] = 0;
         data[36] = 0x60;
         data[41] = 0x22; /* Believed to be some sort of end of basic UMID identifier */
+        data[60] = 0x62;
+        data[62] = data[63] = data[65] = data[66] = 0;
+        data[68] = 0x63;
+        data[70] = data[71] = data[73] = data[74] = 0;
 
         x264_sei_write( &h->out.bs, data, len, SEI_USER_DATA_UNREGISTERED );
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/slicetype.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/slicetype.c
Changed
@@ -3,7 +3,7 @@
 *****************************************************************************
 * Copyright (C) 2005-2014 x264 project
 *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Dylan Yudaken <dyudaken@gmail.com>
 *
View file
x264-snapshot-20140321-2245.tar.bz2/filters/filters.c -> x264-snapshot-20141104-2245.tar.bz2/filters/filters.c
Changed
@@ -38,13 +38,13 @@
     if( sep_count == 0 )
     {
         if( string[0] == '\0' )
-            return calloc( 1, sizeof( char** ) );
-        char **ret = calloc( 2, sizeof( char** ) );
+            return calloc( 1, sizeof( char* ) );
+        char **ret = calloc( 2, sizeof( char* ) );
         ret[0] = strdup( string );
         return ret;
     }
 
-    char **split = calloc( ( limit > 0 ? limit : sep_count ) + 2, sizeof(char**) );
+    char **split = calloc( ( limit > 0 ? limit : sep_count ) + 2, sizeof(char*) );
     int i = 0;
     char *str = strdup( string );
     assert( str );
@@ -104,7 +104,7 @@
     while( options[options_count] != NULL )
         ++options_count;
 
-    char **opts = calloc( split_count * 2 + 2, sizeof( char ** ) );
+    char **opts = calloc( split_count * 2 + 2, sizeof( char * ) );
     char **arg = NULL;
     int opt = 0, found_named = 0, invalid = 0;
     for( int i = 0; split[i] != NULL; i++, invalid = 0 )
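The filters.c hunks fix the element size passed to calloc: the arrays hold char * elements, so sizeof(char*) is the correct unit; sizeof(char**) happens to match on typical platforms, which is why the old code never misbehaved in practice. The sizeof(*ptr) idiom sidesteps this class of mistake entirely; a short sketch (helper name illustrative):

#include <stdlib.h>

/* Allocate an array of n string pointers. sizeof(*ret) derives the element
 * size from the pointer's own type, so it cannot drift out of sync. */
static char **alloc_string_array( size_t n )
{
    char **ret = calloc( n, sizeof(*ret) );
    return ret;
}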
View file
x264-snapshot-20140321-2245.tar.bz2/filters/video/select_every.c -> x264-snapshot-20141104-2245.tar.bz2/filters/video/select_every.c
Changed
@@ -51,7 +51,7 @@
     printf( "            apply a selection pattern to input frames\n"
             "            step: the number of frames in the pattern\n"
             "            offsets: the offset into the step to select a frame\n"
-            "            see: http://avisynth.org/mediawiki/Select#SelectEvery\n" );
+            "            see: http://avisynth.nl/index.php/Select#SelectEvery\n" );
 }
 
 static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
View file
x264-snapshot-20140321-2245.tar.bz2/input/avs.c -> x264-snapshot-20141104-2245.tar.bz2/input/avs.c
Changed
@@ -298,7 +298,10 @@
             opt->input_range = opt->output_range;
         }
         const char *arg_name[] = { NULL, "interlaced", "matrix" };
-        AVS_Value arg_arr[] = { res, avs_new_value_bool( info->interlaced ), avs_new_value_string( matrix ) };
+        AVS_Value arg_arr[3];
+        arg_arr[0] = res;
+        arg_arr[1] = avs_new_value_bool( info->interlaced );
+        arg_arr[2] = avs_new_value_string( matrix );
         AVS_Value res2 = h->func.avs_invoke( h->env, conv_func, avs_new_value_array( arg_arr, arg_count ), arg_name );
         FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert input clip to %s\n", csp )
         res = update_clip( h, &vi, res2, res );
@@ -308,7 +311,9 @@
     {
         const char *levels = opt->output_range ? "TV->PC" : "PC->TV";
         x264_cli_log( "avs", X264_LOG_WARNING, "performing %s conversion\n", levels );
-        AVS_Value arg_arr[] = { res, avs_new_value_string( levels ) };
+        AVS_Value arg_arr[2];
+        arg_arr[0] = res;
+        arg_arr[1] = avs_new_value_string( levels );
         const char *arg_name[] = { NULL, "levels" };
         AVS_Value res2 = h->func.avs_invoke( h->env, "ColorYUV", avs_new_value_array( arg_arr, 2 ), arg_name );
         FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert range: %s\n", avs_as_error( res2 ) )
View file
x264-snapshot-20140321-2245.tar.bz2/input/ffms.c -> x264-snapshot-20141104-2245.tar.bz2/input/ffms.c
Changed
@@ -177,8 +177,9 @@
 
 static int picture_alloc( cli_pic_t *pic, int csp, int width, int height )
 {
-    if( x264_cli_pic_alloc( pic, csp, width, height ) )
+    if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) )
         return -1;
+    pic->img.csp = csp;
     pic->img.planes = 4;
     return 0;
 }
View file
x264-snapshot-20140321-2245.tar.bz2/input/lavf.c -> x264-snapshot-20141104-2245.tar.bz2/input/lavf.c
Changed
@@ -42,12 +42,6 @@
     cli_pic_t *first_pic;
 } lavf_hnd_t;
 
-#define x264_free_packet( pkt )\
-{\
-    av_free_packet( pkt );\
-    av_init_packet( pkt );\
-}
-
 /* handle the deprecated jpeg pixel formats */
 static int handle_jpeg( int csp, int *fullrange )
 {
@@ -70,9 +64,7 @@
         {
             XCHG( cli_image_t, p_pic->img, h->first_pic->img );
             p_pic->pts = h->first_pic->pts;
-            XCHG( void*, p_pic->opaque, h->first_pic->opaque );
         }
-        lavf_input.release_frame( h->first_pic, NULL );
         lavf_input.picture_clean( h->first_pic );
         free( h->first_pic );
         h->first_pic = NULL;
@@ -81,9 +73,11 @@
     }
 
     AVCodecContext *c = h->lavf->streams[h->stream_id]->codec;
-    AVPacket *pkt = p_pic->opaque;
 
-    avcodec_get_frame_defaults( h->frame );
+    AVPacket pkt;
+    av_init_packet( &pkt );
+    pkt.data = NULL;
+    pkt.size = 0;
 
     while( i_frame >= h->next_frame )
     {
@@ -91,20 +85,23 @@
         int ret = 0;
         do
         {
-            ret = av_read_frame( h->lavf, pkt );
+            ret = av_read_frame( h->lavf, &pkt );
 
-            if( pkt->stream_index == h->stream_id )
+            if( ret < 0 )
             {
-                if( ret < 0 )
-                    pkt->size = 0;
+                av_init_packet( &pkt );
+                pkt.data = NULL;
+                pkt.size = 0;
+            }
 
-                c->reordered_opaque = pkt->pts;
-                if( avcodec_decode_video2( c, h->frame, &finished, pkt ) < 0 )
+            if( ret < 0 || pkt.stream_index == h->stream_id )
+            {
+                if( avcodec_decode_video2( c, h->frame, &finished, &pkt ) < 0 )
                     x264_cli_log( "lavf", X264_LOG_WARNING, "video decoding failed on frame %d\n", h->next_frame );
             }
-            /* if the packet successfully decoded but the data from it is not desired, free it */
-            else if( ret >= 0 )
-                x264_free_packet( pkt );
+
+            if( ret >= 0 )
+                av_free_packet( &pkt );
         } while( !finished && ret >= 0 );
 
         if( !finished )
@@ -130,10 +127,10 @@
     if( h->vfr_input )
     {
         p_pic->pts = p_pic->duration = 0;
-        if( c->has_b_frames && h->frame->reordered_opaque != AV_NOPTS_VALUE )
-            p_pic->pts = h->frame->reordered_opaque;
-        else if( pkt->dts != AV_NOPTS_VALUE )
-            p_pic->pts = pkt->dts; // for AVI files
+        if( h->frame->pkt_pts != AV_NOPTS_VALUE )
+            p_pic->pts = h->frame->pkt_pts;
+        else if( h->frame->pkt_dts != AV_NOPTS_VALUE )
+            p_pic->pts = h->frame->pkt_dts; // for AVI files
         else if( info )
         {
            h->vfr_input = info->vfr = 0;
@@ -153,7 +150,7 @@
     if( !strcmp( psz_filename, "-" ) )
         psz_filename = "pipe:";
 
-    h->frame = avcodec_alloc_frame();
+    h->frame = av_frame_alloc();
     if( !h->frame )
         return -1;
 
@@ -220,13 +217,10 @@
 
 static int picture_alloc( cli_pic_t *pic, int csp, int width, int height )
 {
-    if( x264_cli_pic_alloc( pic, csp, width, height ) )
+    if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) )
         return -1;
+    pic->img.csp = csp;
     pic->img.planes = 4;
-    pic->opaque = malloc( sizeof(AVPacket) );
-    if( !pic->opaque )
-        return -1;
-    av_init_packet( pic->opaque );
     return 0;
 }
 
@@ -235,15 +229,8 @@
     return read_frame_internal( pic, handle, i_frame, NULL );
 }
 
-static int release_frame( cli_pic_t *pic, hnd_t handle )
-{
-    x264_free_packet( pic->opaque );
-    return 0;
-}
-
 static void picture_clean( cli_pic_t *pic )
 {
-    free( pic->opaque );
     memset( pic, 0, sizeof(cli_pic_t) );
 }
 
@@ -252,13 +239,9 @@
     lavf_hnd_t *h = handle;
     avcodec_close( h->lavf->streams[h->stream_id]->codec );
     avformat_close_input( &h->lavf );
-#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(54, 28, 0)
-    avcodec_free_frame( &h->frame );
-#else
-    av_freep( &h->frame );
-#endif
+    av_frame_free( &h->frame );
     free( h );
     return 0;
 }
 
-const cli_input_t lavf_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file };
+const cli_input_t lavf_input = { open_file, picture_alloc, read_frame, NULL, picture_clean, close_file };
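The lavf.c rewrite drops the per-picture heap AVPacket and the deprecated avcodec_alloc_frame()/avcodec_get_frame_defaults() in favor of a stack packet plus av_frame_alloc()/av_frame_free(), and takes timestamps from the decoder-filled pkt_pts/pkt_dts fields instead of round-tripping reordered_opaque. A condensed sketch of that loop shape against the libavcodec API of this vintage (pre send/receive; simplified from the diff, not a drop-in replacement):

#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>

/* Decode the next frame of 'stream_id'; at EOF, feed empty packets to
 * drain the decoder's delayed frames. Returns 0 once a frame is out. */
static int decode_one( AVFormatContext *fmt, AVCodecContext *c,
                       AVFrame *frame, int stream_id )
{
    AVPacket pkt;
    int finished = 0, ret;
    av_init_packet( &pkt );
    pkt.data = NULL;
    pkt.size = 0;

    do
    {
        ret = av_read_frame( fmt, &pkt );
        if( ret < 0 )
        {   /* EOF: an empty packet flushes buffered frames */
            av_init_packet( &pkt );
            pkt.data = NULL;
            pkt.size = 0;
        }
        if( ret < 0 || pkt.stream_index == stream_id )
            avcodec_decode_video2( c, frame, &finished, &pkt );
        if( ret >= 0 )
            av_free_packet( &pkt );
    } while( !finished && ret >= 0 );

    return finished ? 0 : -1;
}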
View file
x264-snapshot-20140321-2245.tar.bz2/input/thread.c -> x264-snapshot-20141104-2245.tar.bz2/input/thread.c
Changed
@@ -88,7 +88,11 @@
     if( h->next_frame == i_frame )
         XCHG( cli_pic_t, *p_pic, h->pic );
     else
+    {
+        if( h->next_frame >= 0 )
+            thread_input.release_frame( &h->pic, handle );
         ret |= h->input.read_frame( p_pic, h->p_handle, i_frame );
+    }
 
     if( !h->frame_total || i_frame+1 < h->frame_total )
     {
View file
x264-snapshot-20140321-2245.tar.bz2/output/mp4_lsmash.c -> x264-snapshot-20141104-2245.tar.bz2/output/mp4_lsmash.c
Changed
@@ -79,6 +79,7 @@
     int i_dts_compress_multiplier;
     int b_use_recovery;
     int b_fragments;
+    lsmash_file_parameters_t file_param;
 } mp4_hnd_t;
 
 /*******************/
@@ -88,16 +89,10 @@
     mp4_hnd_t *p_mp4 = handle;
     if( !p_mp4 )
         return;
 
-    if( p_mp4->p_sei_buffer )
-    {
-        free( p_mp4->p_sei_buffer );
-        p_mp4->p_sei_buffer = NULL;
-    }
-    if( p_mp4->p_root )
-    {
-        lsmash_destroy_root( p_mp4->p_root );
-        p_mp4->p_root = NULL;
-    }
+    lsmash_cleanup_summary( (lsmash_summary_t *)p_mp4->summary );
+    lsmash_close_file( &p_mp4->file_param );
+    lsmash_destroy_root( p_mp4->p_root );
+    free( p_mp4->p_sei_buffer );
     free( p_mp4 );
 }
@@ -181,9 +176,13 @@
     p_mp4->b_fragments = !b_regular;
     p_mp4->b_stdout = !strcmp( psz_filename, "-" );
 
-    p_mp4->p_root = lsmash_open_movie( psz_filename, p_mp4->b_fragments ? LSMASH_FILE_MODE_WRITE_FRAGMENTED : LSMASH_FILE_MODE_WRITE );
+    p_mp4->p_root = lsmash_create_root();
     MP4_FAIL_IF_ERR_EX( !p_mp4->p_root, "failed to create root.\n" );
 
+    MP4_FAIL_IF_ERR_EX( lsmash_open_file( psz_filename, 0, &p_mp4->file_param ) < 0, "failed to open an output file.\n" );
+    if( p_mp4->b_fragments )
+        p_mp4->file_param.mode |= LSMASH_FILE_MODE_FRAGMENTED;
+
     p_mp4->summary = (lsmash_video_summary_t *)lsmash_create_summary( LSMASH_SUMMARY_TYPE_VIDEO );
     MP4_FAIL_IF_ERR_EX( !p_mp4->summary, "failed to allocate memory for summary information of video.\n" );
 
@@ -219,12 +218,17 @@
         brands[brand_count++] = ISOM_BRAND_TYPE_ISO6; /* cslg and visual random access grouping */
     }
 
+    /* Set file */
+    lsmash_file_parameters_t *file_param = &p_mp4->file_param;
+    file_param->major_brand   = brands[0];
+    file_param->brands        = brands;
+    file_param->brand_count   = brand_count;
+    file_param->minor_version = 0;
+    MP4_FAIL_IF_ERR( !lsmash_set_file( p_mp4->p_root, file_param ), "failed to add an output file into a ROOT.\n" );
+
     /* Set movie parameters. */
     lsmash_movie_parameters_t movie_param;
     lsmash_initialize_movie_parameters( &movie_param );
-    movie_param.major_brand      = ISOM_BRAND_TYPE_MP42;
-    movie_param.brands           = brands;
-    movie_param.number_of_brands = brand_count;
     MP4_FAIL_IF_ERR( lsmash_set_movie_parameters( p_mp4->p_root, &movie_param ), "failed to set movie parameters.\n" );
 
     p_mp4->i_movie_timescale = lsmash_get_movie_timescale( p_mp4->p_root );
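The mp4_lsmash.c hunks track L-SMASH's move from the one-shot lsmash_open_movie() to an explicit file abstraction: a ROOT is created first, the output file is opened into an lsmash_file_parameters_t, the brands are set on those file parameters (they used to live on the movie parameters), and the file is then attached to the ROOT with lsmash_set_file(). A minimal sketch of that open/close sequence under the post-change API shown in the diff; open_mp4() and its two-entry brand list are illustrative assumptions, not code from the patch:

/* Sketch of the new L-SMASH open/close sequence; summary, track and sample
 * handling are omitted. */
#include <lsmash.h>

static int open_mp4( const char *filename, int fragmented )
{
    lsmash_root_t *root = lsmash_create_root();
    if( !root )
        return -1;

    lsmash_file_parameters_t file_param;
    if( lsmash_open_file( filename, 0, &file_param ) < 0 ) /* 0 = open for output, as in the diff */
    {
        lsmash_destroy_root( root );
        return -1;
    }
    if( fragmented )
        file_param.mode |= LSMASH_FILE_MODE_FRAGMENTED;

    /* Brands now belong to the file parameters, not the movie parameters. */
    static lsmash_brand_type brands[] = { ISOM_BRAND_TYPE_MP42, ISOM_BRAND_TYPE_ISOM };
    file_param.major_brand   = brands[0];
    file_param.brands        = brands;
    file_param.brand_count   = sizeof(brands) / sizeof(brands[0]);
    file_param.minor_version = 0;

    if( !lsmash_set_file( root, &file_param ) ) /* attach the file to the ROOT */
    {
        lsmash_close_file( &file_param );
        lsmash_destroy_root( root );
        return -1;
    }

    /* ... set movie parameters, add tracks, write samples ... */

    lsmash_close_file( &file_param );
    lsmash_destroy_root( root );
    return 0;
}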
View file
x264-snapshot-20140321-2245.tar.bz2/tools/checkasm.c -> x264-snapshot-20141104-2245.tar.bz2/tools/checkasm.c
Changed
@@ -5,7 +5,7 @@
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -90,11 +90,11 @@
 {
     uint32_t a = 0;
 #if HAVE_X86_INLINE_ASM
-    asm volatile( "rdtsc" :"=a"(a) ::"edx" );
+    asm volatile( "rdtsc" : "=a"(a) :: "edx", "memory" );
 #elif ARCH_PPC
-    asm volatile( "mftb %0" : "=r" (a) );
+    asm volatile( "mftb %0" : "=r"(a) :: "memory" );
 #elif ARCH_ARM // ARMv7 only
-    asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) );
+    asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" );
 #endif
     return a;
 }
@@ -184,6 +184,9 @@
 #elif ARCH_ARM
     b->cpu&X264_CPU_NEON ? "neon" :
     b->cpu&X264_CPU_ARMV6 ? "armv6" :
+#elif ARCH_AARCH64
+    b->cpu&X264_CPU_NEON ? "neon" :
+    b->cpu&X264_CPU_ARMV8 ? "armv8" :
 #endif
     "c",
 #if HAVE_MMX
@@ -728,11 +731,14 @@
         fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
     }
     set_func_name( "ssim_core" );
-    call_c2( pixel_c.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
-    call_a2( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
+    call_c( pixel_c.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
+    call_a( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
     set_func_name( "ssim_end" );
     call_c2( pixel_c.ssim_end4, sums, sums, 4 );
     call_a2( pixel_asm.ssim_end4, sums, sums, 4 );
+    /* check incorrect assumptions that 32-bit ints are zero-extended to 64-bit */
+    call_c1( pixel_c.ssim_end4, sums, sums, 3 );
+    call_a1( pixel_asm.ssim_end4, sums, sums, 3 );
     report( "ssim :" );
 }
 
@@ -1097,6 +1103,7 @@
         TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, dct8[0], 8 );
         TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 4 );
         TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
+        TEST_ZIGZAG_SUB( sub_8x8, level1, level2, 64 );
         TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
         report( interlace ? "zigzag_field :" : "zigzag_frame :" );
     }
@@ -2624,8 +2631,9 @@
 {
     int ret = 0;
     int cpu0 = 0, cpu1 = 0;
+    uint32_t cpu_detect = x264_cpu_detect();
 #if HAVE_MMX
-    if( x264_cpu_detect() & X264_CPU_MMX2 )
+    if( cpu_detect & X264_CPU_MMX2 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMX2, "MMX" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" );
@@ -2634,7 +2642,7 @@
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
         cpu1 &= ~X264_CPU_CACHELINE_32;
 #endif
-        if( x264_cpu_detect() & X264_CPU_LZCNT )
+        if( cpu_detect & X264_CPU_LZCNT )
         {
             ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
             cpu1 &= ~X264_CPU_LZCNT;
@@ -2642,9 +2650,9 @@
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
         cpu1 &= ~X264_CPU_SLOW_CTZ;
     }
-    if( x264_cpu_detect() & X264_CPU_SSE )
+    if( cpu_detect & X264_CPU_SSE )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" );
-    if( x264_cpu_detect() & X264_CPU_SSE2 )
+    if( cpu_detect & X264_CPU_SSE2 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
@@ -2655,17 +2663,17 @@
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
         cpu1 &= ~X264_CPU_SLOW_CTZ;
     }
-    if( x264_cpu_detect() & X264_CPU_LZCNT )
+    if( cpu_detect & X264_CPU_LZCNT )
    {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
         cpu1 &= ~X264_CPU_LZCNT;
     }
-    if( x264_cpu_detect() & X264_CPU_SSE3 )
+    if( cpu_detect & X264_CPU_SSE3 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
         cpu1 &= ~X264_CPU_CACHELINE_64;
     }
-    if( x264_cpu_detect() & X264_CPU_SSSE3 )
+    if( cpu_detect & X264_CPU_SSSE3 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
@@ -2679,54 +2687,59 @@
         cpu1 &= ~X264_CPU_CACHELINE_64;
         cpu1 &= ~X264_CPU_SLOW_ATOM;
     }
-    if( x264_cpu_detect() & X264_CPU_SSE4 )
+    if( cpu_detect & X264_CPU_SSE4 )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
-    if( x264_cpu_detect() & X264_CPU_AVX )
+    if( cpu_detect & X264_CPU_AVX )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
-    if( x264_cpu_detect() & X264_CPU_XOP )
+    if( cpu_detect & X264_CPU_XOP )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
-    if( x264_cpu_detect() & X264_CPU_FMA4 )
+    if( cpu_detect & X264_CPU_FMA4 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
         cpu1 &= ~X264_CPU_FMA4;
     }
-    if( x264_cpu_detect() & X264_CPU_BMI1 )
+    if( cpu_detect & X264_CPU_BMI1 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
         cpu1 &= ~X264_CPU_BMI1;
     }
-    if( x264_cpu_detect() & X264_CPU_AVX2 )
+    if( cpu_detect & X264_CPU_AVX2 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
-        if( x264_cpu_detect() & X264_CPU_LZCNT )
+        if( cpu_detect & X264_CPU_LZCNT )
         {
             ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" );
             cpu1 &= ~X264_CPU_LZCNT;
         }
     }
-    if( x264_cpu_detect() & X264_CPU_BMI2 )
+    if( cpu_detect & X264_CPU_BMI2 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );
         cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2);
     }
-    if( x264_cpu_detect() & X264_CPU_FMA3 )
+    if( cpu_detect & X264_CPU_FMA3 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
         cpu1 &= ~X264_CPU_FMA3;
     }
 #elif ARCH_PPC
-    if( x264_cpu_detect() & X264_CPU_ALTIVEC )
+    if( cpu_detect & X264_CPU_ALTIVEC )
     {
         fprintf( stderr, "x264: ALTIVEC against C\n" );
         ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
     }
 #elif ARCH_ARM
-    if( x264_cpu_detect() & X264_CPU_ARMV6 )
+    if( cpu_detect & X264_CPU_ARMV6 )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
-    if( x264_cpu_detect() & X264_CPU_NEON )
+    if( cpu_detect & X264_CPU_NEON )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
-    if( x264_cpu_detect() & X264_CPU_FAST_NEON_MRC )
+    if( cpu_detect & X264_CPU_FAST_NEON_MRC )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
+#elif ARCH_AARCH64
+    if( cpu_detect & X264_CPU_ARMV8 )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV8, "ARMv8" );
+    if( cpu_detect & X264_CPU_NEON )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
 #endif
     return ret;
 }
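Two quiet but important checkasm.c fixes sit alongside the AArch64 additions: the cycle-counter asm statements gain a "memory" clobber, which forbids the compiler from moving loads and stores across the counter read (otherwise work could leak out of, or into, the timed region), and the repeated x264_cpu_detect() calls in the flag-testing function are hoisted into a single cpu_detect local. The counter read, extracted here as a standalone sketch with the platform guards simplified relative to the original:

/* x86 cycle-counter read with a compiler barrier, as adopted above. */
#include <stdint.h>

static inline uint32_t read_time( void )
{
    uint32_t a = 0;
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    /* "=a" pins the low 32 bits of the TSC to EAX; RDTSC also writes EDX,
     * hence that clobber. "memory" makes the statement a compiler-level
     * barrier: no memory access may be reordered across it. */
    asm volatile( "rdtsc" : "=a"(a) :: "edx", "memory" );
#endif
    return a;
}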
View file
x264-snapshot-20141104-2245.tar.bz2/tools/cltostr.sh
Added
@@ -0,0 +1,32 @@
+#!/bin/sh
+# Convert standard input to a C char array, write to a file, then create an
+# MD5 sum of that file and append said MD5 sum as char array to the file.
+
+FILE=$1
+
+# Filter out whitespace, empty lines, and comments.
+sanitize() {
+    sed 's/^[[:space:]]*//; /^$/d; /^\/\//d'
+}
+
+# Convert stdin to a \0-terminated char array.
+dump() {
+    printf 'static const char %s[] = {\n' $1
+    od -v -A n -t x1 | sed 's/[[:space:]]*\([[:alnum:]]\{2\}\)/0x\1, /g'
+    printf '0x00 };\n'
+}
+
+# Print MD5 hash w/o newline character to not embed the character in the array.
+hash() {
+    # md5sum is not standard, so try different platform-specific alternatives.
+    { md5sum $1 2> /dev/null || md5 -q $1 || digest -a md5 $1; } |
+        cut -b -32 | tr -d '\n\r'
+}
+
+trap "rm -f $FILE.temp" EXIT
+
+sanitize | tee $FILE.temp |
+    dump x264_opencl_source > $FILE
+
+hash $FILE.temp |
+    dump x264_opencl_source_hash >> $FILE
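cltostr.sh is the helper that embeds x264's OpenCL kernel source into the binary as a C array; the MD5 of the sanitized source is appended as a second array, presumably so the runtime can tell when a previously compiled, cached kernel no longer matches the embedded source. The generated header has roughly this shape (all byte values below are invented for illustration):

/* Illustrative output of cltostr.sh; real contents depend on the input. */
static const char x264_opencl_source[] = {
0x2f, 0x2a, 0x20, 0x63, 0x6f, 0x70, /* ... one 0xNN entry per input byte ... */
0x00 };
static const char x264_opencl_source_hash[] = {
0x64, 0x34, 0x31, 0x64, /* ... 32 ASCII hex digits of the MD5 ... */
0x00 };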
View file
x264-snapshot-20141104-2245.tar.bz2/tools/msvsdepend.sh
Added
@@ -0,0 +1,21 @@
+#!/bin/sh
+# There's a lot of things going on here
+# expected arguments are $(CC) $(CFLAGS) $(SRC) $(OBJ)
+# 1) start the dependency line with the object argument
+# 2) need to add -Zs -showIncludes to the flags to have the compiler output list of include files without compilation
+# 3) look for notes in the output that start with "Note: including file:"
+# 4) retain only the filepath from the notes
+# 5) convert \ foldername separators to /
+# 6) escape spaces in the filepath
+# 7) remove system includes (hack: check for "/Program Files" string in filepath)
+# 8) sort and remove duplicate filepath entries
+# 9) convert newlines to spaces to collapse the dependencies into the one dependency line
+# 10) print a newline character, to properly separate dependency lines
+echo -n "$4: "
+$1 $2 $3 -Zs -showIncludes 2>&1 |
+    grep '^Note: including file:' |
+    sed 's/^Note: including file:[[:space:]]*\(.*\)$/\1/; s/\\/\//g; s/ /\\ /g' |
+    sed '/\/[Pp]rogram\\ [Ff]iles/d' |
+    sort | uniq |
+    tr -s '\n\r' ' '
+echo ''
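msvsdepend.sh gives Makefile-driven MSVC builds the header dependency tracking that gcc provides via -MMD: cl.exe is invoked with -Zs (syntax check only, no code generation) and -showIncludes, and its "Note: including file:" lines are folded into a single make-style dependency line, e.g. something like "x264.obj: x264.c x264.h common/common.h" (paths illustrative). The 2>&1 merge captures the notes regardless of which stream the compiler prints them on, and the "/Program Files" filter is an admitted hack for dropping system headers.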
View file
x264-snapshot-20140321-2245.tar.bz2/x264.c -> x264-snapshot-20141104-2245.tar.bz2/x264.c
Changed
@@ -6,7 +6,7 @@
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *          Steven Walters <kemuri9@gmail.com>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Kieran Kunhya <kieran@kunhya.com>
  *          Henrik Gramner <henrik@gramner.com>
  *
@@ -320,6 +320,8 @@
         printf( "intel: %.2f (%d)\n", __INTEL_COMPILER / 100.f, __INTEL_COMPILER_BUILD_DATE );
 #elif defined(__GNUC__)
         printf( "gcc: " __VERSION__ "\n" );
+#elif defined(_MSC_FULL_VER)
+        printf( "msvc: %.2f (%u)\n", _MSC_VER / 100.f, _MSC_FULL_VER );
 #else
         printf( "using an unknown compiler\n" );
 #endif
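With this branch, MSVC-built binaries identify their compiler in the version output instead of falling through to "using an unknown compiler": _MSC_VER is a scaled integer (1800 for Visual Studio 2013, for example), so _MSC_VER / 100.f renders as 18.00, with the full _MSC_FULL_VER build number in parentheses.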
View file
x264-snapshot-20140321-2245.tar.bz2/x264.h -> x264-snapshot-20141104-2245.tar.bz2/x264.h
Changed
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -28,8 +28,8 @@
 #ifndef X264_X264_H
 #define X264_X264_H
 
-#if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) &&\
-    !defined(_INTTYPES_H) && !defined(_INTTYPES_H_)
+#if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) && !defined(_STDINT) &&\
+    !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) && !defined(_INTTYPES)
 # ifdef _MSC_VER
 #  pragma message("You must include stdint.h or inttypes.h before x264.h")
 # else
@@ -152,10 +152,11 @@
 /* PowerPC */
 #define X264_CPU_ALTIVEC         0x0000001
 
-/* ARM */
+/* ARM and AArch64 */
 #define X264_CPU_ARMV6           0x0000001
 #define X264_CPU_NEON            0x0000002  /* ARM NEON */
 #define X264_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+#define X264_CPU_ARMV8           0x0000008
 
 /* Analyse flags */
 #define X264_ANALYSE_I4x4        0x0001  /* Analyse i4x4 */
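The loosened guard check accounts for MSVC, whose stdint.h and inttypes.h use _STDINT and _INTTYPES as include guards rather than the glibc-style _STDINT_H names, so the "You must include stdint.h" diagnostic no longer fires spuriously there. A trivial consumer that satisfies the check on any supported compiler (x264_param_default is part of the public API; the rest is boilerplate):

/* Include stdint.h (or inttypes.h) before x264.h, as the header demands. */
#include <stdint.h>
#include <x264.h>

int main( void )
{
    x264_param_t param;
    x264_param_default( &param );
    return 0;
}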