libx264: Changes of Revision 8

Note: the diff of some files was truncated because they were too big.
libx264.changes
Changed
@@ -1,4 +1,9 @@
 -------------------------------------------------------------------
+Wed Nov 5 12:33:30 UTC 2014 - i@margueirte.su
+
+- update version 20141104
+
+-------------------------------------------------------------------
 Sat Mar 22 17:10:14 UTC 2014 - i@margueirte.su

 - update version 20140321
libx264.spec
Changed
@@ -1,6 +1,7 @@
-# vim: set ts=4 sw=4 et:
-# Copyright (c) 2012 Pascal Bleser <pascal.bleser@opensuse.org>
-# COpyright (c) 2013 Marguerite Su <marguerite@opensuse.org>
+#
+# spec file for package libx264
+#
+# Copyright (c) 2014 SUSE LINUX Products GmbH, Nuernberg, Germany.
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -11,19 +12,21 @@
 # license that conforms to the Open Source Definition (Version 1.9)
 # published by the Open Source Initiative.

-# Please submit bugfixes or comments via http://bugs.links2linux.org/
+# Please submit bugfixes or comments via http://bugs.opensuse.org/
+#
+
-Name: libx264
 %define soname 142
-%define svn 20140321
+%define svn 20141104
+Name: libx264
 Version: 0.%{soname}svn%{svn}
-Release: 1
-License: GPL-2.0+
+Release: 0
 Summary: A free h264/avc encoder - encoder binary
-Url: http://developers.videolan.org/x264.html
+License: GPL-2.0+
 Group: Productivity/Multimedia/Video/Editors and Convertors
-Source: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2
-Patch: x264-use-shared-library.patch
+Url: http://developers.videolan.org/x264.html
+Source: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2
+Patch0: x264-use-shared-library.patch
 BuildRequires: nasm
 BuildRequires: pkg-config
 BuildRequires: yasm >= 1.2.0
@@ -92,7 +95,7 @@
 %prep
 %setup -q -n x264-snapshot-%{svn}-2245
-%patch -p1
+%patch0 -p1

 FAKE_BUILDDATE=$(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y')
 sed -i "s/__DATE__/\"$FAKE_BUILDDATE\"/" x264.c
@@ -108,7 +111,7 @@
 make %{?_smp_mflags}

 %install
-%makeinstall
+make DESTDIR=%{buildroot} install %{?_smp_mflags}

 rm -f %{buildroot}%{_libdir}/%{name}.so
 rm -f %{buildroot}%{_libdir}/%{name}.a
@@ -119,6 +122,7 @@
 echo "%{name}-%{soname}" > %{_sourcedir}/baselibs.conf

 %post -n %{name}-%{soname} -p /sbin/ldconfig
+
 %postun -n %{name}-%{soname} -p /sbin/ldconfig

 %files %{soname}
x264-snapshot-20140321-2245.tar.bz2/common/sparc
Deleted
-(directory)
x264-snapshot-20140321-2245.tar.bz2/common/sparc/pixel.asm
Deleted
@@ -1,1089 +0,0 @@
-/*****************************************************************************
- * pixel.asm: sparc pixel metrics
- *****************************************************************************
- * Copyright (C) 2005-2014 x264 project
- *
- * Authors: Phil Jensen <philj@csufresno.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-! VIS optimized SAD for UltraSPARC
-
-.text
-.global x264_pixel_sad_8x8_vis
-x264_pixel_sad_8x8_vis:
-    save %sp, -120, %sp
-
-    fzero %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    std %f12, [%fp-24]
-    ld [%fp-20], %i0
-
-    ret
-    restore
-
-.global x264_pixel_sad_8x16_vis
-x264_pixel_sad_8x16_vis:
-    save %sp, -120, %sp
-
-    fzero %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
-
-    alignaddr %i0, %g0, %l0
-    ldd [%l0], %f0
-    ldd [%l0+8], %f2
-    faligndata %f0, %f2, %f4
-
-    alignaddr %i2, %g0, %l2
-    ldd [%l2], %f6
-    ldd [%l2+8], %f8
-    faligndata %f6, %f8, %f10
-
-    add %i0, %i1, %i0
-    add %i2, %i3, %i2
-    pdist %f4, %f10, %f12
x264-snapshot-20140321-2245.tar.bz2/common/sparc/pixel.h
Deleted
@@ -1,34 +0,0 @@
-/*****************************************************************************
- * pixel.h: sparc pixel metrics
- *****************************************************************************
- * Copyright (C) 2005-2014 x264 project
- *
- * Authors: Phil Jensen <philj@csufresno.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_SPARC_PIXEL_H
-#define X264_SPARC_PIXEL_H
-
-int x264_pixel_sad_8x8_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t );
-int x264_pixel_sad_8x16_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t );
-int x264_pixel_sad_16x8_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t );
-int x264_pixel_sad_16x16_vis( uint8_t *, intptr_t, uint8_t *, intptr_t );
-
-#endif
x264-snapshot-20140321-2245.tar.bz2/tools/cltostr.pl
Deleted
@@ -1,65 +0,0 @@
-# Perl script used for compiling OpenCL src into x264 binary
-#
-# Copyright (C) 2013-2014 x264 project
-# Authors: Steve Borho <sborho@multicorewareinc.com>
-
-use Digest::MD5 qw(md5_hex);
-
-# xxd takes a VAR, which will be the variable name
-# and BYTES, a string of bytes to be encoded.
-sub xxd
-{
-    my %args = @_;
-    my $var = $args{VAR};
-    my $bytes = $args{BYTES};
-    my @hexbytes;
-    my @bytes = split //, $$bytes;
-    foreach $b (@bytes)
-    {
-        push @hexbytes, sprintf("0x%02X", ord($b));
-    }
-
-    # Format 'em nice and pretty-like.
-    print 'static const char ' . $var . '[] = {' . "\n";
-    my $count = 0;
-    foreach my $h (@hexbytes)
-    {
-        print "$h, ";
-        $count++;
-        if ($count == 16)
-        {
-            print "\n";
-            $count = 0;
-        }
-    }
-    print "\n0x00 };\n\n";
-
-    return;
-}
-
-if (@ARGV < 1)
-{
-    printf "%s: VARNAME ", $0 . "\n";
-    exit(-1);
-}
-
-
-my @lines;
-while(<STDIN>)
-{
-    s/^\s+//; # trim leading whitespace
-    if (/^\/\//)
-    {
-        next; # skip the line if it starts with '//'
-    }
-    push @lines, $_;
-}
-
-my $lines = join '', @lines;
-xxd(VAR => @ARGV[0], BYTES => \$lines);
-
-my $hash = md5_hex($lines);
-@hash = ( $hash =~ m/../g );
-
-
-xxd(VAR => @ARGV[0] . "_hash", BYTES => \$hash);
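For reference, the xxd() routine in the deleted script above emits a NUL-terminated C byte array (plus a companion <var>_hash array holding the MD5 digest bytes). A minimal sketch of its output for a hypothetical input string "kernel" passed with VARNAME x264_opencl_source would look roughly like this:

    /* Sketch of cltostr.pl output for the hypothetical input "kernel":
     * each input byte becomes a hex literal ('k' = 0x6B, 'e' = 0x65, ...),
     * 16 values per row, with a terminating 0x00. */
    static const char x264_opencl_source[] = {
    0x6B, 0x65, 0x72, 0x6E, 0x65, 0x6C,
    0x00 };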
x264-snapshot-20140321-2245.tar.bz2/.gitignore -> x264-snapshot-20141104-2245.tar.bz2/.gitignore
Changed
@@ -39,6 +39,8 @@
 *.mbtree
 *.temp
 *.pyc
+*.pgd
+*.pgc
 .digress_x264
 dataDec.txt
x264-snapshot-20140321-2245.tar.bz2/AUTHORS -> x264-snapshot-20141104-2245.tar.bz2/AUTHORS
Changed
@@ -33,6 +33,14 @@
 D: BeOS and MacOS X ports.
 S: France

+N: Fiona Glaser
+E: fiona AT x264 DOT com
+D: Maintainer
+D: All areas of encoder analysis and algorithms
+D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
+D: x86 asm
+S: USA
+
 N: Gabriel Bouvigne
 E: bouvigne AT mp3-tech DOT org
 D: 2pass VBV
@@ -47,31 +55,25 @@
 D: 4:2:2 chroma subsampling, x86 asm, Windows improvements, bugfixes
 S: Sweden

-N: Jason Garrett-Glaser
-E: darkshikari AT gmail DOT com
-D: x86 asm, 1pass VBV, adaptive quantization, inline asm
-D: various speed optimizations, bugfixes
-S: USA
-
 N: Laurent Aimar
-E: fenrir AT via.ecp DOT fr
+E: fenrir AT videolan DOT org
 C: fenrir
 D: Intial import, former maintainer
 D: x86 asm (mmx/mmx2)
 S: France

 N: Loren Merritt
-E: lorenm AT u.washington DOT edu
+E: pengvado AT akuvian DOT org
 C: pengvado
-D: maintainer
+D: Maintainer
 D: All areas of encoder analysis and algorithms
-D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc.
+D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
 D: Multithreading
-D: x86 and x86_64 asm (mmx/mmx2/sse2)
+D: x86 asm
 S: USA

 N: Mans Rullgard
-E: mru AT inprovide DOT com
+E: mru AT mansr DOT com
 C: mru
 D: Rate control
 S: Southampton, UK
@@ -91,10 +93,6 @@
 D: gcc asm to nasm conversion
 S: China

-N: Phil Jensen
-E: philj AT csufresno DOT edu
-D: SPARC asm
-
 N: Radek Czyz
 E: radoslaw AT syskin DOT cjb DOT net
 D: Cached motion compensation
x264-snapshot-20140321-2245.tar.bz2/Makefile -> x264-snapshot-20141104-2245.tar.bz2/Makefile
Changed
@@ -88,17 +88,14 @@
 ifeq ($(ARCH),X86)
 ARCH_X86 = yes
 ASMSRC = $(X86SRC) common/x86/pixel-32.asm
-ASFLAGS += -DARCH_X86_64=0
 endif

 ifeq ($(ARCH),X86_64)
 ARCH_X86 = yes
 ASMSRC = $(X86SRC:-32.asm=-64.asm) common/x86/trellis-64.asm
-ASFLAGS += -DARCH_X86_64=1
 endif

 ifdef ARCH_X86
-ASFLAGS += -I$(SRCPATH)/common/x86/
 SRCS += common/x86/mc-c.c common/x86/predict-c.c
 OBJASM = $(ASMSRC:%.asm=%.o)
 $(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm
@@ -126,11 +123,18 @@
 endif
 endif

-# VIS optims
-ifeq ($(ARCH),UltraSPARC)
-ifeq ($(findstring HIGH_BIT_DEPTH, $(CONFIG)),)
-ASMSRC += common/sparc/pixel.asm
-OBJASM = $(ASMSRC:%.asm=%.o)
+# AArch64 NEON optims
+ifeq ($(ARCH),AARCH64)
+ifneq ($(AS),)
+ASMSRC += common/aarch64/dct-a.S \
+          common/aarch64/deblock-a.S \
+          common/aarch64/mc-a.S \
+          common/aarch64/pixel-a.S \
+          common/aarch64/predict-a.S \
+          common/aarch64/quant-a.S
+SRCS += common/aarch64/mc-c.c \
+        common/aarch64/predict-c.c
+OBJASM = $(ASMSRC:%.S=%.o)
 endif
 endif

@@ -148,7 +152,7 @@

 ifeq ($(HAVE_OPENCL),yes)
 common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
-	cat $^ | perl $(SRCPATH)/tools/cltostr.pl x264_opencl_source > $@
+	cat $^ | $(SRCPATH)/tools/cltostr.sh $@
 GENERATED += common/oclobj.h
 SRCS += common/opencl.c encoder/slicetype-cl.c
 endif

@@ -157,7 +161,7 @@
 OBJCLI += $(SRCCLI:%.c=%.o)
 OBJSO += $(SRCSO:%.c=%.o)

-.PHONY: all default fprofiled clean distclean install uninstall lib-static lib-shared cli install-lib-dev install-lib-static install-lib-shared install-cli
+.PHONY: all default fprofiled clean distclean install install-* uninstall cli lib-* etags

 cli: x264$(EXE)
 lib-static: $(LIBX264)

@@ -185,7 +189,7 @@

 $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend

-%.o: %.asm
+%.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
	$(AS) $(ASFLAGS) -o $@ $<
	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile

@@ -201,7 +205,12 @@

 .depend: config.mak
	@rm -f .depend
+	@echo 'dependency file generation...'
+ifeq ($(COMPILER),CL)
+	@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(SRCPATH)/tools/msvsdepend.sh "$(CC)" "$(CFLAGS)" "$(SRC)" "$(SRC:$(SRCPATH)/%.c=%.o)" 1>> .depend;)
+else
	@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
+endif

 config.mak:
	./configure

@@ -232,15 +241,20 @@
	$(MAKE) clean
	$(MAKE) x264$(EXE) CFLAGS="$(CFLAGS) $(PROF_GEN_CC)" LDFLAGS="$(LDFLAGS) $(PROF_GEN_LD)"
	$(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;))
+ifeq ($(COMPILER),CL)
+# Because Visual Studio timestamps the object files within the PGD, it fails to build if they change - only the executable should be deleted
+	rm -f x264$(EXE)
+else
	rm -f $(SRC2:%.c=%.o)
+endif
	$(MAKE) CFLAGS="$(CFLAGS) $(PROF_USE_CC)" LDFLAGS="$(LDFLAGS) $(PROF_USE_LD)"
-	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
+	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
 endif

 clean:
	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
	rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
-	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
+	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc

 distclean: clean
	rm -f config.mak x264_config.h config.h config.log x264.pc x264.def
x264-snapshot-20141104-2245.tar.bz2/common/aarch64
Added
+(directory)
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/asm.S
Added
@@ -0,0 +1,221 @@
+/*****************************************************************************
+ * asm.S: AArch64 utility macros
+ *****************************************************************************
+ * Copyright (C) 2008-2014 x264 project
+ *
+ * Authors: Mans Rullgard <mans@mansr.com>
+ *          David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "config.h"
+
+#ifdef PREFIX
+# define EXTERN_ASM _
+#else
+# define EXTERN_ASM
+#endif
+
+#ifdef __ELF__
+# define ELF
+#else
+# define ELF #
+#endif
+
+#ifdef __MACH__
+# define MACH
+#else
+# define MACH #
+#endif
+
+#if HAVE_AS_FUNC
+# define FUNC
+#else
+# define FUNC #
+#endif
+
+.macro function name, export=0, align=2
+    .macro endfunc
+ELF .size \name, . - \name
+FUNC .endfunc
+    .purgem endfunc
+    .endm
+    .text
+    .align \align
+    .if \export
+    .global EXTERN_ASM\name
+ELF .type EXTERN_ASM\name, %function
+FUNC .func EXTERN_ASM\name
+EXTERN_ASM\name:
+    .else
+ELF .type \name, %function
+FUNC .func \name
+\name:
+    .endif
+.endm
+
+.macro const name, align=2
+    .macro endconst
+ELF .size \name, . - \name
+    .purgem endconst
+    .endm
+ELF .section .rodata
+MACH .const_data
+    .align \align
+\name:
+.endm
+
+.macro movrel rd, val
+#if defined(PIC) && defined(__APPLE__)
+    adrp \rd, \val@PAGE
+    add \rd, \rd, \val@PAGEOFF
+#elif defined(PIC)
+    adrp \rd, \val
+    add \rd, \rd, :lo12:\val
+#else
+    ldr \rd, =\val
+#endif
+.endm
+
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
+
+#define FDEC_STRIDE 32
+#define FENC_STRIDE 16
+
+
+.macro SUMSUB_AB sum, sub, a, b
+    add \sum, \a, \b
+    sub \sub, \a, \b
+.endm
+
+.macro unzip t1, t2, s1, s2
+    uzp1 \t1, \s1, \s2
+    uzp2 \t2, \s1, \s2
+.endm
+
+.macro transpose t1, t2, s1, s2
+    trn1 \t1, \s1, \s2
+    trn2 \t2, \s1, \s2
+.endm
+
+.macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
+    transpose \t0\().2s, \t2\().2s, \v0\().2s, \v2\().2s
+    transpose \t1\().2s, \t3\().2s, \v1\().2s, \v3\().2s
+    transpose \v0\().4h, \v1\().4h, \t0\().4h, \t1\().4h
+    transpose \v2\().4h, \v3\().4h, \t2\().4h, \t3\().4h
+.endm
+
+.macro transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
+    transpose \t0\().4s, \t2\().4s, \v0\().4s, \v2\().4s
+    transpose \t1\().4s, \t3\().4s, \v1\().4s, \v3\().4s
+    transpose \v0\().8h, \v1\().8h, \t0\().8h, \t1\().8h
+    transpose \v2\().8h, \v3\().8h, \t2\().8h, \t3\().8h
+.endm
+
+
+.macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+    trn1 \r8\().8H, \r0\().8H, \r1\().8H
+    trn2 \r9\().8H, \r0\().8H, \r1\().8H
+    trn1 \r1\().8H, \r2\().8H, \r3\().8H
+    trn2 \r3\().8H, \r2\().8H, \r3\().8H
+    trn1 \r0\().8H, \r4\().8H, \r5\().8H
+    trn2 \r5\().8H, \r4\().8H, \r5\().8H
+    trn1 \r2\().8H, \r6\().8H, \r7\().8H
+    trn2 \r7\().8H, \r6\().8H, \r7\().8H
+
+    trn1 \r4\().4S, \r0\().4S, \r2\().4S
+    trn2 \r2\().4S, \r0\().4S, \r2\().4S
+    trn1 \r6\().4S, \r5\().4S, \r7\().4S
+    trn2 \r7\().4S, \r5\().4S, \r7\().4S
+    trn1 \r5\().4S, \r9\().4S, \r3\().4S
+    trn2 \r9\().4S, \r9\().4S, \r3\().4S
+    trn1 \r3\().4S, \r8\().4S, \r1\().4S
+    trn2 \r8\().4S, \r8\().4S, \r1\().4S
+
+    trn1 \r0\().2D, \r3\().2D, \r4\().2D
+    trn2 \r4\().2D, \r3\().2D, \r4\().2D
+
+    trn1 \r1\().2D, \r5\().2D, \r6\().2D
+    trn2 \r5\().2D, \r5\().2D, \r6\().2D
+
+    trn2 \r6\().2D, \r8\().2D, \r2\().2D
+    trn1 \r2\().2D, \r8\().2D, \r2\().2D
+
+    trn1 \r3\().2D, \r9\().2D, \r7\().2D
+    trn2 \r7\().2D, \r9\().2D, \r7\().2D
+.endm
+
+.macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+    trn1 \t0\().16b, \r0\().16b, \r1\().16b
+    trn2 \t1\().16b, \r0\().16b, \r1\().16b
+    trn1 \r1\().16b, \r2\().16b, \r3\().16b
+    trn2 \r3\().16b, \r2\().16b, \r3\().16b
+    trn1 \r0\().16b, \r4\().16b, \r5\().16b
+    trn2 \r5\().16b, \r4\().16b, \r5\().16b
+    trn1 \r2\().16b, \r6\().16b, \r7\().16b
+    trn2 \r7\().16b, \r6\().16b, \r7\().16b
+
+    trn1 \r4\().8h, \r0\().8h, \r2\().8h
+    trn2 \r2\().8h, \r0\().8h, \r2\().8h
+    trn1 \r6\().8h, \r5\().8h, \r7\().8h
+    trn2 \r7\().8h, \r5\().8h, \r7\().8h
+    trn1 \r5\().8h, \t1\().8h, \r3\().8h
+    trn2 \t1\().8h, \t1\().8h, \r3\().8h
+    trn1 \r3\().8h, \t0\().8h, \r1\().8h
+    trn2 \t0\().8h, \t0\().8h, \r1\().8h
+
+    trn1 \r0\().4s, \r3\().4s, \r4\().4s
+    trn2 \r4\().4s, \r3\().4s, \r4\().4s
+
+    trn1 \r1\().4s, \r5\().4s, \r6\().4s
+    trn2 \r5\().4s, \r5\().4s, \r6\().4s
+
+    trn2 \r6\().4s, \t0\().4s, \r2\().4s
+    trn1 \r2\().4s, \t0\().4s, \r2\().4s
+
+    trn1 \r3\().4s, \t1\().4s, \r7\().4s
+    trn2 \r7\().4s, \t1\().4s, \r7\().4s
+.endm
+
+.macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/dct-a.S
Added
@@ -0,0 +1,666 @@
+/*****************************************************************************
+ * dct-a.S: AArch64 transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+const scan4x4_frame, align=4
+.byte 0,1, 8,9, 2,3, 4,5
+.byte 10,11, 16,17, 24,25, 18,19
+.byte 12,13, 6,7, 14,15, 20,21
+.byte 26,27, 28,29, 22,23, 30,31
+endconst
+
+// sum = a + (b>>shift)   sub = (a>>shift) - b
+.macro SUMSUB_SHR shift sum sub a b t0 t1
+    sshr \t0, \b, #\shift
+    sshr \t1, \a, #\shift
+    add \sum, \a, \t0
+    sub \sub, \t1, \b
+.endm
+
+// sum = (a>>shift) + b   sub = a - (b>>shift)
+.macro SUMSUB_SHR2 shift sum sub a b t0 t1
+    sshr \t0, \a, #\shift
+    sshr \t1, \b, #\shift
+    add \sum, \t0, \b
+    sub \sub, \a, \t1
+.endm
+
+// a += 1.5*ma   b -= 1.5*mb
+.macro SUMSUB_15 a b ma mb t0 t1
+    sshr \t0, \ma, #1
+    sshr \t1, \mb, #1
+    add \t0, \t0, \ma
+    add \t1, \t1, \mb
+    add \a, \a, \t0
+    sub \b, \b, \t1
+.endm
+
+
+function x264_dct4x4dc_neon, export=1
+    ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
+    movi v31.4h, #1
+    SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
+    SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
+    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
+    SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
+    transpose v4.4h, v6.4h, v0.4h, v2.4h
+    transpose v5.4h, v7.4h, v1.4h, v3.4h
+    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
+    SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
+    transpose v4.2s, v5.2s, v0.2s, v1.2s
+    transpose v6.2s, v7.2s, v2.2s, v3.2s
+    add v16.4h, v4.4h, v31.4h
+    add v17.4h, v6.4h, v31.4h
+    srhadd v0.4h, v4.4h, v5.4h
+    shsub v1.4h, v16.4h, v5.4h
+    shsub v2.4h, v17.4h, v7.4h
+    srhadd v3.4h, v6.4h, v7.4h
+    st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
+    ret
+endfunc
+
+function x264_idct4x4dc_neon, export=1
+    ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
+    SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
+    SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
+    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
+    SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
+    transpose v4.4h, v6.4h, v0.4h, v2.4h
+    transpose v5.4h, v7.4h, v1.4h, v3.4h
+    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
+    SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
+    transpose v4.2s, v5.2s, v0.2s, v1.2s
+    transpose v6.2s, v7.2s, v2.2s, v3.2s
+    SUMSUB_AB v0.4h, v1.4h, v4.4h, v5.4h
+    SUMSUB_AB v3.4h, v2.4h, v6.4h, v7.4h
+    st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
+    ret
+endfunc
+
+.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
+    SUMSUB_AB \v1, \v6, \v5, \v6
+    SUMSUB_AB \v3, \v7, \v4, \v7
+    add \v0, \v3, \v1
+    add \v4, \v7, \v7
+    add \v5, \v6, \v6
+    sub \v2, \v3, \v1
+    add \v1, \v4, \v6
+    sub \v3, \v7, \v5
+.endm
+
+function x264_sub4x4_dct_neon, export=1
+    mov x3, #FENC_STRIDE
+    mov x4, #FDEC_STRIDE
+    ld1 {v0.s}[0], [x1], x3
+    ld1 {v1.s}[0], [x2], x4
+    ld1 {v2.s}[0], [x1], x3
+    usubl v16.8h, v0.8b, v1.8b
+    ld1 {v3.s}[0], [x2], x4
+    ld1 {v4.s}[0], [x1], x3
+    usubl v17.8h, v2.8b, v3.8b
+    ld1 {v5.s}[0], [x2], x4
+    ld1 {v6.s}[0], [x1], x3
+    usubl v18.8h, v4.8b, v5.8b
+    ld1 {v7.s}[0], [x2], x4
+    usubl v19.8h, v6.8b, v7.8b
+
+    DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
+    transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
+    DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
+    st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
+    ret
+endfunc
+
+function x264_sub8x4_dct_neon
+    ld1 {v0.8b}, [x1], x3
+    ld1 {v1.8b}, [x2], x4
+    usubl v16.8h, v0.8b, v1.8b
+    ld1 {v2.8b}, [x1], x3
+    ld1 {v3.8b}, [x2], x4
+    usubl v17.8h, v2.8b, v3.8b
+    ld1 {v4.8b}, [x1], x3
+    ld1 {v5.8b}, [x2], x4
+    usubl v18.8h, v4.8b, v5.8b
+    ld1 {v6.8b}, [x1], x3
+    ld1 {v7.8b}, [x2], x4
+    usubl v19.8h, v6.8b, v7.8b
+
+    DCT_1D v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
+    transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
+
+    SUMSUB_AB v16.8h, v19.8h, v0.8h, v3.8h
+    SUMSUB_AB v17.8h, v18.8h, v1.8h, v2.8h
+    add v22.8h, v19.8h, v19.8h
+    add v21.8h, v18.8h, v18.8h
+    add v0.8h, v16.8h, v17.8h
+    sub v1.8h, v16.8h, v17.8h
+
+    add v2.8h, v22.8h, v18.8h
+    sub v3.8h, v19.8h, v21.8h
+
+    zip1 v4.2d, v0.2d, v2.2d
+    zip2 v6.2d, v0.2d, v2.2d
+    zip1 v5.2d, v1.2d, v3.2d
+    zip2 v7.2d, v1.2d, v3.2d
+
+    st1 {v4.8h}, [x0], #16
+    st1 {v5.8h}, [x0], #16
+    st1 {v6.8h}, [x0], #16
+    st1 {v7.8h}, [x0], #16
+    ret
+endfunc
+
+function x264_sub8x8_dct_neon, export=1
+    mov x5, x30
+    mov x3, #FENC_STRIDE
+    mov x4, #FDEC_STRIDE
+    bl x264_sub8x4_dct_neon
+    mov x30, x5
+    b x264_sub8x4_dct_neon
+endfunc
+
+function x264_sub16x16_dct_neon, export=1
+    mov x5, x30
+    mov x3, #FENC_STRIDE
+    mov x4, #FDEC_STRIDE
+    bl x264_sub8x4_dct_neon
+    bl x264_sub8x4_dct_neon
+    sub x1, x1, #8*FENC_STRIDE-8
+    sub x2, x2, #8*FDEC_STRIDE-8
+    bl x264_sub8x4_dct_neon
+    bl x264_sub8x4_dct_neon
+    sub x1, x1, #8
+    sub x2, x2, #8
+    bl x264_sub8x4_dct_neon
+    bl x264_sub8x4_dct_neon
+    sub x1, x1, #8*FENC_STRIDE-8
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/dct.h
Added
@@ -0,0 +1,52 @@
+/*****************************************************************************
+ * dct.h: AArch64 transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_DCT_H
+#define X264_AARCH64_DCT_H
+
+void x264_dct4x4dc_neon( int16_t d[16] );
+void x264_idct4x4dc_neon( int16_t d[16] );
+
+void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
+void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
+void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
+
+void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
+void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
+void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
+void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
+
+void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
+
+#endif
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/deblock-a.S
Added
@@ -0,0 +1,392 @@
+/*****************************************************************************
+ * deblock.S: aarch64 deblocking
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: Mans Rullgard <mans@mansr.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.macro h264_loop_filter_start
+    cmp w2, #0
+    ldr w6, [x4]
+    ccmp w3, #0, #0, ne
+    mov v24.s[0], w6
+    and w6, w6, w6, lsl #16
+    b.eq 1f
+    ands w6, w6, w6, lsl #8
+    b.ge 2f
+1:
+    ret
+2:
+.endm
+
+.macro h264_loop_filter_luma
+    dup v22.16b, w2 // alpha
+    uxtl v24.8h, v24.8b
+    uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0)
+    uxtl v24.4s, v24.4h
+    uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
+    sli v24.8h, v24.8h, #8
+    uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
+    sli v24.4s, v24.4s, #16
+    cmhi v21.16b, v22.16b, v21.16b // < alpha
+    dup v22.16b, w3 // beta
+    cmlt v23.16b, v24.16b, #0
+    cmhi v28.16b, v22.16b, v28.16b // < beta
+    cmhi v30.16b, v22.16b, v30.16b // < beta
+    bic v21.16b, v21.16b, v23.16b
+    uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0)
+    and v21.16b, v21.16b, v28.16b
+    uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0)
+    cmhi v17.16b, v22.16b, v17.16b // < beta
+    and v21.16b, v21.16b, v30.16b
+    cmhi v19.16b, v22.16b, v19.16b // < beta
+    and v17.16b, v17.16b, v21.16b
+    and v19.16b, v19.16b, v21.16b
+    and v24.16b, v24.16b, v21.16b
+    urhadd v28.16b, v16.16b, v0.16b
+    sub v21.16b, v24.16b, v17.16b
+    uqadd v23.16b, v18.16b, v24.16b
+    uhadd v20.16b, v20.16b, v28.16b
+    sub v21.16b, v21.16b, v19.16b
+    uhadd v28.16b, v4.16b, v28.16b
+    umin v23.16b, v23.16b, v20.16b
+    uqsub v22.16b, v18.16b, v24.16b
+    uqadd v4.16b, v2.16b, v24.16b
+    umax v23.16b, v23.16b, v22.16b
+    uqsub v22.16b, v2.16b, v24.16b
+    umin v28.16b, v4.16b, v28.16b
+    uxtl v4.8h, v0.8b
+    umax v28.16b, v28.16b, v22.16b
+    uxtl2 v20.8h, v0.16b
+    usubw v4.8h, v4.8h, v16.8b
+    usubw2 v20.8h, v20.8h, v16.16b
+    shl v4.8h, v4.8h, #2
+    shl v20.8h, v20.8h, #2
+    uaddw v4.8h, v4.8h, v18.8b
+    uaddw2 v20.8h, v20.8h, v18.16b
+    usubw v4.8h, v4.8h, v2.8b
+    usubw2 v20.8h, v20.8h, v2.16b
+    rshrn v4.8b, v4.8h, #3
+    rshrn2 v4.16b, v20.8h, #3
+    bsl v17.16b, v23.16b, v18.16b
+    bsl v19.16b, v28.16b, v2.16b
+    neg v23.16b, v21.16b
+    uxtl v28.8h, v16.8b
+    smin v4.16b, v4.16b, v21.16b
+    uxtl2 v21.8h, v16.16b
+    smax v4.16b, v4.16b, v23.16b
+    uxtl v22.8h, v0.8b
+    uxtl2 v24.8h, v0.16b
+    saddw v28.8h, v28.8h, v4.8b
+    saddw2 v21.8h, v21.8h, v4.16b
+    ssubw v22.8h, v22.8h, v4.8b
+    ssubw2 v24.8h, v24.8h, v4.16b
+    sqxtun v16.8b, v28.8h
+    sqxtun2 v16.16b, v21.8h
+    sqxtun v0.8b, v22.8h
+    sqxtun2 v0.16b, v24.8h
+.endm
+
+function x264_deblock_v_luma_neon, export=1
+    h264_loop_filter_start
+
+    ld1 {v0.16b}, [x0], x1
+    ld1 {v2.16b}, [x0], x1
+    ld1 {v4.16b}, [x0], x1
+    sub x0, x0, x1, lsl #2
+    sub x0, x0, x1, lsl #1
+    ld1 {v20.16b}, [x0], x1
+    ld1 {v18.16b}, [x0], x1
+    ld1 {v16.16b}, [x0], x1
+
+    h264_loop_filter_luma
+
+    sub x0, x0, x1, lsl #1
+    st1 {v17.16b}, [x0], x1
+    st1 {v16.16b}, [x0], x1
+    st1 {v0.16b}, [x0], x1
+    st1 {v19.16b}, [x0]
+
+    ret
+endfunc
+
+function x264_deblock_h_luma_neon, export=1
+    h264_loop_filter_start
+
+    sub x0, x0, #4
+    ld1 {v6.8b}, [x0], x1
+    ld1 {v20.8b}, [x0], x1
+    ld1 {v18.8b}, [x0], x1
+    ld1 {v16.8b}, [x0], x1
+    ld1 {v0.8b}, [x0], x1
+    ld1 {v2.8b}, [x0], x1
+    ld1 {v4.8b}, [x0], x1
+    ld1 {v26.8b}, [x0], x1
+    ld1 {v6.d}[1], [x0], x1
+    ld1 {v20.d}[1], [x0], x1
+    ld1 {v18.d}[1], [x0], x1
+    ld1 {v16.d}[1], [x0], x1
+    ld1 {v0.d}[1], [x0], x1
+    ld1 {v2.d}[1], [x0], x1
+    ld1 {v4.d}[1], [x0], x1
+    ld1 {v26.d}[1], [x0], x1
+
+    transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
+
+    h264_loop_filter_luma
+
+    transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27
+
+    sub x0, x0, x1, lsl #4
+    add x0, x0, #2
+    st1 {v17.s}[0], [x0], x1
+    st1 {v16.s}[0], [x0], x1
+    st1 {v0.s}[0], [x0], x1
+    st1 {v19.s}[0], [x0], x1
+    st1 {v17.s}[1], [x0], x1
+    st1 {v16.s}[1], [x0], x1
+    st1 {v0.s}[1], [x0], x1
+    st1 {v19.s}[1], [x0], x1
+    st1 {v17.s}[2], [x0], x1
+    st1 {v16.s}[2], [x0], x1
+    st1 {v0.s}[2], [x0], x1
+    st1 {v19.s}[2], [x0], x1
+    st1 {v17.s}[3], [x0], x1
+    st1 {v16.s}[3], [x0], x1
+    st1 {v0.s}[3], [x0], x1
+    st1 {v19.s}[3], [x0], x1
+
+    ret
+endfunc
+
+.macro h264_loop_filter_chroma
+    dup v22.16b, w2 // alpha
+    uxtl v24.8h, v24.8b
+    uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
+    uxtl v4.8h, v0.8b
+    uxtl2 v5.8h, v0.16b
+    uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
+    usubw v4.8h, v4.8h, v16.8b
+    usubw2 v5.8h, v5.8h, v16.16b
+    sli v24.8h, v24.8h, #8
+    shl v4.8h, v4.8h, #2
+    shl v5.8h, v5.8h, #2
+    uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
+    uxtl v24.4s, v24.4h
+    uaddw v4.8h, v4.8h, v18.8b
+    uaddw2 v5.8h, v5.8h, v18.16b
+    cmhi v26.16b, v22.16b, v26.16b // < alpha
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/mc-a.S
Added
@@ -0,0 +1,1365 @@
+/*****************************************************************************
+ * mc.S: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *          Mans Rullgard <mans@mansr.com>
+ *          Stefan Groenroos <stefan.gronroos@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+// note: prefetch stuff assumes 64-byte cacheline
+
+// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
+function x264_prefetch_ref_aarch64, export=1
+    cmp w2, #1
+    csel x2, xzr, x1, eq
+    add x0, x0, #64
+    add x0, x0, x2, lsl #3
+
+    lsl x2, x1, #1
+    add x3, x1, x1, lsl #1
+    add x4, x0, x1, lsl #2
+
+    prfm pldl1strm, [x0]
+    prfm pldl1strm, [x0, x1]
+    prfm pldl1strm, [x0, x2]
+    prfm pldl1strm, [x0, x3]
+    prfm pldl1strm, [x4]
+    prfm pldl1strm, [x4, x1]
+    prfm pldl1strm, [x4, x2]
+    prfm pldl1strm, [x4, x3]
+    ret
+endfunc
+
+// void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y,
+//                     uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
+.macro x264_prefetch_fenc sub
+function x264_prefetch_fenc_\sub\()_aarch64, export=1
+    and w6, w5, #3
+    and w7, w5, #3
+    mul x6, x6, x1
+    mul x7, x7, x3
+    add x0, x0, #64
+    add x2, x2, #64
+
+    add x0, x0, x6, lsl #2
+    add x6, x0, x1, lsl #1
+    prfm pldl1strm, [x0]
+    prfm pldl1strm, [x0, x1]
+    prfm pldl1strm, [x6]
+    prfm pldl1strm, [x6, x1]
+
+    add x2, x2, x7, lsl #1
+    prfm pldl1strm, [x2]
+    prfm pldl1strm, [x2, x3]
+.ifc \sub, 422
+    add x7, x2, x3, lsl #1
+    prfm pldl1strm, [x7]
+    prfm pldl1strm, [x7, x3]
+.endif
+    ret
+endfunc
+.endm
+
+x264_prefetch_fenc 420
+x264_prefetch_fenc 422
+
+// void pixel_avg( uint8_t *dst,  intptr_t dst_stride,
+//                 uint8_t *src1, intptr_t src1_stride,
+//                 uint8_t *src2, intptr_t src2_stride, int weight );
+.macro AVGH w h
+function x264_pixel_avg_\w\()x\h\()_neon, export=1
+    mov w10, #64
+    cmp w6, #32
+    mov w9, #\h
+    b.eq pixel_avg_w\w\()_neon
+    subs w7, w10, w6
+    b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64
+    cmp w6, #0
+    b.ge pixel_avg_weight_w\w\()_add_add_neon
+    b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
+endfunc
+.endm
+
+AVGH 4, 2
+AVGH 4, 4
+AVGH 4, 8
+AVGH 4, 16
+AVGH 8, 4
+AVGH 8, 8
+AVGH 8, 16
+AVGH 16, 8
+AVGH 16, 16
+
+// 0 < weight < 64
+.macro load_weights_add_add
+    mov w6, w6
+.endm
+.macro weight_add_add dst, s1, s2, h=
+.ifc \h, 2
+    umull2 \dst, \s1, v30.16b
+    umlal2 \dst, \s2, v31.16b
+.else
+    umull \dst, \s1, v30.8b
+    umlal \dst, \s2, v31.8b
+.endif
+.endm
+
+// weight > 64
+.macro load_weights_add_sub
+    neg w7, w7
+.endm
+.macro weight_add_sub dst, s1, s2, h=
+.ifc \h, 2
+    umull2 \dst, \s1, v30.16b
+    umlsl2 \dst, \s2, v31.16b
+.else
+    umull \dst, \s1, v30.8b
+    umlsl \dst, \s2, v31.8b
+.endif
+.endm
+
+// weight < 0
+.macro load_weights_sub_add
+    neg w6, w6
+.endm
+.macro weight_sub_add dst, s1, s2, h=
+.ifc \h, 2
+    umull2 \dst, \s2, v31.16b
+    umlsl2 \dst, \s1, v30.16b
+.else
+    umull \dst, \s2, v31.8b
+    umlsl \dst, \s1, v30.8b
+.endif
+.endm
+
+.macro AVG_WEIGHT ext
+function pixel_avg_weight_w4_\ext\()_neon
+    load_weights_\ext
+    dup v30.8b, w6
+    dup v31.8b, w7
+1:  // height loop
+    subs w9, w9, #2
+    ld1 {v0.s}[0], [x2], x3
+    ld1 {v1.s}[0], [x4], x5
+    weight_\ext v4.8h, v0.8b, v1.8b
+    ld1 {v2.s}[0], [x2], x3
+    ld1 {v3.s}[0], [x4], x5
+    sqrshrun v0.8b, v4.8h, #6
+    weight_\ext v5.8h, v2.8b, v3.8b
+    st1 {v0.s}[0], [x0], x1
+    sqrshrun v1.8b, v5.8h, #6
+    st1 {v1.s}[0], [x0], x1
+    b.gt 1b
+    ret
+endfunc
+
+function pixel_avg_weight_w8_\ext\()_neon
+    load_weights_\ext
+    dup v30.8b, w6
+    dup v31.8b, w7
+1:  // height loop
+    subs w9, w9, #4
+    ld1 {v0.8b}, [x2], x3
+    ld1 {v1.8b}, [x4], x5
+    weight_\ext v16.8h, v0.8b, v1.8b
+    ld1 {v2.8b}, [x2], x3
+    ld1 {v3.8b}, [x4], x5
+    weight_\ext v17.8h, v2.8b, v3.8b
+    ld1 {v4.8b}, [x2], x3
+    ld1 {v5.8b}, [x4], x5
+    weight_\ext v18.8h, v4.8b, v5.8b
+    ld1 {v6.8b}, [x2], x3
+    ld1 {v7.8b}, [x4], x5
+    weight_\ext v19.8h, v6.8b, v7.8b
+    sqrshrun v0.8b, v16.8h, #6
+    sqrshrun v1.8b, v17.8h, #6
+    sqrshrun v2.8b, v18.8h, #6
+    sqrshrun v3.8b, v19.8h, #6
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/mc-c.c
Added
@@ -0,0 +1,249 @@
+/*****************************************************************************
+ * mc-c.c: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "mc.h"
+
+void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int );
+void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_prefetch_fenc_422_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
+void x264_memzero_aligned_neon( void *dst, size_t n );
+
+void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x2_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+
+void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
+                                        pixel *dstv, intptr_t i_dstv,
+                                        pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
+                                            pixel *dstb, intptr_t i_dstb,
+                                            pixel *dstc, intptr_t i_dstc,
+                                            pixel *src, intptr_t i_src, int pw, int w, int h );
+void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst,
+                                      pixel *srcu, intptr_t i_srcu,
+                                      pixel *srcv, intptr_t i_srcv, int w, int h );
+
+void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+
+#define MC_WEIGHT(func)\
+void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+\
+static void (* x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\
+{\
+    x264_mc_weight_w4##func##_neon,\
+    x264_mc_weight_w4##func##_neon,\
+    x264_mc_weight_w8##func##_neon,\
+    x264_mc_weight_w16##func##_neon,\
+    x264_mc_weight_w16##func##_neon,\
+    x264_mc_weight_w20##func##_neon,\
+};
+
+MC_WEIGHT()
+MC_WEIGHT(_nodenom)
+MC_WEIGHT(_offsetadd)
+MC_WEIGHT(_offsetsub)
+
+void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
+
+#if !HIGH_BIT_DEPTH
+static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
+{
+    if( w->i_scale == 1<<w->i_denom )
+    {
+        if( w->i_offset < 0 )
+        {
+            w->weightfn = x264_mc_offsetsub_wtab_neon;
+            w->cachea[0] = -w->i_offset;
+        }
+        else
+        {
+            w->weightfn = x264_mc_offsetadd_wtab_neon;
+            w->cachea[0] = w->i_offset;
+        }
+    }
+    else if( !w->i_denom )
+        w->weightfn = x264_mc_nodenom_wtab_neon;
+    else
+        w->weightfn = x264_mc_wtab_neon;
+}
+
+static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
+{
+    NULL,
+    x264_pixel_avg2_w4_neon,
+    x264_pixel_avg2_w8_neon,
+    x264_pixel_avg2_w16_neon,   // no slower than w12, so no point in a separate function
+    x264_pixel_avg2_w16_neon,
+    x264_pixel_avg2_w20_neon,
+};
+
+static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
+{
+    NULL,
+    x264_mc_copy_w4_neon,
+    x264_mc_copy_w8_neon,
+    NULL,
+    x264_mc_copy_w16_neon,
+};
+
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+
+static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
+                          uint8_t *src[4], intptr_t i_src_stride,
+                          int mvx, int mvy,
+                          int i_width, int i_height, const x264_weight_t *weight )
+{
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+    if ( (mvy&3) == 3 ) // explict if() to force conditional add
+        src1 += i_src_stride;
+
+    if( qpel_idx & 5 ) /* qpel interpolation needed */
+    {
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+        x264_pixel_avg_wtab_neon[i_width>>2](
+                dst, i_dst_stride, src1, i_src_stride,
+                src2, i_height );
+        if( weight->weightfn )
+            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
+    }
+    else if( weight->weightfn )
+        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
+    else
+        x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
+}
+
+static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
+                              uint8_t *src[4], intptr_t i_src_stride,
+                              int mvx, int mvy,
+                              int i_width, int i_height, const x264_weight_t *weight )
+{
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+    if ( (mvy&3) == 3 ) // explict if() to force conditional add
+        src1 += i_src_stride;
+
+    if( qpel_idx & 5 ) /* qpel interpolation needed */
+    {
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+        x264_pixel_avg_wtab_neon[i_width>>2](
+                dst, *i_dst_stride, src1, i_src_stride,
+                src2, i_height );
+        if( weight->weightfn )
+            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
+        return dst;
+    }
+    else if( weight->weightfn )
+    {
+        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
+        return dst;
+    }
+    else
+    {
+        *i_dst_stride = i_src_stride;
+        return src1;
+    }
+}
+
+void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+                            uint8_t *src, intptr_t stride, int width,
+                            int height, int16_t *buf );
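The qpel_idx dispatch in mc_luma_neon and get_ref_neon above packs the quarter-pel fractions of the motion vector into a four-bit code. A minimal standalone sketch, with hypothetical example values, of how that code selects between a plain plane copy and two-plane averaging:

    #include <stdio.h>

    int main(void)
    {
        int mvx = 1, mvy = 2; /* hypothetical quarter-pel MV fractions */
        /* Same packing as above: two low bits of mvy, then two low bits of mvx. */
        int qpel_idx = ((mvy & 3) << 2) + (mvx & 3);
        /* Bits 0 and 2 flag a horizontal/vertical quarter-pel offset, so
         * qpel_idx & 5 selects the interpolation (averaging) path. */
        printf("qpel_idx = %d -> %s\n", qpel_idx,
               (qpel_idx & 5) ? "average two half-pel planes"
                              : "copy a single plane");
        return 0;
    }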
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/mc.h
Added
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * mc.h: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_MC_H
+#define X264_AARCH64_MC_H
+
+void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf );
+
+#endif
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/pixel-a.S
Added
@@ -0,0 +1,1153 @@
+/*****************************************************************************
+ * pixel.S: aarch64 pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+const mask
+.rept 16
+.byte 0xff
+.endr
+.rept 16
+.byte 0x00
+.endr
+endconst
+
+const mask_ac_4_8
+.short 0, -1, -1, -1, 0, -1, -1, -1
+.short 0, -1, -1, -1, -1, -1, -1, -1
+endconst
+
+.macro SAD_START_4
+    ld1 {v1.s}[0], [x2], x3
+    ld1 {v0.s}[0], [x0], x1
+    ld1 {v1.s}[1], [x2], x3
+    ld1 {v0.s}[1], [x0], x1
+    uabdl v16.8h, v0.8b, v1.8b
+.endm
+
+.macro SAD_4
+    ld1 {v1.s}[0], [x2], x3
+    ld1 {v0.s}[0], [x0], x1
+    ld1 {v1.s}[1], [x2], x3
+    ld1 {v0.s}[1], [x0], x1
+    uabal v16.8h, v0.8b, v1.8b
+.endm
+
+.macro SAD_START_8
+    ld1 {v1.8b}, [x2], x3
+    ld1 {v0.8b}, [x0], x1
+    ld1 {v3.8b}, [x2], x3
+    ld1 {v2.8b}, [x0], x1
+    uabdl v16.8h, v0.8b, v1.8b
+    uabdl v17.8h, v2.8b, v3.8b
+.endm
+
+.macro SAD_8
+    ld1 {v1.8b}, [x2], x3
+    ld1 {v0.8b}, [x0], x1
+    ld1 {v3.8b}, [x2], x3
+    ld1 {v2.8b}, [x0], x1
+    uabal v16.8h, v0.8b, v1.8b
+    uabal v17.8h, v2.8b, v3.8b
+.endm
+
+.macro SAD_START_16
+    ld1 {v1.16b}, [x2], x3
+    ld1 {v0.16b}, [x0], x1
+    ld1 {v3.16b}, [x2], x3
+    ld1 {v2.16b}, [x0], x1
+    uabdl v16.8h, v0.8b, v1.8b
+    uabdl2 v17.8h, v0.16b, v1.16b
+    uabal v16.8h, v2.8b, v3.8b
+    uabal2 v17.8h, v2.16b, v3.16b
+.endm
+
+.macro SAD_16
+    ld1 {v1.16b}, [x2], x3
+    ld1 {v0.16b}, [x0], x1
+    ld1 {v3.16b}, [x2], x3
+    ld1 {v2.16b}, [x0], x1
+    uabal v16.8h, v0.8b, v1.8b
+    uabal2 v17.8h, v0.16b, v1.16b
+    uabal v16.8h, v2.8b, v3.8b
+    uabal2 v17.8h, v2.16b, v3.16b
+.endm
+
+.macro SAD_FUNC w, h, name
+function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
+    SAD_START_\w
+
+.rept \h / 2 - 1
+    SAD_\w
+.endr
+.if \w > 4
+    add v16.8h, v16.8h, v17.8h
+.endif
+    uaddlv s0, v16.8h
+    fmov w0, s0
+    ret
+endfunc
+.endm
+
+SAD_FUNC 4, 4
+SAD_FUNC 4, 8
+SAD_FUNC 8, 4
+SAD_FUNC 8, 8
+SAD_FUNC 8, 16
+SAD_FUNC 16, 8
+SAD_FUNC 16, 16
+
+.macro SAD_X_4 x, first=uabal
+    ld1 {v0.s}[0], [x0], x7
+    ld1 {v1.s}[0], [x1], x5
+    ld1 {v0.s}[1], [x0], x7
+    ld1 {v1.s}[1], [x1], x5
+    \first v16.8h, v1.8b, v0.8b
+    ld1 {v2.s}[0], [x2], x5
+    ld1 {v2.s}[1], [x2], x5
+    \first v17.8h, v2.8b, v0.8b
+    ld1 {v3.s}[0], [x3], x5
+    ld1 {v3.s}[1], [x3], x5
+    \first v18.8h, v3.8b, v0.8b
+.if \x == 4
+    ld1 {v4.s}[0], [x4], x5
+    ld1 {v4.s}[1], [x4], x5
+    \first v19.8h, v4.8b, v0.8b
+.endif
+.endm
+
+.macro SAD_X_8 x, first=uabal
+    ld1 {v0.8b}, [x0], x7
+    ld1 {v1.8b}, [x1], x5
+    \first v16.8h, v1.8b, v0.8b
+    ld1 {v2.8b}, [x2], x5
+    ld1 {v5.8b}, [x0], x7
+    \first v17.8h, v2.8b, v0.8b
+    ld1 {v3.8b}, [x3], x5
+    ld1 {v1.8b}, [x1], x5
+    \first v18.8h, v3.8b, v0.8b
+    uabal v16.8h, v1.8b, v5.8b
+    ld1 {v2.8b}, [x2], x5
+    ld1 {v3.8b}, [x3], x5
+    uabal v17.8h, v2.8b, v5.8b
+    uabal v18.8h, v3.8b, v5.8b
+.if \x == 4
+    ld1 {v4.8b}, [x4], x5
+    \first v19.8h, v4.8b, v0.8b
+    ld1 {v4.8b}, [x4], x5
+    uabal v19.8h, v4.8b, v5.8b
+.endif
+.endm
+
+.macro SAD_X_16 x, first=uabal
+    ld1 {v0.16b}, [x0], x7
+    ld1 {v1.16b}, [x1], x5
+    \first v16.8h, v1.8b, v0.8b
+    \first\()2 v20.8h, v1.16b, v0.16b
+    ld1 {v2.16b}, [x2], x5
+    ld1 {v5.16b}, [x0], x7
+    \first v17.8h, v2.8b, v0.8b
+    \first\()2 v21.8h, v2.16b, v0.16b
+    ld1 {v3.16b}, [x3], x5
+    ld1 {v1.16b}, [x1], x5
+    \first v18.8h, v3.8b, v0.8b
+    \first\()2 v22.8h, v3.16b, v0.16b
+    uabal v16.8h, v1.8b, v5.8b
+    uabal2 v20.8h, v1.16b, v5.16b
+    ld1 {v2.16b}, [x2], x5
+    ld1 {v3.16b}, [x3], x5
+    uabal v17.8h, v2.8b, v5.8b
+    uabal2 v21.8h, v2.16b, v5.16b
+    uabal v18.8h, v3.8b, v5.8b
+    uabal2 v22.8h, v3.16b, v5.16b
+.if \x == 4
+    ld1 {v4.16b}, [x4], x5
+    \first v19.8h, v4.8b, v0.8b
+    \first\()2 v23.8h, v4.16b, v0.16b
+    ld1 {v4.16b}, [x4], x5
+    uabal v19.8h, v4.8b, v5.8b
+    uabal2 v23.8h, v4.16b, v5.16b
+.endif
+.endm
+
+.macro SAD_X_FUNC x, w, h
+function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
+.if \x == 3
+    mov x6, x5
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/pixel.h
Added
@@ -0,0 +1,69 @@
+/*****************************************************************************
+ * pixel.h: aarch64 pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_PIXEL_H
+#define X264_AARCH64_PIXEL_H
+
+#define DECL_PIXELS( ret, name, suffix, args ) \
+    ret x264_pixel_##name##_16x16_##suffix args;\
+    ret x264_pixel_##name##_16x8_##suffix args;\
+    ret x264_pixel_##name##_8x16_##suffix args;\
+    ret x264_pixel_##name##_8x8_##suffix args;\
+    ret x264_pixel_##name##_8x4_##suffix args;\
+    ret x264_pixel_##name##_4x8_##suffix args;\
+    ret x264_pixel_##name##_4x4_##suffix args;\
+
+#define DECL_X1( name, suffix ) \
+    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
+
+#define DECL_X4( name, suffix ) \
+    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
+    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
+
+DECL_X1( sad, neon )
+DECL_X4( sad, neon )
+DECL_X1( satd, neon )
+DECL_X1( ssd, neon )
+
+int x264_pixel_sa8d_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t );
+int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
+
+uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
+uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
+int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+
+uint64_t x264_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t );
+uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
+
+void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
+                                      const uint8_t *, intptr_t,
+                                      int sums[2][4] );
+float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
+
+#endif
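Each DECL_PIXELS use above stamps out one prototype per partition size; for instance, DECL_X1( sad, neon ) expands (in part) to declarations like the following:

    /* Two of the seven prototypes generated by DECL_X1( sad, neon ): */
    int x264_pixel_sad_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
    int x264_pixel_sad_4x4_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );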
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/predict-a.S
Added
@@ -0,0 +1,661 @@
+/*****************************************************************************
+ * predict.S: aarch64 intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *          Mans Rullgard <mans@mansr.com>
+ *          Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+const p8weight, align=4
+    .short 1, 2, 3, 4, 1, 2, 3, 4
+endconst
+const p16weight, align=4
+    .short 1, 2, 3, 4, 5, 6, 7, 8
+endconst
+
+.macro ldcol.8 vd, xn, xm, n=8, hi=0
+.if \n == 8 || \hi == 0
+    ld1 {\vd\().b}[0], [\xn], \xm
+    ld1 {\vd\().b}[1], [\xn], \xm
+    ld1 {\vd\().b}[2], [\xn], \xm
+    ld1 {\vd\().b}[3], [\xn], \xm
+.endif
+.if \n == 8 || \hi == 1
+    ld1 {\vd\().b}[4], [\xn], \xm
+    ld1 {\vd\().b}[5], [\xn], \xm
+    ld1 {\vd\().b}[6], [\xn], \xm
+    ld1 {\vd\().b}[7], [\xn], \xm
+.endif
+.endm
+
+.macro ldcol.16 vd, xn, xm
+    ldcol.8 \vd, \xn, \xm
+    ld1 {\vd\().b}[ 8], [\xn], \xm
+    ld1 {\vd\().b}[ 9], [\xn], \xm
+    ld1 {\vd\().b}[10], [\xn], \xm
+    ld1 {\vd\().b}[11], [\xn], \xm
+    ld1 {\vd\().b}[12], [\xn], \xm
+    ld1 {\vd\().b}[13], [\xn], \xm
+    ld1 {\vd\().b}[14], [\xn], \xm
+    ld1 {\vd\().b}[15], [\xn], \xm
+.endm
+
+
+function x264_predict_4x4_h_aarch64, export=1
+    ldrb w1, [x0, #0*FDEC_STRIDE-1]
+    ldrb w2, [x0, #1*FDEC_STRIDE-1]
+    ldrb w3, [x0, #2*FDEC_STRIDE-1]
+    ldrb w4, [x0, #3*FDEC_STRIDE-1]
+    add w1, w1, w1, lsl #8
+    add w2, w2, w2, lsl #8
+    add w3, w3, w3, lsl #8
+    add w4, w4, w4, lsl #8
+    add w1, w1, w1, lsl #16
+    str w1, [x0, #0*FDEC_STRIDE]
+    add w2, w2, w2, lsl #16
+    str w2, [x0, #1*FDEC_STRIDE]
+    add w3, w3, w3, lsl #16
+    str w3, [x0, #2*FDEC_STRIDE]
+    add w4, w4, w4, lsl #16
+    str w4, [x0, #3*FDEC_STRIDE]
+    ret
+endfunc
+
+function x264_predict_4x4_v_aarch64, export=1
+    ldr w1, [x0, #0 - 1 * FDEC_STRIDE]
+    str w1, [x0, #0 + 0 * FDEC_STRIDE]
+    str w1, [x0, #0 + 1 * FDEC_STRIDE]
+    str w1, [x0, #0 + 2 * FDEC_STRIDE]
+    str w1, [x0, #0 + 3 * FDEC_STRIDE]
+    ret
+endfunc
+
+function x264_predict_4x4_dc_neon, export=1
+    sub x1, x0, #FDEC_STRIDE
+    sub x2, x0, #1
+    mov x7, #FDEC_STRIDE
+    ld1 {v0.8b}, [x1]
+    ld1r {v1.8b}, [x2], x7
+    ld1r {v2.8b}, [x2], x7
+    ld1r {v3.8b}, [x2], x7
+    ld1r {v4.8b}, [x2], x7
+    uaddlp v0.4h, v0.8b
+    uaddl v1.8h, v1.8b, v2.8b
+    uaddl v2.8h, v3.8b, v4.8b
+    addp v0.4h, v0.4h, v0.4h
+    add v1.4h, v1.4h, v2.4h
+    dup v0.4h, v0.h[0]
+    add v0.4h, v0.4h, v1.4h
+    rshrn v0.8b, v0.8h, #3
+    str s0, [x0], #FDEC_STRIDE
+    str s0, [x0], #FDEC_STRIDE
+    str s0, [x0], #FDEC_STRIDE
+    str s0, [x0]
+    ret
+endfunc
+
+function x264_predict_4x4_dc_top_neon, export=1
+    sub x1, x0, #FDEC_STRIDE
+    mov x7, #FDEC_STRIDE
+    ld1 {v0.8b}, [x1]
+    uaddlp v0.4h, v0.8b
+    addp v0.4h, v0.4h, v0.4h
+    dup v0.4h, v0.h[0]
+    rshrn v0.8b, v0.8h, #2
+    str s0, [x0], #FDEC_STRIDE
+    str s0, [x0], #FDEC_STRIDE
+    str s0, [x0], #FDEC_STRIDE
+    str s0, [x0]
+    ret
+endfunc
+
+function x264_predict_4x4_ddr_neon, export=1
+    sub x1, x0, #FDEC_STRIDE+1
+    mov x7, #FDEC_STRIDE
+    ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1
+    ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1
+    ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1
+    ext v0.8b, v1.8b, v0.8b, #7
+    ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1
+    ext v0.8b, v2.8b, v0.8b, #7 // a
+    ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1
+    ext v1.8b, v3.8b, v0.8b, #7 // b
+    ext v2.8b, v4.8b, v1.8b, #7 // c
+    uaddl v0.8h, v0.8b, v1.8b
+    uaddl v1.8h, v1.8b, v2.8b
+    add v0.8h, v0.8h, v1.8h
+    rshrn v0.8b, v0.8h, #2
+
+    ext v3.8b, v0.8b, v0.8b, #3
+    ext v2.8b, v0.8b, v0.8b, #2
+    ext v1.8b, v0.8b, v0.8b, #1
+
+    str s3, [x0], #FDEC_STRIDE
+    str s2, [x0], #FDEC_STRIDE
+    str s1, [x0], #FDEC_STRIDE
+    str s0, [x0]
+    ret
+endfunc
+
+function x264_predict_4x4_ddl_neon, export=1
+    sub x0, x0, #FDEC_STRIDE
+    mov x7, #FDEC_STRIDE
+    ld1 {v0.8b}, [x0], x7
+    dup v3.8b, v0.b[7]
+    ext v1.8b, v0.8b, v0.8b, #1
+    ext v2.8b, v0.8b, v3.8b, #2
+    uhadd v0.8b, v0.8b, v2.8b
+    urhadd v0.8b, v0.8b, v1.8b
+    str s0, [x0], #FDEC_STRIDE
+    ext v1.8b, v0.8b, v0.8b, #1
+    ext v2.8b, v0.8b, v0.8b, #2
+    str s1, [x0], #FDEC_STRIDE
+    ext v3.8b, v0.8b, v0.8b, #3
+    str s2, [x0], #FDEC_STRIDE
+    str s3, [x0]
+    ret
+endfunc
+
+function x264_predict_8x8_dc_neon, export=1
+    mov x7, #FDEC_STRIDE
+    ld1 {v0.16b}, [x1], #16
+    ld1 {v1.8b}, [x1]
+    ext v0.16b, v0.16b, v0.16b, #7
+    uaddlv h1, v1.8b
+    uaddlv h0, v0.8b
+    add v0.8h, v0.8h, v1.8h
+    dup v0.8h, v0.h[0]
+    rshrn v0.8b, v0.8h, #4
+.rept 8
+    st1 {v0.8b}, [x0], x7
+.endr
+    ret
+endfunc
+
+function x264_predict_8x8_h_neon, export=1
+    mov x7, #FDEC_STRIDE
+    ld1 {v16.16b}, [x1]
+    dup v0.8b, v16.b[14]
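The scalar 4x4 predictors earlier in this file are easiest to check against a C reference: vertical prediction copies the row above the block into all four rows, horizontal broadcasts each left-neighbour pixel across its row, with FDEC_STRIDE (32) as the decoded-frame pitch. A sketch equivalent to the aarch64 versions:

    #include <stdint.h>
    #include <string.h>

    #define FDEC_STRIDE 32  /* decoded-block pitch, as defined in asm.S */

    static void predict_4x4_v_c( uint8_t *src )  /* cf. x264_predict_4x4_v_aarch64 */
    {
        for( int y = 0; y < 4; y++ )
            memcpy( &src[y*FDEC_STRIDE], &src[-FDEC_STRIDE], 4 );
    }

    static void predict_4x4_h_c( uint8_t *src )  /* cf. x264_predict_4x4_h_aarch64 */
    {
        for( int y = 0; y < 4; y++ )
            memset( &src[y*FDEC_STRIDE], src[y*FDEC_STRIDE - 1], 4 );
    }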
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/predict-c.c
Added
@@ -0,0 +1,114 @@
+/*****************************************************************************
+ * predict.c: aarch64 intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "predict.h"
+#include "pixel.h"
+
+void x264_predict_4x4_dc_top_neon( uint8_t *src );
+void x264_predict_4x4_ddr_neon( uint8_t *src );
+void x264_predict_4x4_ddl_neon( uint8_t *src );
+
+void x264_predict_8x8c_dc_top_neon( uint8_t *src );
+void x264_predict_8x8c_dc_left_neon( uint8_t *src );
+void x264_predict_8x8c_p_neon( uint8_t *src );
+
+void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
+
+void x264_predict_16x16_dc_top_neon( uint8_t *src );
+void x264_predict_16x16_dc_left_neon( uint8_t *src );
+void x264_predict_16x16_p_neon( uint8_t *src );
+
+void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] )
+{
+#if !HIGH_BIT_DEPTH
+    if (cpu&X264_CPU_ARMV8)
+    {
+        pf[I_PRED_4x4_H] = x264_predict_4x4_h_aarch64;
+        pf[I_PRED_4x4_V] = x264_predict_4x4_v_aarch64;
+    }
+
+    if (cpu&X264_CPU_NEON)
+    {
+        pf[I_PRED_4x4_DC]     = x264_predict_4x4_dc_neon;
+        pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
+        pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
+        pf[I_PRED_4x4_DDR]    = x264_predict_4x4_ddr_neon;
+    }
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] )
+{
+    if (!(cpu&X264_CPU_NEON))
+        return;
+
+#if !HIGH_BIT_DEPTH
+    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
+    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
+    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
+    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
+    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_neon;
+    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
+{
+    if (!(cpu&X264_CPU_NEON))
+        return;
+
+#if !HIGH_BIT_DEPTH
+    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
+    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
+    pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
+    pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
+    pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
+    pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
+    pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
+    pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
+    pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] )
+{
+    if (!(cpu&X264_CPU_NEON))
+        return;
+
+#if !HIGH_BIT_DEPTH
+    pf[I_PRED_16x16_DC ]    = x264_predict_16x16_dc_neon;
+    pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon;
+    pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon;
+    pf[I_PRED_16x16_H ]     = x264_predict_16x16_h_neon;
+    pf[I_PRED_16x16_V ]     = x264_predict_16x16_v_neon;
+    pf[I_PRED_16x16_P ]     = x264_predict_16x16_p_neon;
+#endif // !HIGH_BIT_DEPTH
+}
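For context, these init hooks are meant to run once at startup after the portable C defaults have been installed; each table entry they overwrite is then used for every later prediction call. A hedged usage sketch (the x264_predict_t shape and the driver below are illustrative, not the encoder's actual wiring):

    typedef void (*x264_predict_t)( uint8_t *src );  /* shape of one predictor */

    void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] );

    static void init_predictors( int cpu, x264_predict_t pf[12] )
    {
        /* ...fill pf[] with the C fallbacks first (omitted here)... */
        x264_predict_4x4_init_aarch64( cpu, pf );  /* overrides when flags allow */
    }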
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/predict.h
Added
@@ -0,0 +1,52 @@
+/*****************************************************************************
+ * predict.h: aarch64 intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_PREDICT_H
+#define X264_AARCH64_PREDICT_H
+
+void x264_predict_4x4_h_aarch64( uint8_t *src );
+void x264_predict_4x4_v_aarch64( uint8_t *src );
+
+// for the merged 4x4 intra sad/satd which expects unified suffix
+#define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64
+#define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
+
+void x264_predict_4x4_dc_neon( uint8_t *src );
+void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8c_dc_neon( uint8_t *src );
+void x264_predict_8x8c_h_neon( uint8_t *src );
+void x264_predict_8x8c_v_neon( uint8_t *src );
+void x264_predict_16x16_v_neon( uint8_t *src );
+void x264_predict_16x16_h_neon( uint8_t *src );
+void x264_predict_16x16_dc_neon( uint8_t *src );
+
+void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] );
+void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
+void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] );
+void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] );
+
+#endif /* X264_AARCH64_PREDICT_H */
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/quant-a.S
Added
@@ -0,0 +1,386 @@
+/****************************************************************************
+ * quant.S: arm quantization and level-run
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
+    add v18.8h, v18.8h, \bias0
+    add v19.8h, v19.8h, \bias1
+    umull v20.4s, v18.4h, \mf0_1\().4h
+    umull2 v21.4s, v18.8h, \mf0_1\().8h
+    umull v22.4s, v19.4h, \mf2_3\().4h
+    umull2 v23.4s, v19.8h, \mf2_3\().8h
+    sshr v16.8h, v16.8h, #15
+    sshr v17.8h, v17.8h, #15
+    shrn v18.4h, v20.4s, #16
+    shrn2 v18.8h, v21.4s, #16
+    shrn v19.4h, v22.4s, #16
+    shrn2 v19.8h, v23.4s, #16
+    eor v18.16b, v18.16b, v16.16b
+    eor v19.16b, v19.16b, v17.16b
+    sub v18.8h, v18.8h, v16.8h
+    sub v19.8h, v19.8h, v17.8h
+    orr \mask, v18.16b, v19.16b
+    st1 {v18.8h,v19.8h}, [x0], #32
+.endm
+
+.macro QUANT_END d
+    fmov x2, \d
+    mov w0, #0
+    tst x2, x2
+    cinc w0, w0, ne
+    ret
+.endm
+
+// quant_2x2_dc( int16_t dct[4], int mf, int bias )
+function x264_quant_2x2_dc_neon, export=1
+    ld1 {v0.4h}, [x0]
+    dup v2.4h, w2
+    dup v1.4h, w1
+    abs v3.4h, v0.4h
+    add v3.4h, v3.4h, v2.4h
+    umull v3.4s, v3.4h, v1.4h
+    sshr v0.4h, v0.4h, #15
+    shrn v3.4h, v3.4s, #16
+    eor v3.8b, v3.8b, v0.8b
+    sub v3.4h, v3.4h, v0.4h
+    st1 {v3.4h}, [x0]
+    QUANT_END d3
+endfunc
+
+// quant_4x4_dc( int16_t dct[16], int mf, int bias )
+function x264_quant_4x4_dc_neon, export=1
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    dup v0.8h, w2
+    dup v2.8h, w1
+    QUANT_TWO v0.8h, v0.8h, v2, v2, v0.16b
+    uqxtn v0.8b, v0.8h
+    QUANT_END d0
+endfunc
+
+// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4_neon, export=1
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    ld1 {v0.8h,v1.8h}, [x2]
+    ld1 {v2.8h,v3.8h}, [x1]
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v0.16b
+    uqxtn v0.8b, v0.8h
+    QUANT_END d0
+endfunc
+
+// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4x4_neon, export=1
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    ld1 {v0.8h,v1.8h}, [x2]
+    ld1 {v2.8h,v3.8h}, [x1]
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v6.16b
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v7.16b
+    uqxtn v4.8b, v4.8h
+    uqxtn v7.8b, v7.8h
+    uqxtn v6.8b, v6.8h
+    uqxtn v5.8b, v5.8h
+    fmov x7, d7
+    fmov x6, d6
+    fmov x5, d5
+    fmov x4, d4
+    mov w0, #0
+    tst x7, x7
+    cinc w0, w0, ne
+    lsl w0, w0, #1
+    tst x6, x6
+    cinc w0, w0, ne
+    lsl w0, w0, #1
+    tst x5, x5
+    cinc w0, w0, ne
+    lsl w0, w0, #1
+    tst x4, x4
+    cinc w0, w0, ne
+    ret
+endfunc
+
+// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
+function x264_quant_8x8_neon, export=1
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    ld1 {v0.8h,v1.8h}, [x2], #32
+    ld1 {v2.8h,v3.8h}, [x1], #32
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b
+.rept 3
+    ld1 {v16.8h,v17.8h}, [x0]
+    abs v18.8h, v16.8h
+    abs v19.8h, v17.8h
+    ld1 {v0.8h,v1.8h}, [x2], #32
+    ld1 {v2.8h,v3.8h}, [x1], #32
+    QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b
+    orr v4.16b, v4.16b, v5.16b
+.endr
+    uqxtn v0.8b, v4.8h
+    QUANT_END d0
+endfunc
+
+.macro DEQUANT_START mf_size offset dc=no
+    mov w3, #0x2b
+    mul w3, w3, w2
+    lsr w3, w3, #8 // i_qbits = i_qp / 6
+    add w5, w3, w3, lsl #1
+    sub w2, w2, w5, lsl #1 // i_mf = i_qp % 6
+    lsl w2, w2, #\mf_size
+.ifc \dc,no
+    add x1, x1, w2, sxtw // dequant_mf[i_mf]
+.else
+    ldr x1, [x1, w2, sxtw] // dequant_mf[i_mf][0][0]
+.endif
+    subs w3, w3, #\offset // 6 for 8x8
+.endm
+
+// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
+.macro DEQUANT size bits
+function x264_dequant_\size\()_neon, export=1
+    DEQUANT_START \bits+2, \bits
+.ifc \size, 8x8
+    mov w2, #4
+.endif
+    b.lt dequant_\size\()_rshift
+
+    dup v31.8h, w3
+dequant_\size\()_lshift_loop:
+.ifc \size, 8x8
+    subs w2, w2, #1
+.endif
+    ld1 {v16.4s}, [x1], #16
+    ld1 {v17.4s}, [x1], #16
+    sqxtn v2.4h, v16.4s
+    ld1 {v18.4s}, [x1], #16
+    sqxtn2 v2.8h, v17.4s
+    ld1 {v19.4s}, [x1], #16
+    sqxtn v3.4h, v18.4s
+    ld1 {v0.8h,v1.8h}, [x0]
+    sqxtn2 v3.8h, v19.4s
+    mul v0.8h, v0.8h, v2.8h
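QUANT_TWO above is the vectorized form of x264's scalar quantization rule: take |coef|, add the rounding bias, multiply by the scale factor mf, shift down 16, then restore the sign with the XOR/subtract trick (sshr #15 produces a 0/-1 sign mask). Per coefficient, in C (a sketch mirroring the assembly's bit manipulation, not the encoder's exact source):

    #include <stdint.h>

    /* Quantize one coefficient in place; returns nonzero if the level is nonzero. */
    static inline int quant_one( int16_t *coef, uint16_t mf, uint16_t bias )
    {
        int sign = *coef >> 15;                        /* 0 or -1, like sshr #15 */
        int abs_coef = (*coef ^ sign) - sign;          /* |coef| */
        int level = (uint32_t)(abs_coef + bias) * mf >> 16;
        *coef = (int16_t)((level ^ sign) - sign);      /* eor + sub re-applies sign */
        return *coef != 0;
    }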
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/quant.h
Added
@@ -0,0 +1,47 @@
+/*****************************************************************************
+ * quant.h: arm quantization and level-run
+ *****************************************************************************
+ * Copyright (C) 2005-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_QUANT_H
+#define X264_AARCH64_QUANT_H
+
+int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias );
+
+int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
+int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+
+void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+
+int x264_coeff_last4_aarch64( int16_t * );
+int x264_coeff_last8_aarch64( int16_t * );
+int x264_coeff_last15_neon( int16_t * );
+int x264_coeff_last16_neon( int16_t * );
+int x264_coeff_last64_neon( int16_t * );
+
+#endif
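DEQUANT_START in quant-a.S splits the quantizer as i_qbits = i_qp/6 (the 0x2b multiply and #8 shift is a divide-by-6 trick) and i_mf = i_qp%6, then branches to a left-shift or rounded right-shift loop. The scalar rule it implements is, per the H.264 dequant definition (a hedged sketch; offset is 4 for 4x4 and 6 for 8x8 blocks):

    #include <stdint.h>

    static inline int16_t dequant_one( int16_t coef, int mf, int i_qp, int offset )
    {
        int i_qbits = i_qp/6 - offset;
        if( i_qbits >= 0 )
            return (int16_t)( coef * mf << i_qbits );
        /* negative qbits: scale down with rounding */
        return (int16_t)( ( coef * mf + (1 << (-i_qbits - 1)) ) >> -i_qbits );
    }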
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/asm.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/asm.S
Changed
@@ -28,6 +28,16 @@ .syntax unified +#if HAVE_NEON + .arch armv7-a +#elif HAVE_ARMV6T2 + .arch armv6t2 +#elif HAVE_ARMV6 + .arch armv6 +#endif + +.fpu neon + #ifdef PREFIX # define EXTERN_ASM _ #else @@ -40,32 +50,49 @@ # define ELF @ #endif - .macro require8, val=1 +#if HAVE_AS_FUNC +# define FUNC +#else +# define FUNC @ +#endif + +.macro require8, val=1 ELF .eabi_attribute 24, \val - .endm +.endm - .macro preserve8, val=1 +.macro preserve8, val=1 ELF .eabi_attribute 25, \val - .endm +.endm - .macro function name - .global EXTERN_ASM\name +.macro function name, export=1 + .macro endfunc +ELF .size \name, . - \name +FUNC .endfunc + .purgem endfunc + .endm .align 2 +.if \export == 1 + .global EXTERN_ASM\name +ELF .hidden EXTERN_ASM\name +ELF .type EXTERN_ASM\name, %function +FUNC .func EXTERN_ASM\name EXTERN_ASM\name: +.else ELF .hidden \name ELF .type \name, %function - .func \name +FUNC .func \name \name: - .endm +.endif +.endm - .macro movrel rd, val +.macro movrel rd, val #if HAVE_ARMV6T2 && !defined(PIC) movw \rd, #:lower16:\val movt \rd, #:upper16:\val #else ldr \rd, =\val #endif - .endm +.endm .macro movconst rd, val #if HAVE_ARMV6T2 @@ -78,6 +105,10 @@ #endif .endm +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) + #define FENC_STRIDE 16 #define FDEC_STRIDE 32
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/cpu-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/cpu-a.S
Changed
@@ -25,7 +25,6 @@ #include "asm.S" -.fpu neon .align 2 // done in gas because .fpu neon overrides the refusal to assemble @@ -33,12 +32,12 @@ function x264_cpu_neon_test vadd.i16 q0, q0, q0 bx lr -.endfunc +endfunc // return: 0 on success // 1 if counters were already enabled // 9 if lo-res counters were already enabled -function x264_cpu_enable_armv7_counter +function x264_cpu_enable_armv7_counter, export=0 mrc p15, 0, r2, c9, c12, 0 // read PMNC ands r0, r2, #1 andne r0, r2, #9 @@ -49,14 +48,14 @@ mov r2, #1 << 31 // enable cycle counter mcr p15, 0, r2, c9, c12, 1 // write CNTENS bx lr -.endfunc +endfunc -function x264_cpu_disable_armv7_counter +function x264_cpu_disable_armv7_counter, export=0 mrc p15, 0, r0, c9, c12, 0 // read PMNC bic r0, r0, #1 // disable counters mcr p15, 0, r0, c9, c12, 0 // write PMNC bx lr -.endfunc +endfunc .macro READ_TIME r @@ -106,4 +105,4 @@ cmp r0, #10 movgt r0, #0 pop {r4-r6,pc} -.endfunc +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/dct-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/dct-a.S
Changed
@@ -25,8 +25,6 @@ #include "asm.S" -.fpu neon - .section .rodata .align 4 @@ -82,7 +80,7 @@ vrhadd.s16 d3, d6, d7 vst1.64 {d0-d3}, [r0,:128] bx lr -.endfunc +endfunc function x264_idct4x4dc_neon vld1.64 {d0-d3}, [r0,:128] @@ -94,7 +92,7 @@ HADAMARD 2, sumsub, d3, d2, d6, d7 vst1.64 {d0-d3}, [r0,:128] bx lr -.endfunc +endfunc .macro DCT_1D d0 d1 d2 d3 d4 d5 d6 d7 @@ -129,9 +127,9 @@ DCT_1D d4, d5, d6, d7, d0, d1, d2, d3 vst1.64 {d4-d7}, [r0,:128] bx lr -.endfunc +endfunc -function x264_sub8x4_dct_neon +function x264_sub8x4_dct_neon, export=0 vld1.64 {d0}, [r1,:64], r3 vld1.64 {d1}, [r2,:64], ip vsubl.u8 q8, d0, d1 @@ -165,7 +163,7 @@ vst1.64 {d4-d5}, [r0,:128]! vst1.64 {d6-d7}, [r0,:128]! bx lr -.endfunc +endfunc function x264_sub8x8_dct_neon push {lr} @@ -174,7 +172,7 @@ bl x264_sub8x4_dct_neon pop {lr} b x264_sub8x4_dct_neon -.endfunc +endfunc function x264_sub16x16_dct_neon push {lr} @@ -195,7 +193,7 @@ bl x264_sub8x4_dct_neon pop {lr} b x264_sub8x4_dct_neon -.endfunc +endfunc .macro DCT8_1D type @@ -279,22 +277,22 @@ vst1.64 {d24-d27}, [r0,:128]! vst1.64 {d28-d31}, [r0,:128]! bx lr -.endfunc +endfunc function x264_sub16x16_dct8_neon push {lr} - bl x264_sub8x8_dct8_neon + bl X(x264_sub8x8_dct8_neon) sub r1, r1, #FENC_STRIDE*8 - 8 sub r2, r2, #FDEC_STRIDE*8 - 8 - bl x264_sub8x8_dct8_neon + bl X(x264_sub8x8_dct8_neon) sub r1, r1, #8 sub r2, r2, #8 - bl x264_sub8x8_dct8_neon + bl X(x264_sub8x8_dct8_neon) pop {lr} sub r1, r1, #FENC_STRIDE*8 - 8 sub r2, r2, #FDEC_STRIDE*8 - 8 - b x264_sub8x8_dct8_neon -.endfunc + b X(x264_sub8x8_dct8_neon) +endfunc // First part of IDCT (minus final SUMSUB_BA) @@ -336,9 +334,9 @@ vst1.32 {d2[1]}, [r0,:32], r2 vst1.32 {d2[0]}, [r0,:32], r2 bx lr -.endfunc +endfunc -function x264_add8x4_idct_neon +function x264_add8x4_idct_neon, export=0 vld1.64 {d0-d3}, [r1,:128]! IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3 vld1.64 {d4-d7}, [r1,:128]! @@ -376,7 +374,7 @@ vst1.32 {d2}, [r0,:64], r2 vst1.32 {d3}, [r0,:64], r2 bx lr -.endfunc +endfunc function x264_add8x8_idct_neon mov r2, #FDEC_STRIDE @@ -384,7 +382,7 @@ bl x264_add8x4_idct_neon mov lr, ip b x264_add8x4_idct_neon -.endfunc +endfunc function x264_add16x16_idct_neon mov r2, #FDEC_STRIDE @@ -401,7 +399,7 @@ bl x264_add8x4_idct_neon mov lr, ip b x264_add8x4_idct_neon -.endfunc +endfunc .macro IDCT8_1D type @@ -498,19 +496,19 @@ vst1.64 {d6}, [r0,:64], r2 vst1.64 {d7}, [r0,:64], r2 bx lr -.endfunc +endfunc function x264_add16x16_idct8_neon mov ip, lr - bl x264_add8x8_idct8_neon + bl X(x264_add8x8_idct8_neon) sub r0, r0, #8*FDEC_STRIDE-8 - bl x264_add8x8_idct8_neon + bl X(x264_add8x8_idct8_neon) sub r0, r0, #8 - bl x264_add8x8_idct8_neon + bl X(x264_add8x8_idct8_neon) sub r0, r0, #8*FDEC_STRIDE-8 mov lr, ip - b x264_add8x8_idct8_neon -.endfunc + b X(x264_add8x8_idct8_neon) +endfunc function x264_add8x8_idct_dc_neon @@ -562,7 +560,7 @@ vst1.64 {d6}, [r0,:64], r2 vst1.64 {d7}, [r0,:64], r2 bx lr -.endfunc +endfunc .macro ADD16x4_IDCT_DC dc vld1.64 {d16-d17}, [r0,:128], r3 @@ -610,7 +608,7 @@ ADD16x4_IDCT_DC d2 ADD16x4_IDCT_DC d3 bx lr -.endfunc +endfunc function x264_sub8x8_dct_dc_neon mov r3, #FENC_STRIDE @@ -658,7 +656,7 @@ vpadd.s16 d0, d0, d1 vst1.64 {d0}, [r0,:64] bx lr -.endfunc +endfunc function x264_zigzag_scan_4x4_frame_neon @@ -671,4 +669,4 @@ vtbl.8 d7, {d2-d3}, d19 vst1.64 {d4-d7}, [r0,:128] bx lr -.endfunc +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/deblock-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/deblock-a.S
Changed
@@ -25,8 +25,6 @@ #include "asm.S" -.fpu neon - .macro h264_loop_filter_start ldr ip, [sp] ldr ip, [ip] @@ -142,7 +140,7 @@ align_pop_regs bx lr -.endfunc +endfunc function x264_deblock_h_luma_neon h264_loop_filter_start @@ -194,7 +192,7 @@ align_pop_regs bx lr -.endfunc +endfunc .macro h264_loop_filter_chroma vdup.8 q11, r2 // alpha @@ -255,7 +253,7 @@ vst2.8 {d0, d1}, [r0,:128], r1 bx lr -.endfunc +endfunc function x264_deblock_h_chroma_neon h264_loop_filter_start @@ -303,4 +301,110 @@ vst1.8 {d3}, [r0], r1 bx lr -.endfunc +endfunc + +function x264_deblock_strength_neon + ldr ip, [sp] + vmov.i8 q8, #0 + lsl ip, ip, #8 + add r3, r3, #32 + sub ip, ip, #(1<<8)-3 + vmov.i8 q9, #0 + vdup.16 q10, ip + ldr ip, [sp, #4] + +lists: + @ load bytes ref + vld1.8 {d31}, [r1]! + add r2, r2, #16 + vld1.8 {q1}, [r1]! + vmov.i8 q0, #0 + vld1.8 {q2}, [r1]! + vext.8 q3, q0, q1, #15 + vext.8 q0, q0, q2, #15 + vuzp.32 q1, q2 + vuzp.32 q3, q0 + vext.8 q1, q15, q2, #12 + + veor q0, q0, q2 + veor q1, q1, q2 + vorr q8, q8, q0 + vorr q9, q9, q1 + + vld1.16 {q11}, [r2,:128]! @ mv + 0x10 + vld1.16 {q3}, [r2,:128]! @ mv + 0x20 + vld1.16 {q12}, [r2,:128]! @ mv + 0x30 + vld1.16 {q2}, [r2,:128]! @ mv + 0x40 + vld1.16 {q13}, [r2,:128]! @ mv + 0x50 + vext.8 q3, q3, q12, #12 + vext.8 q2, q2, q13, #12 + vabd.s16 q0, q12, q3 + vld1.16 {q3}, [r2,:128]! @ mv + 0x60 + vabd.s16 q1, q13, q2 + vld1.16 {q14}, [r2,:128]! @ mv + 0x70 + vqmovn.u16 d0, q0 + vld1.16 {q2}, [r2,:128]! @ mv + 0x80 + vld1.16 {q15}, [r2,:128]! @ mv + 0x90 + vqmovn.u16 d1, q1 + vext.8 q3, q3, q14, #12 + vext.8 q2, q2, q15, #12 + vabd.s16 q3, q14, q3 + vabd.s16 q2, q15, q2 + vqmovn.u16 d2, q3 + vqmovn.u16 d3, q2 + + vqsub.u8 q0, q0, q10 + vqsub.u8 q1, q1, q10 + vqmovn.u16 d0, q0 + vqmovn.u16 d1, q1 + + vabd.s16 q1, q12, q13 + vorr q8, q8, q0 + + vabd.s16 q0, q11, q12 + vabd.s16 q2, q13, q14 + vabd.s16 q3, q14, q15 + vqmovn.u16 d0, q0 + vqmovn.u16 d1, q1 + vqmovn.u16 d2, q2 + vqmovn.u16 d3, q3 + + vqsub.u8 q0, q0, q10 + vqsub.u8 q1, q1, q10 + vqmovn.u16 d0, q0 + vqmovn.u16 d1, q1 + subs ip, ip, #1 + vorr q9, q9, q0 + beq lists + + mov ip, #-32 + @ load bytes nnz + vld1.8 {d31}, [r0]! + vld1.8 {q1}, [r0]! + vmov.i8 q0, #0 + vld1.8 {q2}, [r0] + vext.8 q3, q0, q1, #15 + vext.8 q0, q0, q2, #15 + vuzp.32 q1, q2 + vuzp.32 q3, q0 + vext.8 q1, q15, q2, #12 + + vorr q0, q0, q2 + vorr q1, q1, q2 + vmov.u8 q10, #1 + vmin.u8 q0, q0, q10 + vmin.u8 q1, q1, q10 + vmin.u8 q8, q8, q10 @ mv ? 1 : 0 + vmin.u8 q9, q9, q10 + vadd.u8 q0, q0, q0 @ nnz ? 2 : 0 + vadd.u8 q1, q1, q1 + vmax.u8 q8, q8, q0 + vmax.u8 q9, q9, q1 + vzip.16 d16, d17 + vst1.8 {q9}, [r3,:128], ip @ bs[1] + vtrn.8 d16, d17 + vtrn.32 d16, d17 + + vst1.8 {q8}, [r3,:128] @ bs[0] + bx lr +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/mc-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/mc-a.S
Changed
@@ -27,7 +27,6 @@ #include "asm.S" -.fpu neon .text // note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8 @@ -50,7 +49,7 @@ pld [r3, r1, lsl #1] pld [r3, r2] bx lr -.endfunc +endfunc // void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y, // uint8_t *pix_uv, intptr_t stride_uv, int mb_x ) @@ -76,7 +75,7 @@ pld [ip] pld [ip, r3] pop {pc} -.endfunc +endfunc // void *x264_memcpy_aligned( void *dst, const void *src, size_t n ) @@ -85,10 +84,10 @@ movrel ip, memcpy_table and r3, r3, #0xc ldr pc, [ip, r3] -.endfunc +endfunc .macro MEMCPY_ALIGNED srcalign dstalign -function memcpy_aligned_\dstalign\()_\srcalign\()_neon +function memcpy_aligned_\dstalign\()_\srcalign\()_neon, export=0 mov r3, r0 .if \srcalign == 8 && \dstalign == 8 sub r2, #16 @@ -127,7 +126,7 @@ vst1.64 {d0}, [r3,:64]! .endif bx lr -.endfunc +endfunc .endm MEMCPY_ALIGNED 16, 16 @@ -156,7 +155,7 @@ .endr bgt memzero_loop bx lr -.endfunc +endfunc // void pixel_avg( uint8_t *dst, intptr_t dst_stride, @@ -175,12 +174,13 @@ cmp ip, #0 bge x264_pixel_avg_weight_w\w\()_add_add_neon b x264_pixel_avg_weight_w\w\()_sub_add_neon // weight < 0 -.endfunc +endfunc .endm AVGH 4, 2 AVGH 4, 4 AVGH 4, 8 +AVGH 4, 16 AVGH 8, 4 AVGH 8, 8 AVGH 8, 16 @@ -238,7 +238,7 @@ .endm .macro AVG_WEIGHT ext -function x264_pixel_avg_weight_w4_\ext\()_neon +function x264_pixel_avg_weight_w4_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #2 @@ -252,9 +252,9 @@ vst1.32 {d1[0]}, [r0,:32], r1 bgt 1b pop {r4-r6,pc} -.endfunc +endfunc -function x264_pixel_avg_weight_w8_\ext\()_neon +function x264_pixel_avg_weight_w8_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #4 @@ -276,9 +276,9 @@ vst1.64 {d3}, [r0,:64], r1 bgt 1b pop {r4-r6,pc} -.endfunc +endfunc -function x264_pixel_avg_weight_w16_\ext\()_neon +function x264_pixel_avg_weight_w16_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #2 @@ -296,14 +296,14 @@ vst1.64 {d2-d3}, [r0,:128], r1 bgt 1b pop {r4-r6,pc} -.endfunc +endfunc .endm AVG_WEIGHT add_add AVG_WEIGHT add_sub AVG_WEIGHT sub_add -function x264_pixel_avg_w4_neon +function x264_pixel_avg_w4_neon, export=0 subs lr, lr, #2 vld1.32 {d0[]}, [r2], r3 vld1.32 {d2[]}, [r4], r5 @@ -315,9 +315,9 @@ vst1.32 {d1[0]}, [r0,:32], r1 bgt x264_pixel_avg_w4_neon pop {r4-r6,pc} -.endfunc +endfunc -function x264_pixel_avg_w8_neon +function x264_pixel_avg_w8_neon, export=0 subs lr, lr, #4 vld1.64 {d0}, [r2], r3 vld1.64 {d2}, [r4], r5 @@ -337,9 +337,9 @@ vst1.64 {d3}, [r0,:64], r1 bgt x264_pixel_avg_w8_neon pop {r4-r6,pc} -.endfunc +endfunc -function x264_pixel_avg_w16_neon +function x264_pixel_avg_w16_neon, export=0 subs lr, lr, #4 vld1.64 {d0-d1}, [r2], r3 vld1.64 {d2-d3}, [r4], r5 @@ -359,7 +359,7 @@ vst1.64 {d6-d7}, [r0,:128], r1 bgt x264_pixel_avg_w16_neon pop {r4-r6,pc} -.endfunc +endfunc function x264_pixel_avg2_w4_neon @@ -378,7 +378,7 @@ vst1.32 {d1[0]}, [r0,:32], r1 bgt avg2_w4_loop pop {pc} -.endfunc +endfunc function x264_pixel_avg2_w8_neon ldr ip, [sp, #4] @@ -396,7 +396,7 @@ vst1.64 {d1}, [r0,:64], r1 bgt avg2_w8_loop pop {pc} -.endfunc +endfunc function x264_pixel_avg2_w16_neon ldr ip, [sp, #4] @@ -414,7 +414,7 @@ vst1.64 {d4-d5}, [r0,:128], r1 bgt avg2_w16_loop pop {pc} -.endfunc +endfunc function x264_pixel_avg2_w20_neon ldr ip, [sp, #4] @@ -437,7 +437,7 @@ vst1.32 {d6[0]}, [r0,:32], r1 bgt avg2_w20_loop pop {pc} -.endfunc +endfunc .macro weight_prologue type @@ -448,7 +448,7 @@ ldr lr, [r4, #32] // denom .endif ldrd r4, r5, [r4, #32+4] // scale, offset - vdup.16 q0, r4 + 
vdup.8 d0, r4 vdup.16 q1, r5 .ifc \type, full rsb lr, lr, #0
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/mc-c.c -> x264-snapshot-20141104-2245.tar.bz2/common/arm/mc-c.c
Changed
@@ -37,6 +37,7 @@
void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_8x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_4x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_4x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_4x2_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
@@ -46,13 +47,28 @@
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );

+void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
+                                        pixel *dstv, intptr_t i_dstv,
+                                        pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
+                                            pixel *dstb, intptr_t i_dstb,
+                                            pixel *dstc, intptr_t i_dstc,
+                                            pixel *src, intptr_t i_src, int pw, int w, int h );
+void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst,
+                                      pixel *srcu, intptr_t i_srcu,
+                                      pixel *srcv, intptr_t i_srcv, int w, int h );
+
+void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+
#define MC_WEIGHT(func)\
void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
\
-static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\
+static weight_fn_t x264_mc##func##_wtab_neon[6] =\
{\
    x264_mc_weight_w4##func##_neon,\
    x264_mc_weight_w4##func##_neon,\
@@ -72,7 +88,7 @@
void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );

-void x264_mc_chroma_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );

void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
@@ -224,11 +240,20 @@
    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_neon;

+    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
+    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
+    pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
+
+    pf->store_interleave_chroma = x264_store_interleave_chroma_neon;
+    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
+    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
+
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
+    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/pixel-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/pixel-a.S
Changed
@@ -25,14 +25,15 @@ #include "asm.S" -.fpu neon .section .rodata .align 4 -.rept 16 .byte 0xff +.rept 16 + .byte 0xff .endr mask_ff: -.rept 16 .byte 0 +.rept 16 + .byte 0 .endr mask_ac4: @@ -60,7 +61,7 @@ .endr usada8 r0, r6, lr, ip pop {r4-r6,pc} -.endfunc +endfunc .endm SAD4_ARMV6 4 @@ -137,7 +138,7 @@ vpaddl.u16 d0, d0 vmov.u32 r0, d0[0] bx lr -.endfunc +endfunc .endm SAD_FUNC 4, 4 @@ -222,7 +223,7 @@ vpaddl.u16 d0, d0 vmov.u32 r0, d0[0] bx lr -.endfunc +endfunc .endm SAD_FUNC_DUAL 8, 4 @@ -368,7 +369,7 @@ vst1.32 {d0-d1}, [r7] .endif pop {r6-r7,pc} -.endfunc +endfunc .endm SAD_X_FUNC 3, 4, 4 @@ -477,7 +478,7 @@ vpadd.s32 d0, d0, d0 vmov.32 r0, d0[0] bx lr -.endfunc +endfunc .endm SSD_FUNC 4, 4 @@ -517,7 +518,7 @@ vld1.64 {d26}, [r0,:64], r1 VAR_SQR_SUM q2, q10, q15, d26 b x264_var_end -.endfunc +endfunc function x264_pixel_var_8x16_neon vld1.64 {d16}, [r0,:64], r1 @@ -549,7 +550,7 @@ 2: VAR_SQR_SUM q2, q13, q15, d22 b x264_var_end -.endfunc +endfunc function x264_pixel_var_16x16_neon vld1.64 {d16-d17}, [r0,:128], r1 @@ -573,9 +574,9 @@ VAR_SQR_SUM q1, q12, q14, d18 VAR_SQR_SUM q2, q13, q15, d19 bgt var16_loop -.endfunc +endfunc -function x264_var_end +function x264_var_end, export=0 vpaddl.u16 q8, q14 vpaddl.u16 q9, q15 vadd.u32 q1, q1, q8 @@ -588,7 +589,7 @@ vmov r0, r1, d0 bx lr -.endfunc +endfunc .macro DIFF_SUM diff da db lastdiff vld1.64 {\da}, [r0,:64], r1 @@ -633,7 +634,7 @@ mul r0, r0, r0 sub r0, r1, r0, lsr #6 bx lr -.endfunc +endfunc function x264_pixel_var2_8x16_neon vld1.64 {d16}, [r0,:64], r1 @@ -677,7 +678,7 @@ mul r0, r0, r0 sub r0, r1, r0, lsr #7 bx lr -.endfunc +endfunc .macro LOAD_DIFF_8x4 q0 q1 q2 q3 vld1.32 {d1}, [r2], r3 @@ -714,7 +715,7 @@ HORIZ_ADD d0, d0, d1 vmov.32 r0, d0[0] bx lr -.endfunc +endfunc function x264_pixel_satd_4x8_neon vld1.32 {d1[]}, [r2], r3 @@ -741,7 +742,7 @@ vsubl.u8 q3, d6, d7 SUMSUB_AB q10, q11, q2, q3 b x264_satd_4x8_8x4_end_neon -.endfunc +endfunc function x264_pixel_satd_8x4_neon vld1.64 {d1}, [r2], r3 @@ -758,9 +759,9 @@ vld1.64 {d6}, [r0,:64], r1 vsubl.u8 q3, d6, d7 SUMSUB_AB q10, q11, q2, q3 -.endfunc +endfunc -function x264_satd_4x8_8x4_end_neon +function x264_satd_4x8_8x4_end_neon, export=0 vadd.s16 q0, q8, q10 vadd.s16 q1, q9, q11 vsub.s16 q2, q8, q10 @@ -785,7 +786,7 @@ HORIZ_ADD d0, d0, d1 vmov.32 r0, d0[0] bx lr -.endfunc +endfunc function x264_pixel_satd_8x8_neon mov ip, lr @@ -799,7 +800,7 @@ mov lr, ip vmov.32 r0, d0[0] bx lr -.endfunc +endfunc function x264_pixel_satd_8x16_neon vpush {d8-d11} @@ -821,9 +822,9 @@ mov lr, ip vmov.32 r0, d0[0] bx lr -.endfunc +endfunc -function x264_satd_8x8_neon +function x264_satd_8x8_neon, export=0 LOAD_DIFF_8x4 q8, q9, q10, q11 vld1.64 {d7}, [r2], r3 SUMSUB_AB q0, q1, q8, q9 @@ -841,10 +842,10 @@ SUMSUB_AB q9, q11, q1, q3 vld1.64 {d0}, [r0,:64], r1 vsubl.u8 q15, d0, d1 -.endfunc +endfunc // one vertical hadamard pass and two horizontal -function x264_satd_8x4v_8x8h_neon +function x264_satd_8x4v_8x8h_neon, export=0 SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15 vtrn.16 q8, q9 SUMSUB_AB q12, q14, q0, q2 @@ -870,7 +871,7 @@ vmax.s16 q14, q8, q10 vmax.s16 q15, q9, q11 bx lr -.endfunc +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/predict-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/predict-a.S
Changed
@@ -26,8 +26,6 @@ #include "asm.S" -.fpu neon - .section .rodata .align 4 @@ -77,7 +75,16 @@ add ip, ip, ip, lsl #16 str ip, [r0, #3*FDEC_STRIDE] bx lr -.endfunc +endfunc + +function x264_predict_4x4_v_armv6 + ldr r1, [r0, #0 - 1 * FDEC_STRIDE] + str r1, [r0, #0 + 0 * FDEC_STRIDE] + str r1, [r0, #0 + 1 * FDEC_STRIDE] + str r1, [r0, #0 + 2 * FDEC_STRIDE] + str r1, [r0, #0 + 3 * FDEC_STRIDE] + bx lr +endfunc function x264_predict_4x4_dc_armv6 mov ip, #0 @@ -100,7 +107,7 @@ str r1, [r0, #2*FDEC_STRIDE] str r1, [r0, #3*FDEC_STRIDE] bx lr -.endfunc +endfunc function x264_predict_4x4_dc_top_neon mov r12, #FDEC_STRIDE @@ -115,7 +122,7 @@ vst1.32 d1[0], [r0,:32], r12 vst1.32 d1[0], [r0,:32], r12 bx lr -.endfunc +endfunc // return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1 @@ -158,7 +165,7 @@ add r5, r5, r4, lsr #8 str r5, [r0, #3*FDEC_STRIDE] pop {r4-r6,pc} -.endfunc +endfunc function x264_predict_4x4_ddl_neon sub r0, #FDEC_STRIDE @@ -177,7 +184,7 @@ vst1.32 {d2[0]}, [r0,:32], ip vst1.32 {d3[0]}, [r0,:32], ip bx lr -.endfunc +endfunc function x264_predict_8x8_dc_neon mov ip, #0 @@ -201,7 +208,7 @@ vst1.64 {d0}, [r0,:64], ip .endr pop {r4-r5,pc} -.endfunc +endfunc function x264_predict_8x8_h_neon add r1, r1, #7 @@ -224,7 +231,7 @@ vst1.64 {d6}, [r0,:64], ip vst1.64 {d7}, [r0,:64], ip bx lr -.endfunc +endfunc function x264_predict_8x8_v_neon add r1, r1, #16 @@ -234,7 +241,7 @@ vst1.8 {d0}, [r0,:64], r12 .endr bx lr -.endfunc +endfunc function x264_predict_8x8_ddl_neon add r1, #16 @@ -262,7 +269,7 @@ vst1.8 d2, [r0,:64], r12 vst1.8 d1, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_ddr_neon vld1.8 {d0-d3}, [r1,:128] @@ -292,7 +299,7 @@ vst1.8 {d4}, [r0,:64], r12 vst1.8 {d5}, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_vl_neon add r1, #16 @@ -323,7 +330,7 @@ vst1.8 {d3}, [r0,:64], r12 vst1.8 {d2}, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_vr_neon add r1, #8 @@ -355,7 +362,7 @@ vst1.8 {d6}, [r0,:64], r12 vst1.8 {d3}, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_hd_neon mov r12, #FDEC_STRIDE @@ -388,7 +395,7 @@ vst1.8 {d16}, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_hu_neon mov r12, #FDEC_STRIDE @@ -421,7 +428,7 @@ vst1.8 {d7}, [r0,:64], r12 vst1.8 {d17}, [r0,:64] bx lr -.endfunc +endfunc function x264_predict_8x8c_dc_top_neon sub r2, r0, #FDEC_STRIDE @@ -434,7 +441,7 @@ vdup.8 d0, d0[0] vtrn.32 d0, d1 b pred8x8_dc_end -.endfunc +endfunc function x264_predict_8x8c_dc_left_neon mov r1, #FDEC_STRIDE @@ -446,7 +453,7 @@ vdup.8 d1, d0[1] vdup.8 d0, d0[0] b pred8x8_dc_end -.endfunc +endfunc function x264_predict_8x8c_dc_neon sub r2, r0, #FDEC_STRIDE @@ -472,7 +479,7 @@ vst1.8 {d1}, [r2,:64], r1 .endr bx lr -.endfunc +endfunc function x264_predict_8x8c_h_neon sub r1, r0, #1 @@ -484,7 +491,7 @@ vst1.64 {d2}, [r0,:64], ip .endr bx lr -.endfunc +endfunc function x264_predict_8x8c_v_neon sub r0, r0, #FDEC_STRIDE @@ -494,7 +501,7 @@ vst1.64 {d0}, [r0,:64], ip .endr bx lr -.endfunc +endfunc function x264_predict_8x8c_p_neon sub r3, r0, #FDEC_STRIDE @@ -547,7 +554,7 @@ subs r3, r3, #1 bne 1b bx lr -.endfunc +endfunc function x264_predict_16x16_dc_top_neon @@ -558,7 +565,7 @@ vrshrn.u16 d0, q0, #4
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/predict-c.c -> x264-snapshot-20141104-2245.tar.bz2/common/arm/predict-c.c
Changed
@@ -27,36 +27,6 @@
#include "predict.h"
#include "pixel.h"

-void x264_predict_4x4_dc_armv6( uint8_t *src );
-void x264_predict_4x4_dc_top_neon( uint8_t *src );
-void x264_predict_4x4_h_armv6( uint8_t *src );
-void x264_predict_4x4_ddr_armv6( uint8_t *src );
-void x264_predict_4x4_ddl_neon( uint8_t *src );
-
-void x264_predict_8x8c_dc_neon( uint8_t *src );
-void x264_predict_8x8c_dc_top_neon( uint8_t *src );
-void x264_predict_8x8c_dc_left_neon( uint8_t *src );
-void x264_predict_8x8c_h_neon( uint8_t *src );
-void x264_predict_8x8c_v_neon( uint8_t *src );
-void x264_predict_8x8c_p_neon( uint8_t *src );
-
-void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
-
-void x264_predict_16x16_dc_neon( uint8_t *src );
-void x264_predict_16x16_dc_top_neon( uint8_t *src );
-void x264_predict_16x16_dc_left_neon( uint8_t *src );
-void x264_predict_16x16_h_neon( uint8_t *src );
-void x264_predict_16x16_v_neon( uint8_t *src );
-void x264_predict_16x16_p_neon( uint8_t *src );
-
void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
{
    if (!(cpu&X264_CPU_ARMV6))
@@ -64,6 +34,7 @@

#if !HIGH_BIT_DEPTH
    pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
+    pf[I_PRED_4x4_V]   = x264_predict_4x4_v_armv6;
    pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/predict.h -> x264-snapshot-20141104-2245.tar.bz2/common/arm/predict.h
Changed
@@ -26,15 +26,36 @@
#ifndef X264_ARM_PREDICT_H
#define X264_ARM_PREDICT_H

-void x264_predict_8x8_v_neon( pixel *src, pixel edge[36] );
-void x264_predict_8x8_h_neon( pixel *src, pixel edge[36] );
-void x264_predict_8x8_dc_neon( pixel *src, pixel edge[36] );
-void x264_predict_8x8c_dc_neon( pixel *src );
-void x264_predict_8x8c_h_neon( pixel *src );
-void x264_predict_8x8c_v_neon( pixel *src );
-void x264_predict_16x16_v_neon( pixel *src );
-void x264_predict_16x16_h_neon( pixel *src );
-void x264_predict_16x16_dc_neon( pixel *src );
+void x264_predict_4x4_dc_armv6( uint8_t *src );
+void x264_predict_4x4_dc_top_neon( uint8_t *src );
+void x264_predict_4x4_v_armv6( uint8_t *src );
+void x264_predict_4x4_h_armv6( uint8_t *src );
+void x264_predict_4x4_ddr_armv6( uint8_t *src );
+void x264_predict_4x4_ddl_neon( uint8_t *src );
+
+void x264_predict_8x8c_dc_neon( uint8_t *src );
+void x264_predict_8x8c_dc_top_neon( uint8_t *src );
+void x264_predict_8x8c_dc_left_neon( uint8_t *src );
+void x264_predict_8x8c_h_neon( uint8_t *src );
+void x264_predict_8x8c_v_neon( uint8_t *src );
+void x264_predict_8x8c_p_neon( uint8_t *src );
+
+void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
+
+void x264_predict_16x16_dc_neon( uint8_t *src );
+void x264_predict_16x16_dc_top_neon( uint8_t *src );
+void x264_predict_16x16_dc_left_neon( uint8_t *src );
+void x264_predict_16x16_h_neon( uint8_t *src );
+void x264_predict_16x16_v_neon( uint8_t *src );
+void x264_predict_16x16_p_neon( uint8_t *src );

void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/quant-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/quant-a.S
Changed
@@ -25,8 +25,6 @@ #include "asm.S" -.fpu neon - .section .rodata .align 4 pmovmskb_byte: @@ -80,7 +78,7 @@ vsub.s16 d3, d3, d0 vst1.64 {d3}, [r0,:64] QUANT_END d3 -.endfunc +endfunc // quant_4x4_dc( int16_t dct[16], int mf, int bias ) function x264_quant_4x4_dc_neon @@ -92,7 +90,7 @@ QUANT_TWO q0, q0, d4, d5, d4, d5, q0 vorr d0, d0, d1 QUANT_END d0 -.endfunc +endfunc // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) function x264_quant_4x4_neon @@ -104,7 +102,7 @@ QUANT_TWO q0, q1, d4, d5, d6, d7, q0 vorr d0, d0, d1 QUANT_END d0 -.endfunc +endfunc // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ) function x264_quant_4x4x4_neon @@ -145,7 +143,7 @@ orrne r0, #8 vpop {d8-d15} bx lr -.endfunc +endfunc // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) function x264_quant_8x8_neon @@ -165,7 +163,7 @@ .endr vorr d0, d0, d1 QUANT_END d0 -.endfunc +endfunc .macro DEQUANT_START mf_size offset dc=no mov r3, #0x2b @@ -257,7 +255,7 @@ bgt dequant_\size\()_rshift_loop .endif bx lr -.endfunc +endfunc .endm DEQUANT 4x4, 4 @@ -307,7 +305,7 @@ vmovn.s32 d3, q13 vst1.16 {d0-d3}, [r0,:128] bx lr -.endfunc +endfunc // int coeff_last( int16_t *l ) @@ -319,7 +317,21 @@ lsrs r2, r2, #16 addne r0, r0, #1 bx lr -.endfunc +endfunc + +function x264_coeff_last8_arm + ldrd r2, r3, [r0, #8] + orrs ip, r2, r3 + movne r0, #4 + ldrdeq r2, r3, [r0] + moveq r0, #0 + tst r3, r3 + addne r0, #2 + movne r2, r3 + lsrs r2, r2, #16 + addne r0, r0, #1 + bx lr +endfunc .macro COEFF_LAST_1x size function x264_coeff_last\size\()_neon @@ -344,7 +356,7 @@ subslt r0, r3, r0, lsr #2 movlt r0, #0 bx lr -.endfunc +endfunc .endm COEFF_LAST_1x 15 @@ -393,4 +405,4 @@ subslt r0, ip, r0 movlt r0, #0 bx lr -.endfunc +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/quant.h -> x264-snapshot-20141104-2245.tar.bz2/common/arm/quant.h
Changed
@@ -39,6 +39,7 @@
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );

int x264_coeff_last4_arm( int16_t * );
+int x264_coeff_last8_arm( int16_t * );
int x264_coeff_last15_neon( int16_t * );
int x264_coeff_last16_neon( int16_t * );
int x264_coeff_last64_neon( int16_t * );
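coeff_last returns the position of the last nonzero coefficient in a scan-ordered block, which lets the entropy coders skip trailing zeros; the new coeff_last8_arm covers the 8-coefficient chroma DC case. A scalar reference (a sketch; note x264 reports index 0 for an all-zero block):

    #include <stdint.h>

    static int coeff_last( const int16_t *l, int count )
    {
        int i = count - 1;
        while( i > 0 && !l[i] )
            i--;
        return i;
    }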
View file
x264-snapshot-20140321-2245.tar.bz2/common/bitstream.c -> x264-snapshot-20141104-2245.tar.bz2/common/bitstream.c
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2003-2014 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/bitstream.h -> x264-snapshot-20141104-2245.tar.bz2/common/bitstream.h
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2003-2014 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/cabac.c -> x264-snapshot-20141104-2245.tar.bz2/common/cabac.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/common.h -> x264-snapshot-20141104-2245.tar.bz2/common/common.h
Changed
@@ -316,8 +316,8 @@
static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
{
-    int amvd0 = abs(mvdleft[0]) + abs(mvdtop[0]);
-    int amvd1 = abs(mvdleft[1]) + abs(mvdtop[1]);
+    int amvd0 = mvdleft[0] + mvdtop[0];
+    int amvd1 = mvdleft[1] + mvdtop[1];
    amvd0 = (amvd0 > 2) + (amvd0 > 32);
    amvd1 = (amvd1 > 2) + (amvd1 > 32);
    return amvd0 + (amvd1<<8);
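The abs() calls could be dropped here because the mvd values cached for CABAC context selection are stored as already-clamped absolute values in uint8_t, so they are never negative; all the function needs to do is bucket each component sum against the 2 and 32 thresholds. The bucketing, isolated (taken directly from the expression above):

    /* Map an absolute mvd sum to a CABAC context increment of 0, 1 or 2. */
    static inline int mvd_ctx( int amvd )
    {
        return (amvd > 2) + (amvd > 32);
    }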
View file
x264-snapshot-20140321-2245.tar.bz2/common/cpu.c -> x264-snapshot-20141104-2245.tar.bz2/common/cpu.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -89,6 +89,9 @@
    {"ARMv6",           X264_CPU_ARMV6},
    {"NEON",            X264_CPU_NEON},
    {"FastNeonMRC",     X264_CPU_FAST_NEON_MRC},
+#elif ARCH_AARCH64
+    {"ARMv8",           X264_CPU_ARMV8},
+    {"NEON",            X264_CPU_NEON},
#endif
    {"", 0},
};
@@ -338,6 +341,9 @@

uint32_t x264_cpu_detect( void )
{
+#ifdef __NO_FPRS__
+    return 0;
+#else
    static void (*oldsig)( int );

    oldsig = signal( SIGILL, sigill_handler );
@@ -357,6 +363,7 @@

    signal( SIGILL, oldsig );
    return X264_CPU_ALTIVEC;
+#endif
}
#endif

@@ -405,6 +412,13 @@
    return flags;
}

+#elif ARCH_AARCH64
+
+uint32_t x264_cpu_detect( void )
+{
+    return X264_CPU_ARMV8 | X264_CPU_NEON;
+}
+
#else

uint32_t x264_cpu_detect( void )
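On AArch64 the detection is unconditional because ARMv8-A mandates AdvSIMD (NEON), so callers simply test the returned mask. For example:

    #include <stdint.h>

    uint32_t x264_cpu_detect( void );  /* from common/cpu.c above */

    static int have_neon( void )
    {
        /* X264_CPU_NEON is the flag constant from x264's public header */
        return !!( x264_cpu_detect() & X264_CPU_NEON );
    }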
View file
x264-snapshot-20140321-2245.tar.bz2/common/dct.c -> x264-snapshot-20141104-2245.tar.bz2/common/dct.c
Changed
@@ -35,6 +35,9 @@
#if ARCH_ARM
#   include "arm/dct.h"
#endif
+#if ARCH_AARCH64
+#   include "aarch64/dct.h"
+#endif

/* the inverse of the scaling factors introduced by 8x8 fdct */
/* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
@@ -723,7 +726,7 @@
}
#endif

-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
@@ -999,10 +1002,10 @@
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
    }
#endif
-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
-#endif
+#endif // HAVE_ARMV6 || ARCH_AARCH64
#endif // HIGH_BIT_DEPTH

    pf_interlaced->interleave_8x8_cavlc =
View file
x264-snapshot-20140321-2245.tar.bz2/common/deblock.c -> x264-snapshot-20141104-2245.tar.bz2/common/deblock.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Henrik Gramner <henrik@gramner.com>
 *
 * This program is free software; you can redistribute it and/or modify
@@ -729,11 +729,14 @@
void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#endif // ARCH_PPC

-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                 int mvy_limit, int bframe );
#endif

void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
@@ -835,13 +838,14 @@
}
#endif // HAVE_ALTIVEC

-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf->deblock_luma[1] = x264_deblock_v_luma_neon;
        pf->deblock_luma[0] = x264_deblock_h_luma_neon;
        pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
+        pf->deblock_strength = x264_deblock_strength_neon;
    }
#endif
#endif // !HIGH_BIT_DEPTH
View file
x264-snapshot-20140321-2245.tar.bz2/common/frame.c -> x264-snapshot-20141104-2245.tar.bz2/common/frame.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/frame.h -> x264-snapshot-20141104-2245.tar.bz2/common/frame.h
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/macroblock.c -> x264-snapshot-20141104-2245.tar.bz2/common/macroblock.c
Changed
@@ -3,7 +3,7 @@
  *****************************************************************************
  * Copyright (C) 2003-2014 x264 project
  *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
  *          Henrik Gramner <henrik@gramner.com>
View file
x264-snapshot-20140321-2245.tar.bz2/common/macroblock.h -> x264-snapshot-20141104-2245.tar.bz2/common/macroblock.h
Changed
@@ -5,7 +5,7 @@
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/mc.c -> x264-snapshot-20141104-2245.tar.bz2/common/mc.c
Changed
@@ -35,6 +35,9 @@
 #if ARCH_ARM
 #include "arm/mc.h"
 #endif
+#if ARCH_AARCH64
+#include "aarch64/mc.h"
+#endif
 
 static inline void pixel_avg( pixel *dst,  intptr_t i_dst_stride,
@@ -641,6 +644,9 @@
 #if HAVE_ARMV6
     x264_mc_init_arm( cpu, pf );
 #endif
+#if ARCH_AARCH64
+    x264_mc_init_aarch64( cpu, pf );
+#endif
 
     if( cpu_independent )
     {
View file
x264-snapshot-20140321-2245.tar.bz2/common/mvpred.c -> x264-snapshot-20141104-2245.tar.bz2/common/mvpred.c
Changed
@@ -4,7 +4,7 @@
  * Copyright (C) 2003-2014 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *
  * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/opencl.c -> x264-snapshot-20141104-2245.tar.bz2/common/opencl.c
Changed
@@ -135,7 +135,8 @@
     rewind( fp );
     CHECKED_MALLOC( binary, size );
 
-    fread( binary, 1, size, fp );
+    if ( fread( binary, 1, size, fp ) != size )
+        goto fail;
 
     const uint8_t *ptr = (const uint8_t*)binary;
 
 #define CHECK_STRING( STR )\
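The opencl.c change is a short-read fix: fread() can return fewer bytes than requested on I/O error or truncation, and the old code went on to parse whatever happened to be in the buffer. A self-contained sketch of the checked pattern (the helper name is illustrative, not from x264):

#include <stdio.h>
#include <stdlib.h>

/* Read exactly 'size' bytes into a new buffer; fail cleanly on a short read. */
static void *read_exact( FILE *fp, size_t size )
{
    void *buf = malloc( size );
    if( !buf )
        return NULL;
    if( fread( buf, 1, size, fp ) != size )
    {
        free( buf );  /* short read: error or unexpected EOF */
        return NULL;
    }
    return buf;
}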
View file
x264-snapshot-20140321-2245.tar.bz2/common/osdep.h -> x264-snapshot-20141104-2245.tar.bz2/common/osdep.h
Changed
@@ -48,7 +48,7 @@
 #define log2(x) (log(x)/0.693147180559945)
 #endif
 
-#ifdef __ICL
+#ifdef _MSC_VER
 #define inline __inline
 #define strcasecmp _stricmp
 #define strncasecmp _strnicmp
@@ -57,10 +57,6 @@
 #define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
 #endif
 
-#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && (ARCH_X86 || ARCH_X86_64)
-#define HAVE_X86_INLINE_ASM 1
-#endif
-
 #if !defined(isfinite) && (SYS_OPENBSD || SYS_SunOS)
 #define isfinite finite
 #endif
@@ -89,7 +85,7 @@
 #define x264_is_pipe(x) 0
 #endif
 
-#ifdef __ICL
+#ifdef _MSC_VER
 #define DECLARE_ALIGNED( var, n ) __declspec(align(n)) var
 #else
 #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
@@ -156,7 +152,7 @@
 #define x264_constant_p(x) __builtin_constant_p(x)
 #define x264_nonconstant_p(x) (!__builtin_constant_p(x))
 #else
-#ifdef __ICL
+#ifdef _MSC_VER
 #define ALWAYS_INLINE __forceinline
 #define NOINLINE __declspec(noinline)
 #else
View file
x264-snapshot-20140321-2245.tar.bz2/common/pixel.c -> x264-snapshot-20141104-2245.tar.bz2/common/pixel.c
Changed
@@ -5,7 +5,7 @@
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -38,8 +38,9 @@
 #   include "arm/pixel.h"
 #   include "arm/predict.h"
 #endif
-#if ARCH_UltraSPARC
-#   include "sparc/pixel.h"
+#if ARCH_AARCH64
+#   include "aarch64/pixel.h"
+#   include "aarch64/predict.h"
 #endif
@@ -200,7 +201,7 @@
 #define PIXEL_VAR2_C( name, w, h, shift ) \
 static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \
 { \
-    uint32_t var = 0, sum = 0, sqr = 0; \
+    int var = 0, sum = 0, sqr = 0; \
     for( int y = 0; y < h; y++ ) \
     { \
         for( int x = 0; x < w; x++ ) \
@@ -212,8 +213,7 @@
         pix1 += i_stride1; \
         pix2 += i_stride2; \
     } \
-    sum = abs(sum); \
-    var = sqr - ((uint64_t)sum * sum >> shift); \
+    var = sqr - ((int64_t)sum * sum >> shift); \
     *ssd = sqr; \
     return var; \
 }
@@ -454,15 +454,6 @@
 SAD_X( 4x8 )
 SAD_X( 4x4 )
 
-#if !HIGH_BIT_DEPTH
-#if ARCH_UltraSPARC
-SAD_X( 16x16_vis )
-SAD_X( 16x8_vis )
-SAD_X( 8x16_vis )
-SAD_X( 8x8_vis )
-#endif
-#endif // !HIGH_BIT_DEPTH
-
 /****************************************************************************
  * pixel_satd_x4
  * no faster than single satd, but needed for satd to be a drop-in replacement for sad
@@ -509,7 +500,7 @@
 #endif
 
 #if !HIGH_BIT_DEPTH
-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
 SATD_X_DECL7( _neon )
 #endif
 #endif // !HIGH_BIT_DEPTH
@@ -533,7 +524,7 @@
 INTRA_MBCMP_8x8( sad, _mmx2,  _c )
 INTRA_MBCMP_8x8(sa8d, _sse2,  _sse2 )
 #endif
-#if !HIGH_BIT_DEPTH && HAVE_ARMV6
+#if !HIGH_BIT_DEPTH && (HAVE_ARMV6 || ARCH_AARCH64)
 INTRA_MBCMP_8x8( sad, _neon, _neon )
 INTRA_MBCMP_8x8(sa8d, _neon, _neon )
 #endif
@@ -593,8 +584,18 @@
 #endif
 #endif
 #if !HIGH_BIT_DEPTH && HAVE_ARMV6
-INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _neon, _c )
-INTRA_MBCMP(satd,  4x4,   v, h, dc,  , _neon, _c )
+INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _neon, _armv6 )
+INTRA_MBCMP(satd,  4x4,   v, h, dc,  , _neon, _armv6 )
+INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _neon, _neon )
+INTRA_MBCMP(satd,  8x8,  dc, h,  v, c, _neon, _neon )
+INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _neon, _c )
+INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _neon, _c )
+INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _neon, _neon )
+INTRA_MBCMP(satd, 16x16,  v, h, dc,  , _neon, _neon )
+#endif
+#if !HIGH_BIT_DEPTH && ARCH_AARCH64
+INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _neon, _neon )
+INTRA_MBCMP(satd,  4x4,   v, h, dc,  , _neon, _neon )
 INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _neon, _neon )
 INTRA_MBCMP(satd,  8x8,  dc, h,  v, c, _neon, _neon )
 INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _neon, _c )
@@ -1021,8 +1022,16 @@
     }
     if( cpu&X264_CPU_XOP )
     {
+        INIT5( sad_x3, _xop );
+        INIT5( sad_x4, _xop );
+        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_xop;
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
+        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
         pixf->vsad = x264_pixel_vsad_xop;
         pixf->asd8 = x264_pixel_asd8_xop;
+#if ARCH_X86_64
+        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
+#endif
     }
     if( cpu&X264_CPU_AVX2 )
     {
@@ -1308,6 +1317,7 @@
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
         pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_xop;
         pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
+        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_xop;
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
         pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_xop;
         pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
@@ -1394,6 +1404,46 @@
         }
     }
 #endif
+
+#if ARCH_AARCH64
+    if( cpu&X264_CPU_NEON )
+    {
+        INIT7( sad, _neon );
+        // AArch64 has no distinct instructions for aligned load/store
+        INIT7_NAME( sad_aligned, sad, _neon );
+        INIT7( sad_x3, _neon );
+        INIT7( sad_x4, _neon );
+        INIT7( ssd, _neon );
+        INIT7( satd, _neon );
+        INIT7( satd_x3, _neon );
+        INIT7( satd_x4, _neon );
+        INIT4( hadamard_ac, _neon );
+
+        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
+        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
+
+        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_neon;
+        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_neon;
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
+        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_neon;
+        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
+
+        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_neon;
+        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_neon;
+        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_neon;
+        pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8_neon;
+        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_neon;
+        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_neon;
+        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_neon;
+        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon;
+        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_neon;
+        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;
+
+        pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
+        pixf->ssim_end4       = x264_pixel_ssim_end4_neon;
+    }
+#endif // ARCH_AARCH64
+
 #endif // HIGH_BIT_DEPTH
@@ -1401,13 +1451,6 @@
 #if HAVE_ALTIVEC
     if( cpu&X264_CPU_ALTIVEC )
     {
         x264_pixel_altivec_init( pixf );
     }
 #endif
-#if !HIGH_BIT_DEPTH
-#if ARCH_UltraSPARC
-    INIT4( sad, _vis );
-    INIT4( sad_x3, _vis );
-    INIT4( sad_x4, _vis );
-#endif
-#endif // !HIGH_BIT_DEPTH
 
     pixf->ads[PIXEL_8x16] =
     pixf->ads[PIXEL_8x4] =
View file
x264-snapshot-20140321-2245.tar.bz2/common/pixel.h -> x264-snapshot-20141104-2245.tar.bz2/common/pixel.h
Changed
@@ -4,7 +4,7 @@
  * Copyright (C) 2004-2014 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Henrik Gramner <henrik@gramner.com>
  *
  * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/predict.c -> x264-snapshot-20141104-2245.tar.bz2/common/predict.c
Changed
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Henrik Gramner <henrik@gramner.com>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -40,6 +40,9 @@
 #if ARCH_ARM
 #   include "arm/predict.h"
 #endif
+#if ARCH_AARCH64
+#   include "aarch64/predict.h"
+#endif
 
 /****************************************************************************
  * 16x16 prediction for intra luma block
@@ -899,6 +902,10 @@
 #if HAVE_ARMV6
     x264_predict_16x16_init_arm( cpu, pf );
 #endif
+
+#if ARCH_AARCH64
+    x264_predict_16x16_init_aarch64( cpu, pf );
+#endif
 }
 
 void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
@@ -923,6 +930,10 @@
 #if HAVE_ARMV6
     x264_predict_8x8c_init_arm( cpu, pf );
 #endif
+
+#if ARCH_AARCH64
+    x264_predict_8x8c_init_aarch64( cpu, pf );
+#endif
 }
 
 void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
@@ -963,6 +974,10 @@
 #if HAVE_ARMV6
     x264_predict_8x8_init_arm( cpu, pf, predict_filter );
 #endif
+
+#if ARCH_AARCH64
+    x264_predict_8x8_init_aarch64( cpu, pf, predict_filter );
+#endif
 }
 
 void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
@@ -987,5 +1002,9 @@
 #if HAVE_ARMV6
     x264_predict_4x4_init_arm( cpu, pf );
 #endif
+
+#if ARCH_AARCH64
+    x264_predict_4x4_init_aarch64( cpu, pf );
+#endif
 }
View file
x264-snapshot-20140321-2245.tar.bz2/common/quant.c -> x264-snapshot-20141104-2245.tar.bz2/common/quant.c
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2005-2014 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Christian Heine <sennindemokrit@gmx.net>
 *          Henrik Gramner <henrik@gramner.com>
 *
@@ -37,6 +37,9 @@
 #if ARCH_ARM
 #   include "arm/quant.h"
 #endif
+#if ARCH_AARCH64
+#   include "aarch64/quant.h"
+#endif
 
 #define QUANT_ONE( coef, mf, f ) \
 { \
@@ -556,7 +559,6 @@
     {
 #if ARCH_X86
         pf->quant_4x4 = x264_quant_4x4_mmx;
-        pf->quant_4x4x4 = x264_quant_4x4x4_mmx;
         pf->quant_8x8 = x264_quant_8x8_mmx;
         pf->dequant_4x4 = x264_dequant_4x4_mmx;
         pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
@@ -725,8 +727,12 @@
 #if HAVE_ARMV6
     if( cpu&X264_CPU_ARMV6 )
+    {
         pf->coeff_last4 = x264_coeff_last4_arm;
-
+        pf->coeff_last8 = x264_coeff_last8_arm;
+    }
+#endif
+#if HAVE_ARMV6 || ARCH_AARCH64
     if( cpu&X264_CPU_NEON )
     {
         pf->quant_2x2_dc = x264_quant_2x2_dc_neon;
@@ -742,6 +748,13 @@
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
     }
 #endif
+#if ARCH_AARCH64
+    if( cpu&X264_CPU_ARMV8 )
+    {
+        pf->coeff_last4 = x264_coeff_last4_aarch64;
+        pf->coeff_last8 = x264_coeff_last8_aarch64;
+    }
+#endif
 #endif // HIGH_BIT_DEPTH
     pf->coeff_last[DCT_LUMA_DC]     = pf->coeff_last[DCT_CHROMAU_DC]  = pf->coeff_last[DCT_CHROMAV_DC] =
     pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4];
View file
x264-snapshot-20140321-2245.tar.bz2/common/quant.h -> x264-snapshot-20141104-2245.tar.bz2/common/quant.h
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2005-2014 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Christian Heine <sennindemokrit@gmx.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/rectangle.c -> x264-snapshot-20141104-2245.tar.bz2/common/rectangle.c
Changed
@@ -3,7 +3,7 @@
 *****************************************************************************
 * Copyright (C) 2010-2014 x264 project
 *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/rectangle.h -> x264-snapshot-20141104-2245.tar.bz2/common/rectangle.h
Changed
@@ -3,7 +3,7 @@
 *****************************************************************************
 * Copyright (C) 2003-2014 x264 project
 *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/vlc.c -> x264-snapshot-20141104-2245.tar.bz2/common/vlc.c
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2003-2014 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Henrik Gramner <henrik@gramner.com>
 *
 * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/bitstream-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/bitstream-a.asm
Changed
@@ -3,7 +3,7 @@
 ;*****************************************************************************
 ;* Copyright (C) 2010-2014 x264 project
 ;*
-;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+;* Authors: Fiona Glaser <fiona@x264.com>
 ;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/cabac-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/cabac-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2008-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Holger Lubitz <holger@lubitz.org>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/const-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/const-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2010-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/cpu-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/cpu-a.asm
Changed
@@ -5,7 +5,7 @@
 ;*
 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/dct-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/dct-a.asm
Changed
@@ -7,7 +7,7 @@
 ;*          Loren Merritt <lorenm@u.washington.edu>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Min Chen <chenm001.163.com>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/dct.h -> x264-snapshot-20141104-2245.tar.bz2/common/x86/dct.h
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/deblock-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/deblock-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2005-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/mc-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/mc-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2003-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Dylan Yudaken <dyudaken@gmail.com>
 ;*          Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/mc-a2.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/mc-a2.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2005-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Holger Lubitz <holger@lubitz.org>
 ;*          Mathieu Monnier <manao@melix.net>
 ;*          Oskar Arvidsson <oskar@irock.se>
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/mc-c.c -> x264-snapshot-20141104-2245.tar.bz2/common/x86/mc-c.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/pixel-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/pixel-a.asm
Changed
@@ -7,7 +7,7 @@
 ;*          Holger Lubitz <holger@lubitz.org>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Alex Izvorski <aizvorksi@gmail.com>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
@@ -561,10 +561,15 @@
     pshufhw m0, m0, q3120
     pshufhw m1, m1, q3120
 %endif
+%if cpuflag(xop)
+    pmadcswd m2, m0, m0, m2
+    pmadcswd m3, m1, m1, m3
+%else
     pmaddwd m0, m0
     pmaddwd m1, m1
     paddd   m2, m0
     paddd   m3, m1
+%endif
     add     r6, 2*mmsize
     jl .loopx
 %if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
@@ -657,10 +662,15 @@
     por     m0, m1
    psrlw   m2, m0, 8
     pand    m0, m5
+%if cpuflag(xop)
+    pmadcswd m4, m2, m2, m4
+    pmadcswd m3, m0, m0, m3
+%else
     pmaddwd m2, m2
     pmaddwd m0, m0
-    paddd   m3, m0
     paddd   m4, m2
+    paddd   m3, m0
+%endif
     add     r6, mmsize
     jl .loopx
 %if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
@@ -695,6 +705,8 @@
 SSD_NV12
 INIT_XMM avx
 SSD_NV12
+INIT_XMM xop
+SSD_NV12
 INIT_YMM avx2
 SSD_NV12
@@ -4677,12 +4689,13 @@
 ;-----------------------------------------------------------------------------
 ; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
 ;-----------------------------------------------------------------------------
-cglobal pixel_ssim_end4, 3,3,7
-    movdqa  m0, [r0+ 0]
-    movdqa  m1, [r0+16]
-    movdqa  m2, [r0+32]
-    movdqa  m3, [r0+48]
-    movdqa  m4, [r0+64]
+cglobal pixel_ssim_end4, 2,3
+    mov     r2d, r2m
+    mova    m0, [r0+ 0]
+    mova    m1, [r0+16]
+    mova    m2, [r0+32]
+    mova    m3, [r0+48]
+    mova    m4, [r0+64]
     paddd   m0, [r1+ 0]
     paddd   m1, [r1+16]
     paddd   m2, [r1+32]
@@ -4692,8 +4705,6 @@
     paddd   m1, m2
     paddd   m2, m3
     paddd   m3, m4
-    movdqa  m5, [ssim_c1]
-    movdqa  m6, [ssim_c2]
     TRANSPOSE4x4D 0, 1, 2, 3, 4
 
 ;   s1=m0, s2=m1, ss=m2, s12=m3
@@ -4702,20 +4713,21 @@
     cvtdq2ps m1, m1
     cvtdq2ps m2, m2
     cvtdq2ps m3, m3
+    mulps    m4, m0, m1   ; s1*s2
+    mulps    m0, m0       ; s1*s1
+    mulps    m1, m1       ; s2*s2
     mulps    m2, [pf_64]  ; ss*64
     mulps    m3, [pf_128] ; s12*128
-    movdqa   m4, m1
-    mulps    m4, m0       ; s1*s2
-    mulps    m1, m1       ; s2*s2
-    mulps    m0, m0       ; s1*s1
     addps    m4, m4       ; s1*s2*2
     addps    m0, m1       ; s1*s1 + s2*s2
     subps    m2, m0       ; vars
     subps    m3, m4       ; covar*2
-    addps    m4, m5       ; s1*s2*2 + ssim_c1
-    addps    m0, m5       ; s1*s1 + s2*s2 + ssim_c1
-    addps    m2, m6       ; vars + ssim_c2
-    addps    m3, m6       ; covar*2 + ssim_c2
+    movaps   m1, [ssim_c1]
+    addps    m4, m1       ; s1*s2*2 + ssim_c1
+    addps    m0, m1       ; s1*s1 + s2*s2 + ssim_c1
+    movaps   m1, [ssim_c2]
+    addps    m2, m1       ; vars + ssim_c2
+    addps    m3, m1       ; covar*2 + ssim_c2
 %else
     pmaddwd  m4, m1, m0   ; s1*s2
     pslld    m1, 16
@@ -4726,10 +4738,12 @@
     pslld    m2, 6
     psubd    m3, m4       ; covar*2
     psubd    m2, m0       ; vars
-    paddd    m0, m5
-    paddd    m4, m5
-    paddd    m3, m6
-    paddd    m2, m6
+    mova     m1, [ssim_c1]
+    paddd    m0, m1
+    paddd    m4, m1
+    mova     m1, [ssim_c2]
+    paddd    m3, m1
+    paddd    m2, m1
     cvtdq2ps m0, m0       ; (float)(s1*s1 + s2*s2 + ssim_c1)
     cvtdq2ps m4, m4       ; (float)(s1*s2*2 + ssim_c1)
     cvtdq2ps m3, m3       ; (float)(covar*2 + ssim_c2)
@@ -4742,20 +4756,31 @@
     cmp r2d, 4
     je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
     neg r2
+
 %ifdef PIC
     lea r3, [mask_ff + 16]
-    movdqu m1, [r3 + r2*4]
+    %xdefine %%mask r3
 %else
-    movdqu m1, [mask_ff + r2*4 + 16]
+    %xdefine %%mask mask_ff + 16
 %endif
-    pand m4, m1
+%if cpuflag(avx)
+    andps m4, [%%mask + r2*4]
+%else
+    movups m0, [%%mask + r2*4]
+    andps m4, m0
+%endif
+
 .skip:
     movhlps m0, m4
     addps   m0, m4
+%if cpuflag(ssse3)
    movshdup m4, m0
+%else
     pshuflw m4, m0, q0032
+%endif
     addss   m0, m4
 %if ARCH_X86_64 == 0
-    movd r0m, m0
+    movss r0m, m0
     fld dword r0m
 %endif
    RET
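The cpuflag(xop) paths above lean on pmadcswd, an XOP instruction that fuses pmaddwd with the accumulate that used to follow it, saving one paddd per iteration of the ssd_nv12 loops. A scalar model of what one 32-bit lane computes (saturation details elided; an illustration, not the ISA reference):

#include <stdint.h>

/* pmadcswd dst, a, b, acc per dword lane:
 * dst = a.lo*b.lo + a.hi*b.hi + acc  -- pmaddwd fused with paddd. */
static int32_t pmadcswd_lane( int16_t a_lo, int16_t a_hi,
                              int16_t b_lo, int16_t b_hi, int32_t acc )
{
    return (int32_t)a_lo * b_lo + (int32_t)a_hi * b_hi + acc;
}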
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/pixel.h -> x264-snapshot-20141104-2245.tar.bz2/common/x86/pixel.h
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -56,6 +56,7 @@
 DECL_X4( sad, sse2 )
 DECL_X4( sad, sse3 )
 DECL_X4( sad, ssse3 )
+DECL_X4( sad, xop )
 DECL_X4( sad, avx )
 DECL_X4( sad, avx2 )
 DECL_X1( ssd, mmx )
@@ -153,6 +154,9 @@
 void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1,
                                     pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );
+void x264_pixel_ssd_nv12_core_xop ( pixel *pixuv1, intptr_t stride1,
+                                    pixel *pixuv2, intptr_t stride2, int width,
+                                    int height, uint64_t *ssd_u, uint64_t *ssd_v );
 void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1,
                                     pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/predict-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/predict-a.asm
Changed
@@ -5,7 +5,7 @@
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Holger Lubitz <holger@lubitz.org>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/predict-c.c -> x264-snapshot-20141104-2245.tar.bz2/common/x86/predict-c.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/quant-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/quant-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2005-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Christian Heine <sennindemokrit@gmx.net>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*          Henrik Gramner <henrik@gramner.com>
@@ -292,14 +292,11 @@
     QUANT_4x4 0, 6
     QUANT_4x4 64, 7
     packssdw m6, m7
-    packssdw m5, m6
-    packssdw m5, m5 ; AA BB CC DD
-    packsswb m5, m5 ; A B C D
+    packssdw m5, m6 ; AAAA BBBB CCCC DDDD
     pxor     m4, m4
-    pcmpeqb  m5, m4
-    pmovmskb eax, m5
-    not      eax
-    and      eax, 0xf
+    pcmpeqd  m5, m4
+    movmskps eax, m5
+    xor      eax, 0xf
     RET
 %endmacro
@@ -444,16 +441,11 @@
     QUANT_4x4 64, 5
     QUANT_4x4 96, 6
     packssdw m5, m6
-    packssdw m4, m5
-%if mmsize == 16
-    packssdw m4, m4 ; AA BB CC DD
-%endif
-    packsswb m4, m4 ; A B C D
+    packssdw m4, m5 ; AAAA BBBB CCCC DDDD
     pxor     m3, m3
-    pcmpeqb  m4, m3
-    pmovmskb eax, m4
-    not      eax
-    and      eax, 0xf
+    pcmpeqd  m4, m3
+    movmskps eax, m4
+    xor      eax, 0xf
     RET
 %endmacro
@@ -464,7 +456,6 @@
 INIT_MMX mmx
 QUANT_AC quant_4x4, 4
 QUANT_AC quant_8x8, 16
-QUANT_4x4x4
 %endif
 
 INIT_XMM sse2
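The rewritten quant_4x4x4 epilogue returns a 4-bit mask, one bit per 4x4 block: each packed dword summarizes one block, pcmpeqd marks the all-zero ones, movmskps collects the four sign bits, and xor eax, 0xf inverts them so a set bit means "block has nonzero coefficients". The scalar equivalent, as a sketch:

#include <stdint.h>

/* Bit b of the result is set iff block b contains a nonzero coefficient,
 * mirroring the pcmpeqd/movmskps/xor 0xf sequence. */
static int quant_nonzero_mask( const int16_t dct[4][16] )
{
    int mask = 0;
    for( int b = 0; b < 4; b++ )
    {
        int nz = 0;
        for( int i = 0; i < 16; i++ )
            nz |= dct[b][i];
        mask |= (nz != 0) << b;
    }
    return mask;
}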
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/quant.h -> x264-snapshot-20141104-2245.tar.bz2/common/x86/quant.h
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2005-2014 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Christian Heine <sennindemokrit@gmx.net>
 *
 * This program is free software; you can redistribute it and/or modify
@@ -31,7 +31,6 @@
 int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias );
 int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_4x4x4_mmx( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
 int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
 int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/sad-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/sad-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2003-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Alex Izvorski <aizvorksi@gmail.com>
 ;*
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/sad16-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/sad16-a.asm
Changed
@@ -519,6 +519,19 @@
 SAD_X 4,  8, 16
 SAD_X 4,  8,  8
 SAD_X 4,  8,  4
+INIT_XMM xop
+%define XMM_REGS 7
+SAD_X 3, 16, 16
+SAD_X 3, 16,  8
+SAD_X 3,  8, 16
+SAD_X 3,  8,  8
+SAD_X 3,  8,  4
+%define XMM_REGS 9
+SAD_X 4, 16, 16
+SAD_X 4, 16,  8
+SAD_X 4,  8, 16
+SAD_X 4,  8,  8
+SAD_X 4,  8,  4
 INIT_YMM avx2
 %define XMM_REGS 7
 SAD_X 3, 16, 16
@@ -533,7 +546,12 @@
 
 %macro INTRA_SAD_X3_4x4 0
 cglobal intra_sad_x3_4x4, 3,3,7
+%if cpuflag(ssse3)
     movddup   m0, [r1-1*FDEC_STRIDEB]
+%else
+    movq      m0, [r1-1*FDEC_STRIDEB]
+    punpcklqdq m0, m0
+%endif
     movq      m1, [r0+0*FENC_STRIDEB]
     movq      m2, [r0+2*FENC_STRIDEB]
     pshuflw   m6, m0, q1032
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/util.h -> x264-snapshot-20141104-2245.tar.bz2/common/x86/util.h
Changed
@@ -3,7 +3,7 @@
 *****************************************************************************
 * Copyright (C) 2008-2014 x264 project
 *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/x86inc.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/x86inc.asm
Changed
@@ -5,7 +5,7 @@
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Anton Mitrofanov <BugMaster@narod.ru>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* Permission to use, copy, modify, and/or distribute this software for any
@@ -90,9 +90,6 @@
     default rel
 %endif
 
-; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
-CPU amdnop
-
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
 ; into registers at the start, and make no other use of the stack. Luckily that
@@ -756,19 +753,26 @@
 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
 %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
 
-; Takes up to 2 cpuflags from the above list.
+; Takes an arbitrary number of cpuflags from the above list.
 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
-%macro INIT_CPUFLAGS 0-2
-    CPU amdnop
+%macro INIT_CPUFLAGS 0-*
+    %xdefine SUFFIX
+    %undef cpuname
+    %assign cpuflags 0
+
     %if %0 >= 1
-        %xdefine cpuname %1
-        %assign cpuflags cpuflags_%1
-        %if %0 >= 2
-            %xdefine cpuname %1_%2
-            %assign cpuflags cpuflags | cpuflags_%2
-        %endif
+        %rep %0
+            %ifdef cpuname
+                %xdefine cpuname cpuname %+ _%1
+            %else
+                %xdefine cpuname %1
+            %endif
+            %assign cpuflags cpuflags | cpuflags_%1
+            %rotate 1
+        %endrep
         %xdefine SUFFIX _ %+ cpuname
+
         %if cpuflag(avx)
             %assign avx_enabled 1
         %endif
@@ -779,16 +783,15 @@
         %endif
         %if cpuflag(aligned)
             %define movu mova
-        %elifidn %1, sse3
+        %elif cpuflag(sse3) && notcpuflag(ssse3)
             %define movu lddqu
         %endif
-        %if ARCH_X86_64 == 0 && notcpuflag(sse2)
-            CPU basicnop
-        %endif
+    %endif
+
+    %if ARCH_X86_64 || cpuflag(sse2)
+        CPU amdnop
     %else
-        %xdefine SUFFIX
-        %undef cpuname
-        %undef cpuflags
+        CPU basicnop
     %endif
 %endmacro
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/x86util.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/x86util.asm
Changed
@@ -298,11 +298,16 @@
     paddd   %1, %2
 %endif
 %if mmsize >= 16
+%if cpuflag(xop) && sizeof%1 == 16
+    vphadddq %1, %1
+%endif
     movhlps %2, %1
     paddd   %1, %2
 %endif
+%if notcpuflag(xop) || sizeof%1 != 16
     PSHUFLW %2, %1, q0032
     paddd   %1, %2
+%endif
     %undef %1
     %undef %2
 %endmacro
View file
x264-snapshot-20140321-2245.tar.bz2/configure -> x264-snapshot-20141104-2245.tar.bz2/configure
Changed
@@ -73,32 +73,36 @@
     echo "$1" >> config.log
 }
 
-intel_cflags() {
-    # Intel Compiler issues an incredibly large number of warnings on any warning level,
+cc_cflags() {
+    # several non gcc compilers issue an incredibly large number of warnings on any warning level,
     # suppress them by disabling all warnings rather than having to use #pragmas to disable most of them
     for arg in $*; do
         [ $arg = -ffast-math ] && arg=
         [[ "$arg" = -falign-loops* ]] && arg=
         [ "$arg" = -fno-tree-vectorize ] && arg=
         [ "$arg" = -Wshadow ] && arg=
+        [ "$arg" = -Wno-maybe-uninitialized ] && arg=
         [[ "$arg" = -mpreferred-stack-boundary* ]] && arg=
         [[ "$arg" = -l* ]] && arg=
        [[ "$arg" = -L* ]] && arg=
-        if [ $compiler = ICL ]; then
+        if [ $compiler_style = MS ]; then
             [ "$arg" = -Wall ] && arg=-W0
+            [ "$arg" = -Werror ] && arg="-W3 -WX"
             [ "$arg" = -g ] && arg=-Z7
             [ "$arg" = -fomit-frame-pointer ] && arg=
             [ "$arg" = -s ] && arg=
             [ "$arg" = -fPIC ] && arg=
         else
             [ "$arg" = -Wall ] && arg=-w0
+            [ "$arg" = -Werror ] && arg="-w3 -Werror"
         fi
+        [ $compiler = CL -a "$arg" = -O3 ] && arg=-O2
 
         [ -n "$arg" ] && echo -n "$arg "
     done
 }
 
-icl_ldflags() {
+cl_ldflags() {
     for arg in $*; do
         arg=${arg/LIBPATH/libpath}
         [ ${arg#-libpath:} == $arg -a ${arg#-l} != $arg ] && arg=${arg#-l}.lib
@@ -106,6 +110,11 @@
         [ $arg = -Wl,--large-address-aware ] && arg=-largeaddressaware
         [ $arg = -s ] && arg=
         [ "$arg" = -Wl,-Bsymbolic ] && arg=
+        [ "$arg" = -fno-tree-vectorize ] && arg=
+        [ "$arg" = -Werror ] && arg=
+        [ "$arg" = -Wshadow ] && arg=
+        [ "$arg" = -Wmaybe-uninitialized ] && arg=
+        [[ "$arg" = -Qdiag-error* ]] && arg=
 
         arg=${arg/pthreadGC/pthreadVC}
         [ "$arg" = avifil32.lib ] && arg=vfw32.lib
@@ -135,11 +144,11 @@
     fi
     rm -f conftest.c
     [ -n "$1" ] && echo "#include <$1>" > conftest.c
-    echo "int main () { $3 return 0; }" >> conftest.c
-    if [ $compiler = ICL ]; then
-        cc_cmd="$CC conftest.c $(intel_cflags $CFLAGS $2) -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)"
+    echo "int main (void) { $3 return 0; }" >> conftest.c
+    if [ $compiler_style = MS ]; then
+        cc_cmd="$CC conftest.c $(cc_cflags $CFLAGS $CHECK_CFLAGS $2) -link $(cl_ldflags $2 $LDFLAGSCLI $LDFLAGS)"
     else
-        cc_cmd="$CC conftest.c $CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest"
+        cc_cmd="$CC conftest.c $CFLAGS $CHECK_CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest"
     fi
     if $cc_cmd >conftest.log 2>&1; then
         res=$?
@@ -165,8 +174,12 @@
     rm -f conftest.c
     [ -n "$1" ] && echo "#include <$1>" > conftest.c
     echo -e "#if !($3) \n#error $4 \n#endif " >> conftest.c
-
-    if $CC conftest.c $CFLAGS $2 -E -o conftest >conftest.log 2>&1; then
+    if [ $compiler_style = MS ]; then
+        cpp_cmd="$CC conftest.c $(cc_cflags $CFLAGS $2) -P"
+    else
+        cpp_cmd="$CC conftest.c $CFLAGS $2 -E -o conftest"
+    fi
+    if $cpp_cmd >conftest.log 2>&1; then
        res=$?
        log_ok
     else
@@ -185,8 +198,9 @@
 
 as_check() {
     log_check "whether $AS supports $1"
-    echo "$1" > conftest.asm
-    if $AS conftest.asm $ASFLAGS $2 -o conftest.o >conftest.log 2>&1; then
+    echo "$1" > conftest$AS_EXT
+    as_cmd="$AS conftest$AS_EXT $ASFLAGS $2 -o conftest.o"
+    if $as_cmd >conftest.log 2>&1; then
        res=$?
        log_ok
     else
@@ -194,12 +208,12 @@
        log_fail
        log_msg "Failed commandline was:"
        log_msg "--------------------------------------------------"
-        log_msg "$AS conftest.asm $ASFLAGS $2 -o conftest.o"
+        log_msg "$as_cmd"
        cat conftest.log >> config.log
        log_msg "--------------------------------------------------"
        log_msg "Failed program was:"
        log_msg "--------------------------------------------------"
-        cat conftest.asm >> config.log
+        cat conftest$AS_EXT >> config.log
        log_msg "--------------------------------------------------"
     fi
     return $res
@@ -208,10 +222,10 @@
 rc_check() {
     log_check "whether $RC works"
     echo "$1" > conftest.rc
-    if [ $compiler = ICL ]; then
-        rc_cmd="$RC $RCFLAGS -foconftest.o conftest.rc"
-    else
+    if [ $compiler = GNU ]; then
         rc_cmd="$RC $RCFLAGS -o conftest.o conftest.rc"
+    else
+        rc_cmd="$RC $RCFLAGS -foconftest.o conftest.rc"
     fi
     if $rc_cmd >conftest.log 2>&1; then
        res=$?
@@ -278,21 +292,26 @@
 bit_depth="8"
 chroma_format="all"
 compiler="GNU"
+compiler_style="GNU"
 opencl="yes"
 
 CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)"
 LDFLAGS="$LDFLAGS"
 LDFLAGSCLI="$LDFLAGSCLI"
-ASFLAGS="$ASFLAGS"
+ASFLAGS="$ASFLAGS -I. -I\$(SRCPATH)"
 RCFLAGS="$RCFLAGS"
+CHECK_CFLAGS=""
 HAVE_GETOPT_LONG=1
 cross_prefix=""
 
 EXE=""
+AS_EXT=".S"
+NL="
+"
 
 # list of all preprocessor HAVE values we can define
 CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \
-             LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH"
+             LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC"
 
 # parse options
@@ -439,23 +458,44 @@
 host_vendor="${host%%-*}"
 host_os="${host#*-}"
 
-# test for use of Intel Compiler
+# test for use of compilers that require specific handling
+cc_base=`basename "$CC"`
+QPRE="-"
 if [[ $host_os = mingw* || $host_os = cygwin* ]]; then
-    if [[ `basename "$CC"` = icl* ]]; then
+    if [[ "$cc_base" = icl || "$cc_base" = icl[\ .]* ]]; then
         # Windows Intel Compiler creates dependency generation with absolute Windows paths, Cygwin's make does not support Windows paths.
        [[ $host_os = cygwin* ]] && die "Windows Intel Compiler support requires MSYS"
        compiler=ICL
+        compiler_style=MS
        CFLAGS="$CFLAGS -Qstd=c99 -nologo -Qms0 -DHAVE_STRING_H -I\$(SRCPATH)/extras"
        QPRE="-Q"
        `$CC 2>&1 | grep -q IA-32` && host_cpu=i486
        `$CC 2>&1 | grep -q "Intel(R) 64"` && host_cpu=x86_64
        cpp_check "" "" "_MSC_VER >= 1400" || die "Windows Intel Compiler support requires Visual Studio 2005 or newer"
+        if cc_check '' -Qdiag-error:10006,10157 ; then
+            CHECK_CFLAGS="$CHECK_CFLAGS -Qdiag-error:10006,10157"
+        fi
+    elif [[ "$cc_base" = cl || "$cc_base" = cl[\ .]* ]]; then
+        # Standard Microsoft Visual Studio
+        # Dependency creation includes absolute windows paths, Cygwin's make does not support Windows paths.
+        [[ $host_os = cygwin* ]] && die "Microsoft Visual Studio support requires MSYS"
+        compiler=CL
+        compiler_style=MS
+        CFLAGS="$CFLAGS -nologo -DHAVE_STRING_H -I\$(SRCPATH)/extras"
+        `$CC 2>&1 | grep -q 'for x86'` && host_cpu=i486
+        `$CC 2>&1 | grep -q 'for x64'` && host_cpu=x86_64
+        cpp_check '' '' '_MSC_VER > 1800 || (_MSC_VER == 1800 && _MSC_FULL_VER >= 180030324)' || die "Microsoft Visual Studio support requires Visual Studio 2013 Update 2 or newer"
     fi
 else
-    if [[ `basename "$CC"` = icc* ]]; then
+    if [[ "$cc_base" = icc || "$cc_base" = icc[\ .]* ]]; then
        AR="xiar"
        compiler=ICC
-        QPRE="-"
     fi
 fi
+
+if [[ "$cc_base" = clang || "$cc_base" = clang[\ .]* ]]; then
+    if cc_check '' -Werror=unknown-warning-option ; then
+        CHECK_CFLAGS="$CHECK_CFLAGS -Werror=unknown-warning-option"
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/analyse.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/analyse.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/cabac.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/cabac.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/cavlc.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/cavlc.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/encoder.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/encoder.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -97,11 +97,14 @@
         int cw = h->param.i_width>>1;
         int ch = h->param.i_height>>CHROMA_V_SHIFT;
         pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) );
-        pixel *planev = planeu + cw*ch + 16;
-        h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
-        fwrite( planeu, 1, cw*ch*sizeof(pixel), f );
-        fwrite( planev, 1, cw*ch*sizeof(pixel), f );
-        x264_free( planeu );
+        if( planeu )
+        {
+            pixel *planev = planeu + cw*ch + 16;
+            h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
+            fwrite( planeu, 1, cw*ch*sizeof(pixel), f );
+            fwrite( planev, 1, cw*ch*sizeof(pixel), f );
+            x264_free( planeu );
+        }
     }
     fclose( f );
 }
@@ -412,6 +415,12 @@
 static int x264_validate_parameters( x264_t *h, int b_open )
 {
+    if( !h->param.pf_log )
+    {
+        x264_log( NULL, X264_LOG_ERROR, "pf_log not set! did you forget to call x264_param_default?\n" );
+        return -1;
+    }
+
 #if HAVE_MMX
     if( b_open )
     {
@@ -818,6 +827,8 @@
         /* 8x8dct is not useful without RD in CAVLC lossless */
         if( !h->param.b_cabac && h->param.analyse.i_subpel_refine < 6 )
             h->param.analyse.b_transform_8x8 = 0;
+        h->param.analyse.inter &= ~X264_ANALYSE_I8x8;
+        h->param.analyse.intra &= ~X264_ANALYSE_I8x8;
     }
     if( h->param.rc.i_rc_method == X264_RC_CQP )
     {
@@ -1403,7 +1414,11 @@
     /* Init x264_t */
     h->i_frame = -1;
     h->i_frame_num = 0;
-    h->i_idr_pic_id = 0;
+
+    if( h->param.i_avcintra_class )
+        h->i_idr_pic_id = 5;
+    else
+        h->i_idr_pic_id = 0;
 
     if( (uint64_t)h->param.i_timebase_den * 2 > UINT32_MAX )
     {
@@ -2154,6 +2169,31 @@
             h->fref[1][h->i_ref[1]++] = h->frames.reference[i];
     }
 
+    if( h->sh.i_mmco_remove_from_end )
+    {
+        /* Order ref0 for MMCO remove */
+        do
+        {
+            b_ok = 1;
+            for( int i = 0; i < h->i_ref[0] - 1; i++ )
+            {
+                if( h->fref[0][i]->i_frame < h->fref[0][i+1]->i_frame )
+                {
+                    XCHG( x264_frame_t*, h->fref[0][i], h->fref[0][i+1] );
+                    b_ok = 0;
+                    break;
+                }
+            }
+        } while( !b_ok );
+
+        for( int i = h->i_ref[0]-1; i >= h->i_ref[0] - h->sh.i_mmco_remove_from_end; i-- )
+        {
+            int diff = h->i_frame_num - h->fref[0][i]->i_frame_num;
+            h->sh.mmco[h->sh.i_mmco_command_count].i_poc = h->fref[0][i]->i_poc;
+            h->sh.mmco[h->sh.i_mmco_command_count++].i_difference_of_pic_nums = diff;
+        }
+    }
+
     /* Order reference lists by distance from the current frame. */
     for( int list = 0; list < 2; list++ )
     {
@@ -2176,14 +2216,6 @@
         } while( !b_ok );
     }
 
-    if( h->sh.i_mmco_remove_from_end )
-        for( int i = h->i_ref[0]-1; i >= h->i_ref[0] - h->sh.i_mmco_remove_from_end; i-- )
-        {
-            int diff = h->i_frame_num - h->fref[0][i]->i_frame_num;
-            h->sh.mmco[h->sh.i_mmco_command_count].i_poc = h->fref[0][i]->i_poc;
-            h->sh.mmco[h->sh.i_mmco_command_count++].i_difference_of_pic_nums = diff;
-        }
-
     x264_reference_check_reorder( h );
 
     h->i_ref[1] = X264_MIN( h->i_ref[1], h->frames.i_max_ref1 );
@@ -2438,7 +2470,24 @@
         x264_slice_header_init( h, &h->sh, h->sps, h->pps, h->i_idr_pic_id, h->i_frame_num, i_global_qp );
 
         /* alternate id */
-        h->i_idr_pic_id ^= 1;
+        if( h->param.i_avcintra_class )
+        {
+            switch( h->i_idr_pic_id )
+            {
+                case 5:
+                    h->i_idr_pic_id = 3;
+                    break;
+                case 3:
+                    h->i_idr_pic_id = 4;
+                    break;
+                case 4:
+                default:
+                    h->i_idr_pic_id = 5;
+                    break;
+            }
+        }
+        else
+            h->i_idr_pic_id ^= 1;
     }
     else
     {
@@ -3539,15 +3588,15 @@
                 return -1;
             overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
         }
+    }
 
-        if( h->param.i_frame_packing >= 0 )
-        {
-            x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-            x264_sei_frame_packing_write( h, &h->out.bs );
-            if( x264_nal_end( h ) )
-                return -1;
-            overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
-        }
+    if( h->param.i_frame_packing >= 0 && (h->fenc->b_keyframe || h->param.i_frame_packing == 5) )
+    {
+        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
+        x264_sei_frame_packing_write( h, &h->out.bs );
+        if( x264_nal_end( h ) )
+            return -1;
+        overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD;
     }
 
     /* generate sei pic timing */
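For AVC-Intra output the encoder.c hunks pin idr_pic_id to the fixed 5 -> 3 -> 4 -> 5 rotation used by the format's reference streams, instead of the usual 0/1 toggle. The cycle, extracted into a small sketch (values taken directly from the hunk; the helper name is illustrative):

/* AVC-Intra walks idr_pic_id through 5, 3, 4, 5, ...;
 * ordinary streams just alternate 0 and 1. */
static int next_idr_pic_id( int cur, int avcintra_class )
{
    if( !avcintra_class )
        return cur ^ 1;
    switch( cur )
    {
        case 5:  return 3;
        case 3:  return 4;
        default: return 5; /* case 4, or any reset */
    }
}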
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/macroblock.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/macroblock.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Henrik Gramner <henrik@gramner.com>
 *
 * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/me.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/me.c
Changed
@@ -5,7 +5,7 @@
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/ratecontrol.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/ratecontrol.c
Changed
@@ -6,7 +6,7 @@
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Michael Niedermayer <michaelni@gmx.at>
 *          Gabriel Bouvigne <gabriel.bouvigne@joost.com>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *          Måns Rullgård <mru@mru.ath.cx>
 *
 * This program is free software; you can redistribute it and/or modify
@@ -158,7 +158,7 @@
     double frame_size_maximum;  /* Maximum frame size due to MinCR */
     double frame_size_planned;
     double slice_size_planned;
-    predictor_t (*row_pred)[2];
+    predictor_t *row_pred;
     predictor_t row_preds[3][2];
     predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
     int bframes;                /* # consecutive B-frames before this P-frame */
@@ -1418,7 +1418,7 @@
         memset( h->fdec->i_row_bits, 0, h->mb.i_mb_height * sizeof(int) );
         memset( h->fdec->f_row_qp, 0, h->mb.i_mb_height * sizeof(float) );
         memset( h->fdec->f_row_qscale, 0, h->mb.i_mb_height * sizeof(float) );
-        rc->row_pred = &rc->row_preds[h->sh.i_type];
+        rc->row_pred = rc->row_preds[h->sh.i_type];
         rc->buffer_rate = h->fenc->i_cpb_duration * rc->vbv_max_rate * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
         update_vbv_plan( h, overhead );
@@ -1504,7 +1504,7 @@
     /* average between two predictors:
      * absolute SATD, and scaled bit cost of the colocated row in the previous frame */
     x264_ratecontrol_t *rc = h->rc;
-    float pred_s = predict_size( rc->row_pred[0], qscale, h->fdec->i_row_satd[y] );
+    float pred_s = predict_size( &rc->row_pred[0], qscale, h->fdec->i_row_satd[y] );
     if( h->sh.i_type == SLICE_TYPE_I || qscale >= h->fref[0][0]->f_row_qscale[y] )
     {
         if( h->sh.i_type == SLICE_TYPE_P
@@ -1522,7 +1522,7 @@
     /* Our QP is lower than the reference! */
     else
     {
-        float pred_intra = predict_size( rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y] );
+        float pred_intra = predict_size( &rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y] );
         /* Sum: better to overestimate than underestimate by using only one of the two predictors. */
         return pred_intra + pred_s;
     }
@@ -1570,9 +1570,9 @@
     h->fdec->f_row_qp[y] = rc->qpm;
     h->fdec->f_row_qscale[y] = qscale;
 
-    update_predictor( rc->row_pred[0], qscale, h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
+    update_predictor( &rc->row_pred[0], qscale, h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
     if( h->sh.i_type == SLICE_TYPE_P && rc->qpm < h->fref[0][0]->f_row_qp[y] )
-        update_predictor( rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] );
+        update_predictor( &rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] );
 
     /* update ratecontrol per-mbpair in MBAFF */
     if( SLICE_MBAFF && !(y&1) )
@@ -2612,7 +2612,7 @@
         x264_t *t = h->thread[i];
         if( t != h )
             memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) );
-        t->rc->row_pred = &t->rc->row_preds[h->sh.i_type];
+        t->rc->row_pred = t->rc->row_preds[h->sh.i_type];
         /* Calculate the planned slice size. */
         if( rc->b_vbv && rc->frame_size_planned )
         {
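The row_pred retyping is a pointer-arithmetic fix. With the old predictor_t (*row_pred)[2], indexing row_pred[1] strides by a whole pair and decays to the first predictor of the next slice type's pair; with a plain predictor_t * aimed at one pair, &row_pred[1] is the second predictor of that same pair, which is what the intra-predictor path intends. A compilable illustration of the difference (types simplified):

#include <stdio.h>

typedef struct { float coeff, count; } predictor_t;

int main( void )
{
    predictor_t preds[3][2];
    int type = 0;

    predictor_t (*as_pair)[2] = &preds[type]; /* old declaration */
    predictor_t *as_elem      = preds[type];  /* new declaration */

    /* pointer-to-array indexing strides by sizeof(predictor_t[2]) */
    printf( "as_pair[1] is preds[%d][0]: %d\n", type+1,
            as_pair[1] == &preds[type+1][0] );
    /* element-pointer indexing stays inside the selected pair */
    printf( "&as_elem[1] is preds[%d][1]: %d\n", type,
            &as_elem[1] == &preds[type][1] );
    return 0;
}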
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/rdo.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/rdo.c
Changed
@@ -4,7 +4,7 @@
 * Copyright (C) 2005-2014 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/set.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/set.c
Changed
@@ -675,7 +675,9 @@
         bs_write( &q, 4, 0 ); // frame1_grid_position_y
     }
     bs_write( &q, 8, 0 ); // frame_packing_arrangement_reserved_byte
-    bs_write_ue( &q, 1 ); // frame_packing_arrangement_repetition_period
+    // "frame_packing_arrangement_repetition_period equal to 1 specifies that the frame packing arrangement SEI message persists in output"
+    // for (i_frame_packing == 5) this will undermine current_frame_is_frame0_flag which must alternate every view sequence
+    bs_write_ue( &q, h->param.i_frame_packing != 5 ); // frame_packing_arrangement_repetition_period
     bs_write1( &q, 0 ); // frame_packing_arrangement_extension_flag
 
     bs_align_10( &q );
@@ -740,11 +742,15 @@
         data[20] = 0x13;
         /* These bytes appear to be some sort of frame/seconds counter in certain applications,
          * but others jump around, so leave them as zero for now */
-        data[21] = data[22] = 0;
-
+        data[22] = data[23] = data[25] = data[26] = 0;
         data[28] = 0x14;
+        data[30] = data[31] = data[33] = data[34] = 0;
         data[36] = 0x60;
         data[41] = 0x22; /* Believed to be some sort of end of basic UMID identifier */
+        data[60] = 0x62;
+        data[62] = data[63] = data[65] = data[66] = 0;
+        data[68] = 0x63;
+        data[70] = data[71] = data[73] = data[74] = 0;
 
         x264_sei_write( &h->out.bs, data, len, SEI_USER_DATA_UNREGISTERED );
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/slicetype.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/slicetype.c
Changed
@@ -3,7 +3,7 @@
 *****************************************************************************
 * Copyright (C) 2005-2014 x264 project
 *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Dylan Yudaken <dyudaken@gmail.com>
 *
View file
x264-snapshot-20140321-2245.tar.bz2/filters/filters.c -> x264-snapshot-20141104-2245.tar.bz2/filters/filters.c
Changed
@@ -38,13 +38,13 @@
     if( sep_count == 0 )
     {
         if( string[0] == '\0' )
-            return calloc( 1, sizeof( char** ) );
-        char **ret = calloc( 2, sizeof( char** ) );
+            return calloc( 1, sizeof( char* ) );
+        char **ret = calloc( 2, sizeof( char* ) );
         ret[0] = strdup( string );
         return ret;
     }
 
-    char **split = calloc( ( limit > 0 ? limit : sep_count ) + 2, sizeof(char**) );
+    char **split = calloc( ( limit > 0 ? limit : sep_count ) + 2, sizeof(char*) );
     int i = 0;
     char *str = strdup( string );
     assert( str );
@@ -104,7 +104,7 @@
     while( options[options_count] != NULL )
         ++options_count;
 
-    char **opts = calloc( split_count * 2 + 2, sizeof( char ** ) );
+    char **opts = calloc( split_count * 2 + 2, sizeof( char * ) );
     char **arg = NULL;
     int opt = 0, found_named = 0, invalid = 0;
     for( int i = 0; split[i] != NULL; i++, invalid = 0 )
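The filters.c hunks fix the element size passed to calloc: the arrays hold char * elements, so sizeof(char*) is the correct unit; sizeof(char**) happens to match on typical platforms, which is why the old code never misbehaved in practice. The sizeof(*ptr) idiom sidesteps this class of mistake entirely; a short sketch (helper name illustrative):

#include <stdlib.h>

/* Allocate an array of n string pointers. sizeof(*ret) derives the element
 * size from the pointer's own type, so it cannot drift out of sync. */
static char **alloc_string_array( size_t n )
{
    char **ret = calloc( n, sizeof(*ret) );
    return ret;
}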
View file
x264-snapshot-20140321-2245.tar.bz2/filters/video/select_every.c -> x264-snapshot-20141104-2245.tar.bz2/filters/video/select_every.c
Changed
@@ -51,7 +51,7 @@
     printf( "            apply a selection pattern to input frames\n"
             "            step: the number of frames in the pattern\n"
             "            offsets: the offset into the step to select a frame\n"
-            "            see: http://avisynth.org/mediawiki/Select#SelectEvery\n" );
+            "            see: http://avisynth.nl/index.php/Select#SelectEvery\n" );
 }
 
 static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
View file
x264-snapshot-20140321-2245.tar.bz2/input/avs.c -> x264-snapshot-20141104-2245.tar.bz2/input/avs.c
Changed
@@ -298,7 +298,10 @@
             opt->input_range = opt->output_range;
         }
         const char *arg_name[] = { NULL, "interlaced", "matrix" };
-        AVS_Value arg_arr[] = { res, avs_new_value_bool( info->interlaced ), avs_new_value_string( matrix ) };
+        AVS_Value arg_arr[3];
+        arg_arr[0] = res;
+        arg_arr[1] = avs_new_value_bool( info->interlaced );
+        arg_arr[2] = avs_new_value_string( matrix );
         AVS_Value res2 = h->func.avs_invoke( h->env, conv_func, avs_new_value_array( arg_arr, arg_count ), arg_name );
         FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert input clip to %s\n", csp )
         res = update_clip( h, &vi, res2, res );
@@ -308,7 +311,9 @@
     {
         const char *levels = opt->output_range ? "TV->PC" : "PC->TV";
         x264_cli_log( "avs", X264_LOG_WARNING, "performing %s conversion\n", levels );
-        AVS_Value arg_arr[] = { res, avs_new_value_string( levels ) };
+        AVS_Value arg_arr[2];
+        arg_arr[0] = res;
+        arg_arr[1] = avs_new_value_string( levels );
         const char *arg_name[] = { NULL, "levels" };
         AVS_Value res2 = h->func.avs_invoke( h->env, "ColorYUV", avs_new_value_array( arg_arr, 2 ), arg_name );
         FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert range: %s\n", avs_as_error( res2 ) )
View file
x264-snapshot-20140321-2245.tar.bz2/input/ffms.c -> x264-snapshot-20141104-2245.tar.bz2/input/ffms.c
Changed
@@ -177,8 +177,9 @@
 
 static int picture_alloc( cli_pic_t *pic, int csp, int width, int height )
 {
-    if( x264_cli_pic_alloc( pic, csp, width, height ) )
+    if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) )
         return -1;
+    pic->img.csp = csp;
     pic->img.planes = 4;
     return 0;
 }
View file
x264-snapshot-20140321-2245.tar.bz2/input/lavf.c -> x264-snapshot-20141104-2245.tar.bz2/input/lavf.c
Changed
@@ -42,12 +42,6 @@
     cli_pic_t *first_pic;
 } lavf_hnd_t;
 
-#define x264_free_packet( pkt )\
-{\
-    av_free_packet( pkt );\
-    av_init_packet( pkt );\
-}
-
 /* handle the deprecated jpeg pixel formats */
 static int handle_jpeg( int csp, int *fullrange )
 {
@@ -70,9 +64,7 @@
         {
             XCHG( cli_image_t, p_pic->img, h->first_pic->img );
             p_pic->pts = h->first_pic->pts;
-            XCHG( void*, p_pic->opaque, h->first_pic->opaque );
         }
-        lavf_input.release_frame( h->first_pic, NULL );
         lavf_input.picture_clean( h->first_pic );
         free( h->first_pic );
         h->first_pic = NULL;
@@ -81,9 +73,11 @@
     }
 
     AVCodecContext *c = h->lavf->streams[h->stream_id]->codec;
-    AVPacket *pkt = p_pic->opaque;
 
-    avcodec_get_frame_defaults( h->frame );
+    AVPacket pkt;
+    av_init_packet( &pkt );
+    pkt.data = NULL;
+    pkt.size = 0;
 
     while( i_frame >= h->next_frame )
     {
@@ -91,20 +85,23 @@
         int ret = 0;
         do
         {
-            ret = av_read_frame( h->lavf, pkt );
+            ret = av_read_frame( h->lavf, &pkt );
 
-            if( pkt->stream_index == h->stream_id )
+            if( ret < 0 )
             {
-                if( ret < 0 )
-                    pkt->size = 0;
+                av_init_packet( &pkt );
+                pkt.data = NULL;
+                pkt.size = 0;
+            }
 
-                c->reordered_opaque = pkt->pts;
-                if( avcodec_decode_video2( c, h->frame, &finished, pkt ) < 0 )
+            if( ret < 0 || pkt.stream_index == h->stream_id )
+            {
+                if( avcodec_decode_video2( c, h->frame, &finished, &pkt ) < 0 )
                     x264_cli_log( "lavf", X264_LOG_WARNING, "video decoding failed on frame %d\n", h->next_frame );
             }
-            /* if the packet successfully decoded but the data from it is not desired, free it */
-            else if( ret >= 0 )
-                x264_free_packet( pkt );
+
+            if( ret >= 0 )
+                av_free_packet( &pkt );
         } while( !finished && ret >= 0 );
 
         if( !finished )
@@ -130,10 +127,10 @@
     if( h->vfr_input )
     {
         p_pic->pts = p_pic->duration = 0;
-        if( c->has_b_frames && h->frame->reordered_opaque != AV_NOPTS_VALUE )
-            p_pic->pts = h->frame->reordered_opaque;
-        else if( pkt->dts != AV_NOPTS_VALUE )
-            p_pic->pts = pkt->dts; // for AVI files
+        if( h->frame->pkt_pts != AV_NOPTS_VALUE )
+            p_pic->pts = h->frame->pkt_pts;
+        else if( h->frame->pkt_dts != AV_NOPTS_VALUE )
+            p_pic->pts = h->frame->pkt_dts; // for AVI files
         else if( info )
         {
            h->vfr_input = info->vfr = 0;
@@ -153,7 +150,7 @@
     if( !strcmp( psz_filename, "-" ) )
         psz_filename = "pipe:";
 
-    h->frame = avcodec_alloc_frame();
+    h->frame = av_frame_alloc();
     if( !h->frame )
         return -1;
 
@@ -220,13 +217,10 @@
 
 static int picture_alloc( cli_pic_t *pic, int csp, int width, int height )
 {
-    if( x264_cli_pic_alloc( pic, csp, width, height ) )
+    if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) )
         return -1;
+    pic->img.csp = csp;
     pic->img.planes = 4;
-    pic->opaque = malloc( sizeof(AVPacket) );
-    if( !pic->opaque )
-        return -1;
-    av_init_packet( pic->opaque );
     return 0;
 }
 
@@ -235,15 +229,8 @@
     return read_frame_internal( pic, handle, i_frame, NULL );
 }
 
-static int release_frame( cli_pic_t *pic, hnd_t handle )
-{
-    x264_free_packet( pic->opaque );
-    return 0;
-}
-
 static void picture_clean( cli_pic_t *pic )
 {
-    free( pic->opaque );
     memset( pic, 0, sizeof(cli_pic_t) );
 }
 
@@ -252,13 +239,9 @@
     lavf_hnd_t *h = handle;
     avcodec_close( h->lavf->streams[h->stream_id]->codec );
     avformat_close_input( &h->lavf );
-#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(54, 28, 0)
-    avcodec_free_frame( &h->frame );
-#else
-    av_freep( &h->frame );
-#endif
+    av_frame_free( &h->frame );
     free( h );
     return 0;
 }
 
-const cli_input_t lavf_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file };
+const cli_input_t lavf_input = { open_file, picture_alloc, read_frame, NULL, picture_clean, close_file };
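The lavf.c rewrite drops the per-picture heap AVPacket and the deprecated avcodec_alloc_frame()/avcodec_get_frame_defaults() in favor of a stack packet plus av_frame_alloc()/av_frame_free(), and takes timestamps from the decoder-filled pkt_pts/pkt_dts fields instead of round-tripping reordered_opaque. A condensed sketch of that loop shape against the libavcodec API of this vintage (pre send/receive; simplified from the diff, not a drop-in replacement):

#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>

/* Decode the next frame of 'stream_id'; at EOF, feed empty packets to
 * drain the decoder's delayed frames. Returns 0 once a frame is out. */
static int decode_one( AVFormatContext *fmt, AVCodecContext *c,
                       AVFrame *frame, int stream_id )
{
    AVPacket pkt;
    int finished = 0, ret;
    av_init_packet( &pkt );
    pkt.data = NULL;
    pkt.size = 0;

    do
    {
        ret = av_read_frame( fmt, &pkt );
        if( ret < 0 )
        {   /* EOF: an empty packet flushes buffered frames */
            av_init_packet( &pkt );
            pkt.data = NULL;
            pkt.size = 0;
        }
        if( ret < 0 || pkt.stream_index == stream_id )
            avcodec_decode_video2( c, frame, &finished, &pkt );
        if( ret >= 0 )
            av_free_packet( &pkt );
    } while( !finished && ret >= 0 );

    return finished ? 0 : -1;
}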
View file
x264-snapshot-20140321-2245.tar.bz2/input/thread.c -> x264-snapshot-20141104-2245.tar.bz2/input/thread.c
Changed
@@ -88,7 +88,11 @@
     if( h->next_frame == i_frame )
         XCHG( cli_pic_t, *p_pic, h->pic );
     else
+    {
+        if( h->next_frame >= 0 )
+            thread_input.release_frame( &h->pic, handle );
         ret |= h->input.read_frame( p_pic, h->p_handle, i_frame );
+    }
 
     if( !h->frame_total || i_frame+1 < h->frame_total )
     {
View file
x264-snapshot-20140321-2245.tar.bz2/output/mp4_lsmash.c -> x264-snapshot-20141104-2245.tar.bz2/output/mp4_lsmash.c
Changed
@@ -79,6 +79,7 @@
     int i_dts_compress_multiplier;
     int b_use_recovery;
     int b_fragments;
+    lsmash_file_parameters_t file_param;
 } mp4_hnd_t;
 
 /*******************/
@@ -88,16 +89,10 @@
     mp4_hnd_t *p_mp4 = handle;
     if( !p_mp4 )
         return;
 
-    if( p_mp4->p_sei_buffer )
-    {
-        free( p_mp4->p_sei_buffer );
-        p_mp4->p_sei_buffer = NULL;
-    }
-    if( p_mp4->p_root )
-    {
-        lsmash_destroy_root( p_mp4->p_root );
-        p_mp4->p_root = NULL;
-    }
+    lsmash_cleanup_summary( (lsmash_summary_t *)p_mp4->summary );
+    lsmash_close_file( &p_mp4->file_param );
+    lsmash_destroy_root( p_mp4->p_root );
+    free( p_mp4->p_sei_buffer );
     free( p_mp4 );
 }
@@ -181,9 +176,13 @@
     p_mp4->b_fragments = !b_regular;
     p_mp4->b_stdout = !strcmp( psz_filename, "-" );
 
-    p_mp4->p_root = lsmash_open_movie( psz_filename, p_mp4->b_fragments ? LSMASH_FILE_MODE_WRITE_FRAGMENTED : LSMASH_FILE_MODE_WRITE );
+    p_mp4->p_root = lsmash_create_root();
     MP4_FAIL_IF_ERR_EX( !p_mp4->p_root, "failed to create root.\n" );
 
+    MP4_FAIL_IF_ERR_EX( lsmash_open_file( psz_filename, 0, &p_mp4->file_param ) < 0, "failed to open an output file.\n" );
+    if( p_mp4->b_fragments )
+        p_mp4->file_param.mode |= LSMASH_FILE_MODE_FRAGMENTED;
+
     p_mp4->summary = (lsmash_video_summary_t *)lsmash_create_summary( LSMASH_SUMMARY_TYPE_VIDEO );
     MP4_FAIL_IF_ERR_EX( !p_mp4->summary, "failed to allocate memory for summary information of video.\n" );
 
@@ -219,12 +218,17 @@
         brands[brand_count++] = ISOM_BRAND_TYPE_ISO6; /* cslg and visual random access grouping */
     }
 
+    /* Set file */
+    lsmash_file_parameters_t *file_param = &p_mp4->file_param;
+    file_param->major_brand   = brands[0];
+    file_param->brands        = brands;
+    file_param->brand_count   = brand_count;
+    file_param->minor_version = 0;
+    MP4_FAIL_IF_ERR( !lsmash_set_file( p_mp4->p_root, file_param ), "failed to add an output file into a ROOT.\n" );
+
     /* Set movie parameters. */
     lsmash_movie_parameters_t movie_param;
     lsmash_initialize_movie_parameters( &movie_param );
-    movie_param.major_brand      = ISOM_BRAND_TYPE_MP42;
-    movie_param.brands           = brands;
-    movie_param.number_of_brands = brand_count;
     MP4_FAIL_IF_ERR( lsmash_set_movie_parameters( p_mp4->p_root, &movie_param ), "failed to set movie parameters.\n" );
 
     p_mp4->i_movie_timescale = lsmash_get_movie_timescale( p_mp4->p_root );
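The mp4_lsmash.c hunks track L-SMASH's move from the one-shot lsmash_open_movie() to an explicit file abstraction: a ROOT is created first, the output file is opened into an lsmash_file_parameters_t, the brands are set on those file parameters (they used to live on the movie parameters), and the file is then attached to the ROOT with lsmash_set_file(). A minimal sketch of that open/close sequence under the post-change API shown in the diff; open_mp4() and its two-entry brand list are illustrative assumptions, not code from the patch:

/* Sketch of the new L-SMASH open/close sequence; summary, track and sample
 * handling are omitted. */
#include <lsmash.h>

static int open_mp4( const char *filename, int fragmented )
{
    lsmash_root_t *root = lsmash_create_root();
    if( !root )
        return -1;

    lsmash_file_parameters_t file_param;
    if( lsmash_open_file( filename, 0, &file_param ) < 0 ) /* 0 = open for output, as in the diff */
    {
        lsmash_destroy_root( root );
        return -1;
    }
    if( fragmented )
        file_param.mode |= LSMASH_FILE_MODE_FRAGMENTED;

    /* Brands now belong to the file parameters, not the movie parameters. */
    static lsmash_brand_type brands[] = { ISOM_BRAND_TYPE_MP42, ISOM_BRAND_TYPE_ISOM };
    file_param.major_brand   = brands[0];
    file_param.brands        = brands;
    file_param.brand_count   = sizeof(brands) / sizeof(brands[0]);
    file_param.minor_version = 0;

    if( !lsmash_set_file( root, &file_param ) ) /* attach the file to the ROOT */
    {
        lsmash_close_file( &file_param );
        lsmash_destroy_root( root );
        return -1;
    }

    /* ... set movie parameters, add tracks, write samples ... */

    lsmash_close_file( &file_param );
    lsmash_destroy_root( root );
    return 0;
}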
View file
x264-snapshot-20140321-2245.tar.bz2/tools/checkasm.c -> x264-snapshot-20141104-2245.tar.bz2/tools/checkasm.c
Changed
@@ -5,7 +5,7 @@
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -90,11 +90,11 @@
 {
     uint32_t a = 0;
 #if HAVE_X86_INLINE_ASM
-    asm volatile( "rdtsc" :"=a"(a) ::"edx" );
+    asm volatile( "rdtsc" : "=a"(a) :: "edx", "memory" );
 #elif ARCH_PPC
-    asm volatile( "mftb %0" : "=r" (a) );
+    asm volatile( "mftb %0" : "=r"(a) :: "memory" );
 #elif ARCH_ARM // ARMv7 only
-    asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) );
+    asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" );
 #endif
     return a;
 }
@@ -184,6 +184,9 @@
 #elif ARCH_ARM
     b->cpu&X264_CPU_NEON ? "neon" :
     b->cpu&X264_CPU_ARMV6 ? "armv6" :
+#elif ARCH_AARCH64
+    b->cpu&X264_CPU_NEON ? "neon" :
+    b->cpu&X264_CPU_ARMV8 ? "armv8" :
 #endif
     "c",
 #if HAVE_MMX
@@ -728,11 +731,14 @@
         fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
     }
     set_func_name( "ssim_core" );
-    call_c2( pixel_c.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
-    call_a2( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
+    call_c( pixel_c.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
+    call_a( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
     set_func_name( "ssim_end" );
     call_c2( pixel_c.ssim_end4, sums, sums, 4 );
     call_a2( pixel_asm.ssim_end4, sums, sums, 4 );
+    /* check incorrect assumptions that 32-bit ints are zero-extended to 64-bit */
+    call_c1( pixel_c.ssim_end4, sums, sums, 3 );
+    call_a1( pixel_asm.ssim_end4, sums, sums, 3 );
     report( "ssim :" );
 }
 
@@ -1097,6 +1103,7 @@
         TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, dct8[0], 8 );
         TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 4 );
         TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
+        TEST_ZIGZAG_SUB( sub_8x8, level1, level2, 64 );
         TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
         report( interlace ? "zigzag_field :" : "zigzag_frame :" );
     }
@@ -2624,8 +2631,9 @@
 {
     int ret = 0;
     int cpu0 = 0, cpu1 = 0;
+    uint32_t cpu_detect = x264_cpu_detect();
 #if HAVE_MMX
-    if( x264_cpu_detect() & X264_CPU_MMX2 )
+    if( cpu_detect & X264_CPU_MMX2 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMX2, "MMX" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" );
@@ -2634,7 +2642,7 @@
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
         cpu1 &= ~X264_CPU_CACHELINE_32;
 #endif
-        if( x264_cpu_detect() & X264_CPU_LZCNT )
+        if( cpu_detect & X264_CPU_LZCNT )
         {
             ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
             cpu1 &= ~X264_CPU_LZCNT;
@@ -2642,9 +2650,9 @@
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
         cpu1 &= ~X264_CPU_SLOW_CTZ;
     }
-    if( x264_cpu_detect() & X264_CPU_SSE )
+    if( cpu_detect & X264_CPU_SSE )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" );
-    if( x264_cpu_detect() & X264_CPU_SSE2 )
+    if( cpu_detect & X264_CPU_SSE2 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
@@ -2655,17 +2663,17 @@
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
         cpu1 &= ~X264_CPU_SLOW_CTZ;
     }
-    if( x264_cpu_detect() & X264_CPU_LZCNT )
+    if( cpu_detect & X264_CPU_LZCNT )
    {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
         cpu1 &= ~X264_CPU_LZCNT;
     }
-    if( x264_cpu_detect() & X264_CPU_SSE3 )
+    if( cpu_detect & X264_CPU_SSE3 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
         cpu1 &= ~X264_CPU_CACHELINE_64;
     }
-    if( x264_cpu_detect() & X264_CPU_SSSE3 )
+    if( cpu_detect & X264_CPU_SSSE3 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
@@ -2679,54 +2687,59 @@
         cpu1 &= ~X264_CPU_CACHELINE_64;
         cpu1 &= ~X264_CPU_SLOW_ATOM;
     }
-    if( x264_cpu_detect() & X264_CPU_SSE4 )
+    if( cpu_detect & X264_CPU_SSE4 )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
-    if( x264_cpu_detect() & X264_CPU_AVX )
+    if( cpu_detect & X264_CPU_AVX )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
-    if( x264_cpu_detect() & X264_CPU_XOP )
+    if( cpu_detect & X264_CPU_XOP )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
-    if( x264_cpu_detect() & X264_CPU_FMA4 )
+    if( cpu_detect & X264_CPU_FMA4 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
         cpu1 &= ~X264_CPU_FMA4;
     }
-    if( x264_cpu_detect() & X264_CPU_BMI1 )
+    if( cpu_detect & X264_CPU_BMI1 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
         cpu1 &= ~X264_CPU_BMI1;
     }
-    if( x264_cpu_detect() & X264_CPU_AVX2 )
+    if( cpu_detect & X264_CPU_AVX2 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
-        if( x264_cpu_detect() & X264_CPU_LZCNT )
+        if( cpu_detect & X264_CPU_LZCNT )
         {
             ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" );
             cpu1 &= ~X264_CPU_LZCNT;
         }
     }
-    if( x264_cpu_detect() & X264_CPU_BMI2 )
+    if( cpu_detect & X264_CPU_BMI2 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );
         cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2);
     }
-    if( x264_cpu_detect() & X264_CPU_FMA3 )
+    if( cpu_detect & X264_CPU_FMA3 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
         cpu1 &= ~X264_CPU_FMA3;
     }
 #elif ARCH_PPC
-    if( x264_cpu_detect() & X264_CPU_ALTIVEC )
+    if( cpu_detect & X264_CPU_ALTIVEC )
     {
         fprintf( stderr, "x264: ALTIVEC against C\n" );
         ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
     }
 #elif ARCH_ARM
-    if( x264_cpu_detect() & X264_CPU_ARMV6 )
+    if( cpu_detect & X264_CPU_ARMV6 )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
-    if( x264_cpu_detect() & X264_CPU_NEON )
+    if( cpu_detect & X264_CPU_NEON )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
-    if( x264_cpu_detect() & X264_CPU_FAST_NEON_MRC )
+    if( cpu_detect & X264_CPU_FAST_NEON_MRC )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
+#elif ARCH_AARCH64
+    if( cpu_detect & X264_CPU_ARMV8 )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV8, "ARMv8" );
+    if( cpu_detect & X264_CPU_NEON )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
 #endif
     return ret;
 }
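Two quiet but important checkasm.c fixes sit alongside the AArch64 additions: the cycle-counter asm statements gain a "memory" clobber, which forbids the compiler from moving loads and stores across the counter read (otherwise work could leak out of, or into, the timed region), and the repeated x264_cpu_detect() calls in the flag-testing function are hoisted into a single cpu_detect local. The counter read, extracted here as a standalone sketch with the platform guards simplified relative to the original:

/* x86 cycle-counter read with a compiler barrier, as adopted above. */
#include <stdint.h>

static inline uint32_t read_time( void )
{
    uint32_t a = 0;
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    /* "=a" pins the low 32 bits of the TSC to EAX; RDTSC also writes EDX,
     * hence that clobber. "memory" makes the statement a compiler-level
     * barrier: no memory access may be reordered across it. */
    asm volatile( "rdtsc" : "=a"(a) :: "edx", "memory" );
#endif
    return a;
}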
View file
x264-snapshot-20141104-2245.tar.bz2/tools/cltostr.sh
Added
@@ -0,0 +1,32 @@
+#!/bin/sh
+# Convert standard input to a C char array, write to a file, then create an
+# MD5 sum of that file and append said MD5 sum as char array to the file.
+
+FILE=$1
+
+# Filter out whitespace, empty lines, and comments.
+sanitize() {
+    sed 's/^[[:space:]]*//; /^$/d; /^\/\//d'
+}
+
+# Convert stdin to a \0-terminated char array.
+dump() {
+    printf 'static const char %s[] = {\n' $1
+    od -v -A n -t x1 | sed 's/[[:space:]]*\([[:alnum:]]\{2\}\)/0x\1, /g'
+    printf '0x00 };\n'
+}
+
+# Print MD5 hash w/o newline character to not embed the character in the array.
+hash() {
+    # md5sum is not standard, so try different platform-specific alternatives.
+    { md5sum $1 2> /dev/null || md5 -q $1 || digest -a md5 $1; } |
+        cut -b -32 | tr -d '\n\r'
+}
+
+trap "rm -f $FILE.temp" EXIT
+
+sanitize | tee $FILE.temp |
+    dump x264_opencl_source > $FILE
+
+hash $FILE.temp |
+    dump x264_opencl_source_hash >> $FILE
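cltostr.sh is the helper that embeds x264's OpenCL kernel source into the binary as a C array; the MD5 of the sanitized source is appended as a second array, presumably so the runtime can tell when a previously compiled, cached kernel no longer matches the embedded source. The generated header has roughly this shape (all byte values below are invented for illustration):

/* Illustrative output of cltostr.sh; real contents depend on the input. */
static const char x264_opencl_source[] = {
0x2f, 0x2a, 0x20, 0x63, 0x6f, 0x70, /* ... one 0xNN entry per input byte ... */
0x00 };
static const char x264_opencl_source_hash[] = {
0x64, 0x34, 0x31, 0x64, /* ... 32 ASCII hex digits of the MD5 ... */
0x00 };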
View file
x264-snapshot-20141104-2245.tar.bz2/tools/msvsdepend.sh
Added
@@ -0,0 +1,21 @@
+#!/bin/sh
+# There's a lot of things going on here
+# expected arguments are $(CC) $(CFLAGS) $(SRC) $(OBJ)
+# 1) start the dependency line with the object argument
+# 2) need to add -Zs -showIncludes to the flags to have the compiler output list of include files without compilation
+# 3) look for notes in the output that start with "Note: including file:"
+# 4) retain only the filepath from the notes
+# 5) convert \ foldername separators to /
+# 6) escape spaces in the filepath
+# 7) remove system includes (hack: check for "/Program Files" string in filepath)
+# 8) sort and remove duplicate filepath entries
+# 9) convert newlines to spaces to collapse the dependencies into the one dependency line
+# 10) print a newline character, to properly separate dependency lines
+echo -n "$4: "
+$1 $2 $3 -Zs -showIncludes 2>&1 |
+    grep '^Note: including file:' |
+    sed 's/^Note: including file:[[:space:]]*\(.*\)$/\1/; s/\\/\//g; s/ /\\ /g' |
+    sed '/\/[Pp]rogram\\ [Ff]iles/d' |
+    sort | uniq |
+    tr -s '\n\r' ' '
+echo ''
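msvsdepend.sh gives Makefile-driven MSVC builds the header dependency tracking that gcc provides via -MMD: cl.exe is invoked with -Zs (syntax check only, no code generation) and -showIncludes, and its "Note: including file:" lines are folded into a single make-style dependency line, e.g. something like "x264.obj: x264.c x264.h common/common.h" (paths illustrative). The 2>&1 merge captures the notes regardless of which stream the compiler prints them on, and the "/Program Files" filter is an admitted hack for dropping system headers.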
View file
x264-snapshot-20140321-2245.tar.bz2/x264.c -> x264-snapshot-20141104-2245.tar.bz2/x264.c
Changed
@@ -6,7 +6,7 @@
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *          Steven Walters <kemuri9@gmail.com>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Kieran Kunhya <kieran@kunhya.com>
  *          Henrik Gramner <henrik@gramner.com>
  *
@@ -320,6 +320,8 @@
         printf( "intel: %.2f (%d)\n", __INTEL_COMPILER / 100.f, __INTEL_COMPILER_BUILD_DATE );
 #elif defined(__GNUC__)
         printf( "gcc: " __VERSION__ "\n" );
+#elif defined(_MSC_FULL_VER)
+        printf( "msvc: %.2f (%u)\n", _MSC_VER / 100.f, _MSC_FULL_VER );
 #else
         printf( "using an unknown compiler\n" );
 #endif
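With this branch, MSVC-built binaries identify their compiler in the version output instead of falling through to "using an unknown compiler": _MSC_VER is a scaled integer (1800 for Visual Studio 2013, for example), so _MSC_VER / 100.f renders as 18.00, with the full _MSC_FULL_VER build number in parentheses.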
View file
x264-snapshot-20140321-2245.tar.bz2/x264.h -> x264-snapshot-20141104-2245.tar.bz2/x264.h
Changed
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -28,8 +28,8 @@
 #ifndef X264_X264_H
 #define X264_X264_H
 
-#if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) &&\
-    !defined(_INTTYPES_H) && !defined(_INTTYPES_H_)
+#if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) && !defined(_STDINT) &&\
+    !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) && !defined(_INTTYPES)
 # ifdef _MSC_VER
 #  pragma message("You must include stdint.h or inttypes.h before x264.h")
 # else
@@ -152,10 +152,11 @@
 /* PowerPC */
 #define X264_CPU_ALTIVEC         0x0000001
 
-/* ARM */
+/* ARM and AArch64 */
 #define X264_CPU_ARMV6           0x0000001
 #define X264_CPU_NEON            0x0000002  /* ARM NEON */
 #define X264_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+#define X264_CPU_ARMV8           0x0000008
 
 /* Analyse flags */
 #define X264_ANALYSE_I4x4        0x0001  /* Analyse i4x4 */
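The loosened guard check accounts for MSVC, whose stdint.h and inttypes.h use _STDINT and _INTTYPES as include guards rather than the glibc-style _STDINT_H names, so the "You must include stdint.h" diagnostic no longer fires spuriously there. A trivial consumer that satisfies the check on any supported compiler (x264_param_default is part of the public API; the rest is boilerplate):

/* Include stdint.h (or inttypes.h) before x264.h, as the header demands. */
#include <stdint.h>
#include <x264.h>

int main( void )
{
    x264_param_t param;
    x264_param_default( &param );
    return 0;
}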