libx264
Changes of Revision 8
libx264.changes
Changed
@@ -1,4 +1,9 @@
 -------------------------------------------------------------------
+Wed Nov 5 12:33:30 UTC 2014 - i@margueirte.su
+
+- update version 20141104
+
+-------------------------------------------------------------------
 Sat Mar 22 17:10:14 UTC 2014 - i@margueirte.su

 - update version 20140321.
libx264.spec
Changed
@@ -1,6 +1,7 @@
-# vim: set ts=4 sw=4 et:
-# Copyright (c) 2012 Pascal Bleser <pascal.bleser@opensuse.org>
-# COpyright (c) 2013 Marguerite Su <marguerite@opensuse.org>
+#
+# spec file for package libx264
+#
+# Copyright (c) 2014 SUSE LINUX Products GmbH, Nuernberg, Germany.
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -11,19 +12,21 @@
 # license that conforms to the Open Source Definition (Version 1.9)
 # published by the Open Source Initiative.

-# Please submit bugfixes or comments via http://bugs.links2linux.org/
+# Please submit bugfixes or comments via http://bugs.opensuse.org/
+#
+

-Name: libx264
 %define soname 142
-%define svn 20140321
+%define svn 20141104
+Name: libx264
 Version: 0.%{soname}svn%{svn}
-Release: 1
-License: GPL-2.0+
+Release: 0
 Summary: A free h264/avc encoder - encoder binary
-Url: http://developers.videolan.org/x264.html
+License: GPL-2.0+
 Group: Productivity/Multimedia/Video/Editors and Convertors
-Source: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2
-Patch: x264-use-shared-library.patch
+Url: http://developers.videolan.org/x264.html
+Source: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2
+Patch0: x264-use-shared-library.patch
 BuildRequires: nasm
 BuildRequires: pkg-config
 BuildRequires: yasm >= 1.2.0
@@ -92,7 +95,7 @@

 %prep
 %setup -q -n x264-snapshot-%{svn}-2245
-%patch -p1
+%patch0 -p1
 FAKE_BUILDDATE=$(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y')
 sed -i "s/__DATE__/\"$FAKE_BUILDDATE\"/" x264.c

@@ -108,7 +111,7 @@
 make %{?_smp_mflags}

 %install
-%makeinstall
+make DESTDIR=%{buildroot} install %{?_smp_mflags}
 rm -f %{buildroot}%{_libdir}/%{name}.so
 rm -f %{buildroot}%{_libdir}/%{name}.a

@@ -119,6 +122,7 @@
 echo "%{name}-%{soname}" > %{_sourcedir}/baselibs.conf

 %post -n %{name}-%{soname} -p /sbin/ldconfig
+
 %postun -n %{name}-%{soname} -p /sbin/ldconfig

 %files %{soname}
x264-snapshot-20140321-2245.tar.bz2/common/sparc
Deleted
-(directory)
x264-snapshot-20140321-2245.tar.bz2/common/sparc/pixel.asm
Deleted
@@ -1,1089 +0,0 @@ -/***************************************************************************** - * pixel.asm: sparc pixel metrics - ***************************************************************************** - * Copyright (C) 2005-2014 x264 project - * - * Authors: Phil Jensen <philj@csufresno.edu> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. - * - * This program is also available under a commercial proprietary license. - * For more information, contact us at licensing@x264.com. - *****************************************************************************/ - -! VIS optimized SAD for UltraSPARC - -.text -.global x264_pixel_sad_8x8_vis -x264_pixel_sad_8x8_vis: - save %sp, -120, %sp - - fzero %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - std %f12, [%fp-24] - ld [%fp-20], %i0 - - ret - restore - -.global x264_pixel_sad_8x16_vis -x264_pixel_sad_8x16_vis: - save %sp, -120, %sp - - fzero %f12 - - 
alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - 
faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - std %f12, [%fp-24] - ld [%fp-20], %i0 - - ret - restore - -.global x264_pixel_sad_16x8_vis -x264_pixel_sad_16x8_vis: - save %sp, -120, %sp - - fzero %f12 ! zero out the accumulator used for pdist - - sub %i1, 8, %i1 ! reduce stride by 8, since we are moving forward 8 each block - sub %i3, 8, %i3 ! same here, reduce stride by 8 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - 
faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - std %f12, [%fp-24] - ld [%fp-20], %i0 - - ret - restore - -.global x264_pixel_sad_16x16_vis -x264_pixel_sad_16x16_vis: - save %sp, -120, %sp - - fzero %f12 ! zero out the accumulator used for pdist - - sub %i1, 8, %i1 ! reduce stride by 8, since we are moving forward 8 each block - sub %i3, 8, %i3 ! same here, reduce stride by 8 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - 
faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - 
faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, 8, %i0 - add %i2, 8, %i2 - pdist %f4, %f10, %f12 - - alignaddr %i0, %g0, %l0 - ldd [%l0], %f0 - ldd [%l0+8], %f2 - faligndata %f0, %f2, %f4 - - alignaddr %i2, %g0, %l2 - ldd [%l2], %f6 - ldd [%l2+8], %f8 - faligndata %f6, %f8, %f10 - - add %i0, %i1, %i0 - add %i2, %i3, %i2 - pdist %f4, %f10, %f12 - - std %f12, [%fp-24] - ld [%fp-20], %i0 - - ret - restore
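The file removed above provided x264's SAD (sum of absolute differences) primitives for UltraSPARC, using the VIS pdist instruction to accumulate per-row differences over fully unrolled 8x8, 8x16, 16x8 and 16x16 blocks. As a rough reference for what each of these routines computes, here is a plain-C sketch; sad_ref and its parameter names are illustrative and not part of the x264 tree:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute differences between a w x h block in pix1 and one in
     * pix2, each block read with its own line stride (matching the four
     * arguments taken by the assembly routines). */
    static int sad_ref( const uint8_t *pix1, intptr_t stride1,
                        const uint8_t *pix2, intptr_t stride2, int w, int h )
    {
        int sum = 0;
        for( int y = 0; y < h; y++ )
        {
            for( int x = 0; x < w; x++ )
                sum += abs( pix1[x] - pix2[x] );
            pix1 += stride1;  /* next line of the source block */
            pix2 += stride2;  /* next line of the reference block */
        }
        return sum;
    }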
x264-snapshot-20140321-2245.tar.bz2/common/sparc/pixel.h
Deleted
@@ -1,34 +0,0 @@
-/*****************************************************************************
- * pixel.h: sparc pixel metrics
- *****************************************************************************
- * Copyright (C) 2005-2014 x264 project
- *
- * Authors: Phil Jensen <philj@csufresno.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
-
-#ifndef X264_SPARC_PIXEL_H
-#define X264_SPARC_PIXEL_H
-
-int x264_pixel_sad_8x8_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t );
-int x264_pixel_sad_8x16_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t );
-int x264_pixel_sad_16x8_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t );
-int x264_pixel_sad_16x16_vis( uint8_t *, intptr_t, uint8_t *, intptr_t );
-
-#endif
x264-snapshot-20140321-2245.tar.bz2/tools/cltostr.pl
Deleted
@@ -1,65 +0,0 @@ -# Perl script used for compiling OpenCL src into x264 binary -# -# Copyright (C) 2013-2014 x264 project -# Authors: Steve Borho <sborho@multicorewareinc.com> - -use Digest::MD5 qw(md5_hex); - -# xxd takes a VAR, which will be the variable name -# and BYTES, a string of bytes to beencoded. -sub xxd -{ - my %args = @_; - my $var = $args{VAR}; - my $bytes = $args{BYTES}; - my @hexbytes; - my @bytes = split //, $$bytes; - foreach $b (@bytes) - { - push @hexbytes, sprintf("0x%02X", ord($b)); - } - - # Format 'em nice and pretty-like. - print 'static const char ' . $var . '[] = {' . "\n"; - my $count = 0; - foreach my $h (@hexbytes) - { - print "$h, "; - $count++; - if ($count == 16) - { - print "\n"; - $count = 0; - } - } - print "\n0x00 };\n\n"; - - return; -} - -if (@ARGV < 1) -{ - printf "%s: VARNAME ", $0 . "\n"; - exit(-1); -} - - -my @lines; -while(<STDIN>) -{ - s/^\s+//; # trim leading whitespace - if (/^\/\//) - { - next; # skip the line if it starts with '//' - } - push @lines, $_; -} - -my $lines = join '', @lines; -xxd(VAR => @ARGV[0], BYTES => \$lines); - -my $hash = md5_hex($lines); -@hash = ( $hash =~ m/../g ); - - -xxd(VAR => @ARGV[0] . "_hash", BYTES => \$hash);
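The deleted Perl helper turned the OpenCL kernels into data that could be linked into the x264 binary: it strips '//' comment lines, dumps the remaining source as a C byte array, and appends a second array derived from the MD5 digest of that source (used to detect when a cached compiled kernel is stale). The header it emitted looked roughly like the sketch below; the variable name comes from the Makefile invocation further down, and the byte values are placeholders:

    static const char x264_opencl_source[] = {
        0x2F, 0x2A, 0x20, /* ... one byte per character of the OpenCL source ... */
        0x00 };

    static const char x264_opencl_source_hash[] = {
        0x64, 0x34, /* ... bytes of the hex MD5 digest of the source above ... */
        0x00 };

The Makefile change below replaces this script with a shell equivalent, tools/cltostr.sh.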
x264-snapshot-20140321-2245.tar.bz2/.gitignore -> x264-snapshot-20141104-2245.tar.bz2/.gitignore
Changed
@@ -39,6 +39,8 @@
 *.mbtree
 *.temp
 *.pyc
+*.pgd
+*.pgc

 .digress_x264
 dataDec.txt
x264-snapshot-20140321-2245.tar.bz2/AUTHORS -> x264-snapshot-20141104-2245.tar.bz2/AUTHORS
Changed
@@ -33,6 +33,14 @@
 D: BeOS and MacOS X ports.
 S: France

+N: Fiona Glaser
+E: fiona AT x264 DOT com
+D: Maintainer
+D: All areas of encoder analysis and algorithms
+D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
+D: x86 asm
+S: USA
+
 N: Gabriel Bouvigne
 E: bouvigne AT mp3-tech DOT org
 D: 2pass VBV
@@ -47,31 +55,25 @@
 D: 4:2:2 chroma subsampling, x86 asm, Windows improvements, bugfixes
 S: Sweden

-N: Jason Garrett-Glaser
-E: darkshikari AT gmail DOT com
-D: x86 asm, 1pass VBV, adaptive quantization, inline asm
-D: various speed optimizations, bugfixes
-S: USA
-
 N: Laurent Aimar
-E: fenrir AT via.ecp DOT fr
+E: fenrir AT videolan DOT org
 C: fenrir
 D: Intial import, former maintainer
 D: x86 asm (mmx/mmx2)
 S: France

 N: Loren Merritt
-E: lorenm AT u.washington DOT edu
+E: pengvado AT akuvian DOT org
 C: pengvado
-D: maintainer
+D: Maintainer
 D: All areas of encoder analysis and algorithms
-D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc.
+D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
 D: Multithreading
-D: x86 and x86_64 asm (mmx/mmx2/sse2)
+D: x86 asm
 S: USA

 N: Mans Rullgard
-E: mru AT inprovide DOT com
+E: mru AT mansr DOT com
 C: mru
 D: Rate control
 S: Southampton, UK
@@ -91,10 +93,6 @@
 D: gcc asm to nasm conversion
 S: China

-N: Phil Jensen
-E: philj AT csufresno DOT edu
-D: SPARC asm
-
 N: Radek Czyz
 E: radoslaw AT syskin DOT cjb DOT net
 D: Cached motion compensation
x264-snapshot-20140321-2245.tar.bz2/Makefile -> x264-snapshot-20141104-2245.tar.bz2/Makefile
Changed
@@ -88,17 +88,14 @@ ifeq ($(ARCH),X86) ARCH_X86 = yes ASMSRC = $(X86SRC) common/x86/pixel-32.asm -ASFLAGS += -DARCH_X86_64=0 endif ifeq ($(ARCH),X86_64) ARCH_X86 = yes ASMSRC = $(X86SRC:-32.asm=-64.asm) common/x86/trellis-64.asm -ASFLAGS += -DARCH_X86_64=1 endif ifdef ARCH_X86 -ASFLAGS += -I$(SRCPATH)/common/x86/ SRCS += common/x86/mc-c.c common/x86/predict-c.c OBJASM = $(ASMSRC:%.asm=%.o) $(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm @@ -126,11 +123,18 @@ endif endif -# VIS optims -ifeq ($(ARCH),UltraSPARC) -ifeq ($(findstring HIGH_BIT_DEPTH, $(CONFIG)),) -ASMSRC += common/sparc/pixel.asm -OBJASM = $(ASMSRC:%.asm=%.o) +# AArch64 NEON optims +ifeq ($(ARCH),AARCH64) +ifneq ($(AS),) +ASMSRC += common/aarch64/dct-a.S \ + common/aarch64/deblock-a.S \ + common/aarch64/mc-a.S \ + common/aarch64/pixel-a.S \ + common/aarch64/predict-a.S \ + common/aarch64/quant-a.S +SRCS += common/aarch64/mc-c.c \ + common/aarch64/predict-c.c +OBJASM = $(ASMSRC:%.S=%.o) endif endif @@ -148,7 +152,7 @@ ifeq ($(HAVE_OPENCL),yes) common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl) - cat $^ | perl $(SRCPATH)/tools/cltostr.pl x264_opencl_source > $@ + cat $^ | $(SRCPATH)/tools/cltostr.sh $@ GENERATED += common/oclobj.h SRCS += common/opencl.c encoder/slicetype-cl.c endif @@ -157,7 +161,7 @@ OBJCLI += $(SRCCLI:%.c=%.o) OBJSO += $(SRCSO:%.c=%.o) -.PHONY: all default fprofiled clean distclean install uninstall lib-static lib-shared cli install-lib-dev install-lib-static install-lib-shared install-cli +.PHONY: all default fprofiled clean distclean install install-* uninstall cli lib-* etags cli: x264$(EXE) lib-static: $(LIBX264) @@ -185,7 +189,7 @@ $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend -%.o: %.asm +%.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm $(AS) $(ASFLAGS) -o $@ $< -@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile @@ -201,7 +205,12 @@ .depend: config.mak @rm -f .depend + @echo 'dependency file generation...' 
+ifeq ($(COMPILER),CL) + @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(SRCPATH)/tools/msvsdepend.sh "$(CC)" "$(CFLAGS)" "$(SRC)" "$(SRC:$(SRCPATH)/%.c=%.o)" 1>> .depend;) +else @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;) +endif config.mak: ./configure @@ -232,15 +241,20 @@ $(MAKE) clean $(MAKE) x264$(EXE) CFLAGS="$(CFLAGS) $(PROF_GEN_CC)" LDFLAGS="$(LDFLAGS) $(PROF_GEN_LD)" $(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;)) +ifeq ($(COMPILER),CL) +# Because Visual Studio timestamps the object files within the PGD, it fails to build if they change - only the executable should be deleted + rm -f x264$(EXE) +else rm -f $(SRC2:%.c=%.o) +endif $(MAKE) CFLAGS="$(CFLAGS) $(PROF_USE_CC)" LDFLAGS="$(LDFLAGS) $(PROF_USE_LD)" - rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock + rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc endif clean: rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin - rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock + rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc distclean: clean rm -f config.mak x264_config.h config.h config.log x264.pc x264.def
x264-snapshot-20141104-2245.tar.bz2/common/aarch64
Added
+(directory)
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/asm.S
Added
@@ -0,0 +1,221 @@ +/***************************************************************************** + * asm.S: AArch64 utility macros + ***************************************************************************** + * Copyright (C) 2008-2014 x264 project + * + * Authors: Mans Rullgard <mans@mansr.com> + * David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "config.h" + +#ifdef PREFIX +# define EXTERN_ASM _ +#else +# define EXTERN_ASM +#endif + +#ifdef __ELF__ +# define ELF +#else +# define ELF # +#endif + +#ifdef __MACH__ +# define MACH +#else +# define MACH # +#endif + +#if HAVE_AS_FUNC +# define FUNC +#else +# define FUNC # +#endif + +.macro function name, export=0, align=2 + .macro endfunc +ELF .size \name, . - \name +FUNC .endfunc + .purgem endfunc + .endm + .text + .align \align + .if \export + .global EXTERN_ASM\name +ELF .type EXTERN_ASM\name, %function +FUNC .func EXTERN_ASM\name +EXTERN_ASM\name: + .else +ELF .type \name, %function +FUNC .func \name +\name: + .endif +.endm + +.macro const name, align=2 + .macro endconst +ELF .size \name, . 
- \name + .purgem endconst + .endm +ELF .section .rodata +MACH .const_data + .align \align +\name: +.endm + +.macro movrel rd, val +#if defined(PIC) && defined(__APPLE__) + adrp \rd, \val@PAGE + add \rd, \rd, \val@PAGEOFF +#elif defined(PIC) + adrp \rd, \val + add \rd, \rd, :lo12:\val +#else + ldr \rd, =\val +#endif +.endm + +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) + +#define FDEC_STRIDE 32 +#define FENC_STRIDE 16 + + +.macro SUMSUB_AB sum, sub, a, b + add \sum, \a, \b + sub \sub, \a, \b +.endm + +.macro unzip t1, t2, s1, s2 + uzp1 \t1, \s1, \s2 + uzp2 \t2, \s1, \s2 +.endm + +.macro transpose t1, t2, s1, s2 + trn1 \t1, \s1, \s2 + trn2 \t2, \s1, \s2 +.endm + +.macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3 + transpose \t0\().2s, \t2\().2s, \v0\().2s, \v2\().2s + transpose \t1\().2s, \t3\().2s, \v1\().2s, \v3\().2s + transpose \v0\().4h, \v1\().4h, \t0\().4h, \t1\().4h + transpose \v2\().4h, \v3\().4h, \t2\().4h, \t3\().4h +.endm + +.macro transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3 + transpose \t0\().4s, \t2\().4s, \v0\().4s, \v2\().4s + transpose \t1\().4s, \t3\().4s, \v1\().4s, \v3\().4s + transpose \v0\().8h, \v1\().8h, \t0\().8h, \t1\().8h + transpose \v2\().8h, \v3\().8h, \t2\().8h, \t3\().8h +.endm + + +.macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 + trn1 \r8\().8H, \r0\().8H, \r1\().8H + trn2 \r9\().8H, \r0\().8H, \r1\().8H + trn1 \r1\().8H, \r2\().8H, \r3\().8H + trn2 \r3\().8H, \r2\().8H, \r3\().8H + trn1 \r0\().8H, \r4\().8H, \r5\().8H + trn2 \r5\().8H, \r4\().8H, \r5\().8H + trn1 \r2\().8H, \r6\().8H, \r7\().8H + trn2 \r7\().8H, \r6\().8H, \r7\().8H + + trn1 \r4\().4S, \r0\().4S, \r2\().4S + trn2 \r2\().4S, \r0\().4S, \r2\().4S + trn1 \r6\().4S, \r5\().4S, \r7\().4S + trn2 \r7\().4S, \r5\().4S, \r7\().4S + trn1 \r5\().4S, \r9\().4S, \r3\().4S + trn2 \r9\().4S, \r9\().4S, \r3\().4S + trn1 \r3\().4S, \r8\().4S, \r1\().4S + trn2 \r8\().4S, \r8\().4S, \r1\().4S + + trn1 \r0\().2D, \r3\().2D, \r4\().2D + trn2 \r4\().2D, \r3\().2D, \r4\().2D + + trn1 \r1\().2D, \r5\().2D, \r6\().2D + trn2 \r5\().2D, \r5\().2D, \r6\().2D + + trn2 \r6\().2D, \r8\().2D, \r2\().2D + trn1 \r2\().2D, \r8\().2D, \r2\().2D + + trn1 \r3\().2D, \r9\().2D, \r7\().2D + trn2 \r7\().2D, \r9\().2D, \r7\().2D +.endm + +.macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 + trn1 \t0\().16b, \r0\().16b, \r1\().16b + trn2 \t1\().16b, \r0\().16b, \r1\().16b + trn1 \r1\().16b, \r2\().16b, \r3\().16b + trn2 \r3\().16b, \r2\().16b, \r3\().16b + trn1 \r0\().16b, \r4\().16b, \r5\().16b + trn2 \r5\().16b, \r4\().16b, \r5\().16b + trn1 \r2\().16b, \r6\().16b, \r7\().16b + trn2 \r7\().16b, \r6\().16b, \r7\().16b + + trn1 \r4\().8h, \r0\().8h, \r2\().8h + trn2 \r2\().8h, \r0\().8h, \r2\().8h + trn1 \r6\().8h, \r5\().8h, \r7\().8h + trn2 \r7\().8h, \r5\().8h, \r7\().8h + trn1 \r5\().8h, \t1\().8h, \r3\().8h + trn2 \t1\().8h, \t1\().8h, \r3\().8h + trn1 \r3\().8h, \t0\().8h, \r1\().8h + trn2 \t0\().8h, \t0\().8h, \r1\().8h + + trn1 \r0\().4s, \r3\().4s, \r4\().4s + trn2 \r4\().4s, \r3\().4s, \r4\().4s + + trn1 \r1\().4s, \r5\().4s, \r6\().4s + trn2 \r5\().4s, \r5\().4s, \r6\().4s + + trn2 \r6\().4s, \t0\().4s, \r2\().4s + trn1 \r2\().4s, \t0\().4s, \r2\().4s + + trn1 \r3\().4s, \t1\().4s, \r7\().4s + trn2 \r7\().4s, \t1\().4s, \r7\().4s +.endm + +.macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().16b, \r0\().16b, \r1\().16b + trn2 \t5\().16b, \r0\().16b, \r1\().16b + trn1 \t6\().16b, \r2\().16b, \r3\().16b + trn2 \t7\().16b, 
\r2\().16b, \r3\().16b + + trn1 \r0\().8h, \t4\().8h, \t6\().8h + trn2 \r2\().8h, \t4\().8h, \t6\().8h + trn1 \r1\().8h, \t5\().8h, \t7\().8h + trn2 \r3\().8h, \t5\().8h, \t7\().8h +.endm + +.macro transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().8b, \r0\().8b, \r1\().8b + trn2 \t5\().8b, \r0\().8b, \r1\().8b + trn1 \t6\().8b, \r2\().8b, \r3\().8b + trn2 \t7\().8b, \r2\().8b, \r3\().8b + + trn1 \r0\().4h, \t4\().4h, \t6\().4h + trn2 \r2\().4h, \t4\().4h, \t6\().4h + trn1 \r1\().4h, \t5\().4h, \t7\().4h + trn2 \r3\().4h, \t5\().4h, \t7\().4h +.endm
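asm.S is assembler plumbing for the rest of the new AArch64 code: the function/endfunc and const/endconst wrappers adapt symbol decoration and section directives to ELF vs Mach-O, movrel loads an address in a PIC-safe way, and a family of butterfly and transpose macros is shared by the transform and deblock files below. As a scalar mental model of the most frequently used helper, SUMSUB_AB performs an add/subtract butterfly per vector lane; a hedged C equivalent for a single lane:

    /* One lane of the SUMSUB_AB sum, sub, a, b butterfly macro. */
    static inline void sumsub_ab( int16_t *sum, int16_t *sub, int16_t a, int16_t b )
    {
        *sum = a + b;
        *sub = a - b;
    }

The various transpose macros express matrix transposes as trees of trn1/trn2 and uzp1/uzp2 lane interleaves.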
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/dct-a.S
Added
@@ -0,0 +1,666 @@ +/**************************************************************************** + * dct-a.S: AArch6464 transform and zigzag + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad <lessen42@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" + +const scan4x4_frame, align=4 +.byte 0,1, 8,9, 2,3, 4,5 +.byte 10,11, 16,17, 24,25, 18,19 +.byte 12,13, 6,7, 14,15, 20,21 +.byte 26,27, 28,29, 22,23, 30,31 +endconst + +// sum = a + (b>>shift) sub = (a>>shift) - b +.macro SUMSUB_SHR shift sum sub a b t0 t1 + sshr \t0, \b, #\shift + sshr \t1, \a, #\shift + add \sum, \a, \t0 + sub \sub, \t1, \b +.endm + +// sum = (a>>shift) + b sub = a - (b>>shift) +.macro SUMSUB_SHR2 shift sum sub a b t0 t1 + sshr \t0, \a, #\shift + sshr \t1, \b, #\shift + add \sum, \t0, \b + sub \sub, \a, \t1 +.endm + +// a += 1.5*ma b -= 1.5*mb +.macro SUMSUB_15 a b ma mb t0 t1 + sshr \t0, \ma, #1 + sshr \t1, \mb, #1 + add \t0, \t0, \ma + add \t1, \t1, \mb + add \a, \a, \t0 + sub \b, \b, \t1 +.endm + + +function x264_dct4x4dc_neon, export=1 + ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] + movi v31.4h, #1 + SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h + SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h + SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h + SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h + transpose v4.4h, v6.4h, v0.4h, v2.4h + transpose v5.4h, v7.4h, v1.4h, v3.4h + SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h + SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h + transpose v4.2s, v5.2s, v0.2s, v1.2s + transpose v6.2s, v7.2s, v2.2s, v3.2s + add v16.4h, v4.4h, v31.4h + add v17.4h, v6.4h, v31.4h + srhadd v0.4h, v4.4h, v5.4h + shsub v1.4h, v16.4h, v5.4h + shsub v2.4h, v17.4h, v7.4h + srhadd v3.4h, v6.4h, v7.4h + st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] + ret +endfunc + +function x264_idct4x4dc_neon, export=1 + ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] + SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h + SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h + SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h + SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h + transpose v4.4h, v6.4h, v0.4h, v2.4h + transpose v5.4h, v7.4h, v1.4h, v3.4h + SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h + SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h + transpose v4.2s, v5.2s, v0.2s, v1.2s + transpose v6.2s, v7.2s, v2.2s, v3.2s + SUMSUB_AB v0.4h, v1.4h, v4.4h, v5.4h + SUMSUB_AB v3.4h, v2.4h, v6.4h, v7.4h + st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] + ret +endfunc + +.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7 + SUMSUB_AB \v1, \v6, \v5, \v6 + SUMSUB_AB \v3, \v7, \v4, \v7 + add \v0, \v3, \v1 + add \v4, \v7, \v7 + add \v5, \v6, \v6 + sub \v2, \v3, \v1 + add \v1, \v4, \v6 + sub \v3, \v7, \v5 +.endm 
+ +function x264_sub4x4_dct_neon, export=1 + mov x3, #FENC_STRIDE + mov x4, #FDEC_STRIDE + ld1 {v0.s}[0], [x1], x3 + ld1 {v1.s}[0], [x2], x4 + ld1 {v2.s}[0], [x1], x3 + usubl v16.8h, v0.8b, v1.8b + ld1 {v3.s}[0], [x2], x4 + ld1 {v4.s}[0], [x1], x3 + usubl v17.8h, v2.8b, v3.8b + ld1 {v5.s}[0], [x2], x4 + ld1 {v6.s}[0], [x1], x3 + usubl v18.8h, v4.8b, v5.8b + ld1 {v7.s}[0], [x2], x4 + usubl v19.8h, v6.8b, v7.8b + + DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h + transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7 + DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h + st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0] + ret +endfunc + +function x264_sub8x4_dct_neon + ld1 {v0.8b}, [x1], x3 + ld1 {v1.8b}, [x2], x4 + usubl v16.8h, v0.8b, v1.8b + ld1 {v2.8b}, [x1], x3 + ld1 {v3.8b}, [x2], x4 + usubl v17.8h, v2.8b, v3.8b + ld1 {v4.8b}, [x1], x3 + ld1 {v5.8b}, [x2], x4 + usubl v18.8h, v4.8b, v5.8b + ld1 {v6.8b}, [x1], x3 + ld1 {v7.8b}, [x2], x4 + usubl v19.8h, v6.8b, v7.8b + + DCT_1D v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h + transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7 + + SUMSUB_AB v16.8h, v19.8h, v0.8h, v3.8h + SUMSUB_AB v17.8h, v18.8h, v1.8h, v2.8h + add v22.8h, v19.8h, v19.8h + add v21.8h, v18.8h, v18.8h + add v0.8h, v16.8h, v17.8h + sub v1.8h, v16.8h, v17.8h + + add v2.8h, v22.8h, v18.8h + sub v3.8h, v19.8h, v21.8h + + zip1 v4.2d, v0.2d, v2.2d + zip2 v6.2d, v0.2d, v2.2d + zip1 v5.2d, v1.2d, v3.2d + zip2 v7.2d, v1.2d, v3.2d + + st1 {v4.8h}, [x0], #16 + st1 {v5.8h}, [x0], #16 + st1 {v6.8h}, [x0], #16 + st1 {v7.8h}, [x0], #16 + ret +endfunc + +function x264_sub8x8_dct_neon, export=1 + mov x5, x30 + mov x3, #FENC_STRIDE + mov x4, #FDEC_STRIDE + bl x264_sub8x4_dct_neon + mov x30, x5 + b x264_sub8x4_dct_neon +endfunc + +function x264_sub16x16_dct_neon, export=1 + mov x5, x30 + mov x3, #FENC_STRIDE + mov x4, #FDEC_STRIDE + bl x264_sub8x4_dct_neon + bl x264_sub8x4_dct_neon + sub x1, x1, #8*FENC_STRIDE-8 + sub x2, x2, #8*FDEC_STRIDE-8 + bl x264_sub8x4_dct_neon + bl x264_sub8x4_dct_neon + sub x1, x1, #8 + sub x2, x2, #8 + bl x264_sub8x4_dct_neon + bl x264_sub8x4_dct_neon + sub x1, x1, #8*FENC_STRIDE-8 + sub x2, x2, #8*FDEC_STRIDE-8 + bl x264_sub8x4_dct_neon + mov x30, x5 + b x264_sub8x4_dct_neon +endfunc + + +.macro DCT8_1D type + SUMSUB_AB v18.8h, v17.8h, v3.8h, v4.8h // s34/d34 + SUMSUB_AB v19.8h, v16.8h, v2.8h, v5.8h // s25/d25 + SUMSUB_AB v22.8h, v21.8h, v1.8h, v6.8h // s16/d16 + SUMSUB_AB v23.8h, v20.8h, v0.8h, v7.8h // s07/d07 + + SUMSUB_AB v24.8h, v26.8h, v23.8h, v18.8h // a0/a2 + SUMSUB_AB v25.8h, v27.8h, v22.8h, v19.8h // a1/a3 + + SUMSUB_AB v30.8h, v29.8h, v20.8h, v17.8h // a6/a5 + sshr v23.8h, v21.8h, #1 + sshr v18.8h, v16.8h, #1 + add v23.8h, v23.8h, v21.8h + add v18.8h, v18.8h, v16.8h + sub v30.8h, v30.8h, v23.8h + sub v29.8h, v29.8h, v18.8h + + SUMSUB_AB v28.8h, v31.8h, v21.8h, v16.8h // a4/a7 + sshr v22.8h, v20.8h, #1 + sshr v19.8h, v17.8h, #1 + add v22.8h, v22.8h, v20.8h + add v19.8h, v19.8h, v17.8h + add v22.8h, v28.8h, v22.8h + add v31.8h, v31.8h, v19.8h + + SUMSUB_AB v0.8h, v4.8h, v24.8h, v25.8h + SUMSUB_SHR 2, v1.8h, v7.8h, v22.8h, v31.8h, v16.8h, v17.8h + SUMSUB_SHR 1, v2.8h, v6.8h, v26.8h, v27.8h, v18.8h, v19.8h + SUMSUB_SHR2 2, v3.8h, v5.8h, v30.8h, v29.8h, v20.8h, v21.8h +.endm + +function x264_sub8x8_dct8_neon, export=1 + mov x3, #FENC_STRIDE + mov x4, #FDEC_STRIDE + ld1 {v16.8b}, [x1], x3 + ld1 {v17.8b}, [x2], x4 + ld1 {v18.8b}, [x1], x3 + ld1 {v19.8b}, [x2], x4 + usubl v0.8h, v16.8b, v17.8b + ld1 {v20.8b}, [x1], x3 + ld1 {v21.8b}, [x2], x4 + usubl 
v1.8h, v18.8b, v19.8b + ld1 {v22.8b}, [x1], x3 + ld1 {v23.8b}, [x2], x4 + usubl v2.8h, v20.8b, v21.8b + ld1 {v24.8b}, [x1], x3 + ld1 {v25.8b}, [x2], x4 + usubl v3.8h, v22.8b, v23.8b + ld1 {v26.8b}, [x1], x3 + ld1 {v27.8b}, [x2], x4 + usubl v4.8h, v24.8b, v25.8b + ld1 {v28.8b}, [x1], x3 + ld1 {v29.8b}, [x2], x4 + usubl v5.8h, v26.8b, v27.8b + ld1 {v30.8b}, [x1], x3 + ld1 {v31.8b}, [x2], x4 + usubl v6.8h, v28.8b, v29.8b + usubl v7.8h, v30.8b, v31.8b + + DCT8_1D row + transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + DCT8_1D col + + st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64 + st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64 + ret +endfunc + +function x264_sub16x16_dct8_neon, export=1 + mov x7, x30 + bl X(x264_sub8x8_dct8_neon) + sub x1, x1, #FENC_STRIDE*8 - 8 + sub x2, x2, #FDEC_STRIDE*8 - 8 + bl X(x264_sub8x8_dct8_neon) + sub x1, x1, #8 + sub x2, x2, #8 + bl X(x264_sub8x8_dct8_neon) + mov x30, x7 + sub x1, x1, #FENC_STRIDE*8 - 8 + sub x2, x2, #FDEC_STRIDE*8 - 8 + b X(x264_sub8x8_dct8_neon) +endfunc + + +// First part of IDCT (minus final SUMSUB_BA) +.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3 + SUMSUB_AB \d4, \d5, \d0, \d2 + sshr \d7, \d1, #1 + sshr \d6, \d3, #1 + sub \d7, \d7, \d3 + add \d6, \d6, \d1 +.endm + +function x264_add4x4_idct_neon, export=1 + mov x2, #FDEC_STRIDE + ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1] + + IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h + ld1 {v28.s}[0], [x0], x2 + SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h + SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h + + transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19 + + IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h + ld1 {v29.s}[0], [x0], x2 + SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h + SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h + + srshr v0.4h, v0.4h, #6 + srshr v1.4h, v1.4h, #6 + ld1 {v31.s}[0], [x0], x2 + srshr v2.4h, v2.4h, #6 + srshr v3.4h, v3.4h, #6 + ld1 {v30.s}[0], [x0], x2 + + sub x0, x0, x2, lsl #2 + uaddw v0.8h, v0.8h, v28.8b + uaddw v1.8h, v1.8h, v29.8b + uaddw v2.8h, v2.8h, v30.8b + uaddw v3.8h, v3.8h, v31.8b + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + + st1 {v0.s}[0], [x0], x2 + st1 {v1.s}[0], [x0], x2 + st1 {v3.s}[0], [x0], x2 + st1 {v2.s}[0], [x0], x2 + ret +endfunc + +function x264_add8x4_idct_neon, export=1 + ld1 {v0.8h,v1.8h}, [x1], #32 + ld1 {v2.8h,v3.8h}, [x1], #32 + transpose v20.2d, v21.2d, v0.2d, v2.2d + transpose v22.2d, v23.2d, v1.2d, v3.2d + IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h + SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h + + transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7 + + IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h + SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h + SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h + + srshr v0.8h, v0.8h, #6 + ld1 {v28.8b}, [x0], x2 + srshr v1.8h, v1.8h, #6 + ld1 {v29.8b}, [x0], x2 + srshr v2.8h, v2.8h, #6 + ld1 {v30.8b}, [x0], x2 + srshr v3.8h, v3.8h, #6 + ld1 {v31.8b}, [x0], x2 + + sub x0, x0, x2, lsl #2 + uaddw v0.8h, v0.8h, v28.8b + uaddw v1.8h, v1.8h, v29.8b + uaddw v2.8h, v2.8h, v30.8b + uaddw v3.8h, v3.8h, v31.8b + + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + st1 {v0.8b}, [x0], x2 + sqxtun v2.8b, v2.8h + st1 {v1.8b}, [x0], x2 + sqxtun v3.8b, v3.8h + st1 {v2.8b}, [x0], x2 + st1 {v3.8b}, [x0], x2 + ret +endfunc + +function x264_add8x8_idct_neon, export=1 + mov x2, #FDEC_STRIDE + mov x5, x30 + bl X(x264_add8x4_idct_neon) + mov x30, x5 + b X(x264_add8x4_idct_neon) +endfunc + +function x264_add16x16_idct_neon, export=1 + mov x2, #FDEC_STRIDE 
+ mov x5, x30 + bl X(x264_add8x4_idct_neon) + bl X(x264_add8x4_idct_neon) + sub x0, x0, #8*FDEC_STRIDE-8 + bl X(x264_add8x4_idct_neon) + bl X(x264_add8x4_idct_neon) + sub x0, x0, #8 + bl X(x264_add8x4_idct_neon) + bl X(x264_add8x4_idct_neon) + sub x0, x0, #8*FDEC_STRIDE-8 + bl X(x264_add8x4_idct_neon) + mov x30, x5 + b X(x264_add8x4_idct_neon) +endfunc + +.macro IDCT8_1D type + SUMSUB_AB v0.8h, v1.8h, v16.8h, v20.8h // a0/a2 +.ifc \type, row + ld1 {v22.8h,v23.8h}, [x1], #32 +.endif + SUMSUB_SHR 1, v2.8h, v3.8h, v18.8h, v22.8h, v16.8h, v20.8h // a6/a4 + SUMSUB_AB v16.8h, v18.8h, v21.8h, v19.8h + SUMSUB_15 v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h // a7/a1 + SUMSUB_AB v22.8h, v23.8h, v23.8h, v17.8h + SUMSUB_15 v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h // a5/a3 + + SUMSUB_SHR 2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h // b3/b5 + SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h // b1/b7 + + SUMSUB_AB v18.8h, v2.8h, v0.8h, v2.8h // b0/b6 + SUMSUB_AB v19.8h, v3.8h, v1.8h, v3.8h // b2/b4 + + SUMSUB_AB v16.8h, v23.8h, v18.8h, v23.8h + SUMSUB_AB v17.8h, v22.8h, v19.8h, v22.8h + SUMSUB_AB v18.8h, v21.8h, v3.8h, v21.8h + SUMSUB_AB v19.8h, v20.8h, v2.8h, v20.8h +.endm + +function x264_add8x8_idct8_neon, export=1 + mov x2, #FDEC_STRIDE + ld1 {v16.8h,v17.8h}, [x1], #32 + ld1 {v18.8h,v19.8h}, [x1], #32 + ld1 {v20.8h,v21.8h}, [x1], #32 + + IDCT8_1D row + + transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31 + + IDCT8_1D col + + ld1 {v0.8b}, [x0], x2 + srshr v16.8h, v16.8h, #6 + ld1 {v1.8b}, [x0], x2 + srshr v17.8h, v17.8h, #6 + ld1 {v2.8b}, [x0], x2 + srshr v18.8h, v18.8h, #6 + ld1 {v3.8b}, [x0], x2 + srshr v19.8h, v19.8h, #6 + ld1 {v4.8b}, [x0], x2 + srshr v20.8h, v20.8h, #6 + ld1 {v5.8b}, [x0], x2 + srshr v21.8h, v21.8h, #6 + ld1 {v6.8b}, [x0], x2 + srshr v22.8h, v22.8h, #6 + ld1 {v7.8b}, [x0], x2 + srshr v23.8h, v23.8h, #6 + sub x0, x0, x2, lsl #3 + + uaddw v16.8h, v16.8h, v0.8b + uaddw v17.8h, v17.8h, v1.8b + uaddw v18.8h, v18.8h, v2.8b + sqxtun v0.8b, v16.8h + sqxtun v1.8b, v17.8h + sqxtun v2.8b, v18.8h + uaddw v19.8h, v19.8h, v3.8b + st1 {v0.8b}, [x0], x2 + uaddw v20.8h, v20.8h, v4.8b + st1 {v1.8b}, [x0], x2 + uaddw v21.8h, v21.8h, v5.8b + st1 {v2.8b}, [x0], x2 + sqxtun v3.8b, v19.8h + sqxtun v4.8b, v20.8h + uaddw v22.8h, v22.8h, v6.8b + uaddw v23.8h, v23.8h, v7.8b + st1 {v3.8b}, [x0], x2 + sqxtun v5.8b, v21.8h + st1 {v4.8b}, [x0], x2 + sqxtun v6.8b, v22.8h + sqxtun v7.8b, v23.8h + st1 {v5.8b}, [x0], x2 + st1 {v6.8b}, [x0], x2 + st1 {v7.8b}, [x0], x2 + ret +endfunc + +function x264_add16x16_idct8_neon, export=1 + mov x7, x30 + bl X(x264_add8x8_idct8_neon) + sub x0, x0, #8*FDEC_STRIDE-8 + bl X(x264_add8x8_idct8_neon) + sub x0, x0, #8 + bl X(x264_add8x8_idct8_neon) + sub x0, x0, #8*FDEC_STRIDE-8 + mov x30, x7 + b X(x264_add8x8_idct8_neon) +endfunc + +function x264_add8x8_idct_dc_neon, export=1 + mov x2, #FDEC_STRIDE + ld1 {v16.4h}, [x1] + ld1 {v0.8b}, [x0], x2 + srshr v16.4h, v16.4h, #6 + ld1 {v1.8b}, [x0], x2 + dup v20.8h, v16.h[0] + dup v21.8h, v16.h[1] + ld1 {v2.8b}, [x0], x2 + dup v22.8h, v16.h[2] + dup v23.8h, v16.h[3] + ld1 {v3.8b}, [x0], x2 + trn1 v20.2d, v20.2d, v21.2d + ld1 {v4.8b}, [x0], x2 + trn1 v21.2d, v22.2d, v23.2d + ld1 {v5.8b}, [x0], x2 + neg v22.8h, v20.8h + ld1 {v6.8b}, [x0], x2 + neg v23.8h, v21.8h + ld1 {v7.8b}, [x0], x2 + + sub x0, x0, #8*FDEC_STRIDE + + sqxtun v20.8b, v20.8h + sqxtun v21.8b, v21.8h + sqxtun v22.8b, v22.8h + sqxtun v23.8b, v23.8h + + uqadd v0.8b, v0.8b, v20.8b + uqadd v1.8b, v1.8b, v20.8b + uqadd v2.8b, v2.8b, v20.8b + 
uqadd v3.8b, v3.8b, v20.8b + uqadd v4.8b, v4.8b, v21.8b + uqadd v5.8b, v5.8b, v21.8b + uqadd v6.8b, v6.8b, v21.8b + uqadd v7.8b, v7.8b, v21.8b + uqsub v0.8b, v0.8b, v22.8b + uqsub v1.8b, v1.8b, v22.8b + uqsub v2.8b, v2.8b, v22.8b + uqsub v3.8b, v3.8b, v22.8b + uqsub v4.8b, v4.8b, v23.8b + uqsub v5.8b, v5.8b, v23.8b + uqsub v6.8b, v6.8b, v23.8b + uqsub v7.8b, v7.8b, v23.8b + + st1 {v0.8b}, [x0], x2 + st1 {v1.8b}, [x0], x2 + st1 {v2.8b}, [x0], x2 + st1 {v3.8b}, [x0], x2 + st1 {v4.8b}, [x0], x2 + st1 {v5.8b}, [x0], x2 + st1 {v6.8b}, [x0], x2 + st1 {v7.8b}, [x0], x2 + ret +endfunc + +.macro ADD16x4_IDCT_DC dc + ld1 {v4.16b}, [x0], x3 + dup v24.8h, \dc[0] + dup v25.8h, \dc[1] + ld1 {v5.16b}, [x0], x3 + dup v26.8h, \dc[2] + dup v27.8h, \dc[3] + ld1 {v6.16b}, [x0], x3 + trn1 v24.2d, v24.2d, v25.2d + ld1 {v7.16b}, [x0], x3 + trn1 v25.2d, v26.2d, v27.2d + neg v26.8h, v24.8h + neg v27.8h, v25.8h + + sqxtun v20.8b, v24.8h + sqxtun v21.8b, v26.8h + sqxtun2 v20.16b, v25.8h + sqxtun2 v21.16b, v27.8h + + uqadd v4.16b, v4.16b, v20.16b + uqadd v5.16b, v5.16b, v20.16b + uqadd v6.16b, v6.16b, v20.16b + uqadd v7.16b, v7.16b, v20.16b + + uqsub v4.16b, v4.16b, v21.16b + uqsub v5.16b, v5.16b, v21.16b + uqsub v6.16b, v6.16b, v21.16b + st1 {v4.16b}, [x2], x3 + uqsub v7.16b, v7.16b, v21.16b + st1 {v5.16b}, [x2], x3 + st1 {v6.16b}, [x2], x3 + st1 {v7.16b}, [x2], x3 +.endm + +function x264_add16x16_idct_dc_neon, export=1 + mov x2, x0 + mov x3, #FDEC_STRIDE + + ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1] + srshr v0.4h, v0.4h, #6 + srshr v1.4h, v1.4h, #6 + + ADD16x4_IDCT_DC v0.h + srshr v2.4h, v2.4h, #6 + ADD16x4_IDCT_DC v1.h + srshr v3.4h, v3.4h, #6 + ADD16x4_IDCT_DC v2.h + ADD16x4_IDCT_DC v3.h + ret +endfunc + +function x264_sub8x8_dct_dc_neon, export=1 + mov x3, #FENC_STRIDE + mov x4, #FDEC_STRIDE + ld1 {v16.8b}, [x1], x3 + ld1 {v17.8b}, [x2], x4 + usubl v16.8h, v16.8b, v17.8b + ld1 {v18.8b}, [x1], x3 + ld1 {v19.8b}, [x2], x4 + usubl v17.8h, v18.8b, v19.8b + ld1 {v20.8b}, [x1], x3 + ld1 {v21.8b}, [x2], x4 + usubl v18.8h, v20.8b, v21.8b + ld1 {v22.8b}, [x1], x3 + add v0.8h, v16.8h, v17.8h + ld1 {v23.8b}, [x2], x4 + usubl v19.8h, v22.8b, v23.8b + ld1 {v24.8b}, [x1], x3 + add v0.8h, v0.8h, v18.8h + ld1 {v25.8b}, [x2], x4 + usubl v20.8h, v24.8b, v25.8b + ld1 {v26.8b}, [x1], x3 + add v0.8h, v0.8h, v19.8h + ld1 {v27.8b}, [x2], x4 + usubl v21.8h, v26.8b, v27.8b + ld1 {v28.8b}, [x1], x3 + ld1 {v29.8b}, [x2], x4 + usubl v22.8h, v28.8b, v29.8b + ld1 {v30.8b}, [x1], x3 + add v1.8h, v20.8h, v21.8h + ld1 {v31.8b}, [x2], x4 + usubl v23.8h, v30.8b, v31.8b + add v1.8h, v1.8h, v22.8h + add v1.8h, v1.8h, v23.8h + + transpose v2.2d, v3.2d, v0.2d, v1.2d + + add v0.8h, v2.8h, v3.8h + sub v1.8h, v2.8h, v3.8h + + transpose v2.2d, v3.2d, v0.2d, v1.2d + + add v0.8h, v2.8h, v3.8h + sub v1.8h, v2.8h, v3.8h + + transpose v2.2d, v3.2d, v0.2d, v1.2d + + addp v0.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v0.8h + + st1 {v0.4h}, [x0] + ret +endfunc + +function x264_zigzag_scan_4x4_frame_neon, export=1 + movrel x2, scan4x4_frame + ld1 {v0.16b,v1.16b}, [x1] + ld1 {v16.16b,v17.16b}, [x2] + tbl v2.16b, {v0.16b,v1.16b}, v16.16b + tbl v3.16b, {v0.16b,v1.16b}, v17.16b + st1 {v2.16b,v3.16b}, [x0] + ret +endfunc
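The forward-transform entry points above (x264_sub4x4_dct_neon and friends) first compute the pixel differences with usubl, run the DCT_1D butterfly over the rows, transpose, and run it again over the columns. DCT_1D is the standard H.264 4x4 forward core transform; a plain-C sketch of one 1-D pass, with illustrative names:

    /* One 1-D pass of the H.264 4x4 forward core transform, as done by DCT_1D. */
    static void dct_1d( int16_t out[4], const int16_t d[4] )
    {
        int s03 = d[0] + d[3], d03 = d[0] - d[3];
        int s12 = d[1] + d[2], d12 = d[1] - d[2];
        out[0] = s03 + s12;
        out[1] = 2 * d03 + d12;
        out[2] = s03 - s12;
        out[3] = d03 - 2 * d12;
    }

The idct paths invert this with the matching IDCT_1D butterfly, round with srshr #6, and add the result back onto the reconstructed pixels; the zigzag function at the end is a straight table lookup (tbl) through the 32-byte scan4x4_frame permutation.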
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/dct.h
Added
@@ -0,0 +1,52 @@ +/***************************************************************************** + * dct.h: AArch64 transform and zigzag + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad <lessen42@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_AARCH64_DCT_H +#define X264_AARCH64_DCT_H + +void x264_dct4x4dc_neon( int16_t d[16] ); +void x264_idct4x4dc_neon( int16_t d[16] ); + +void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); + +void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] ); +void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] ); +void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] ); + +void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] ); +void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] ); +void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 ); + +void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); + +void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] ); +void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] ); + +void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] ); + +#endif
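These prototypes exist so that common/dct.c can install the NEON routines into x264's dct function-pointer table when NEON support is detected at runtime. Roughly, following x264's usual *_init pattern (field and flag names recalled from the C side, not taken from this diff):

    if( cpu&X264_CPU_NEON )
    {
        dctf->sub4x4_dct  = x264_sub4x4_dct_neon;
        dctf->add4x4_idct = x264_add4x4_idct_neon;
        /* ...and so on for the remaining entries declared above... */
    }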
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/deblock-a.S
Added
@@ -0,0 +1,392 @@ +/***************************************************************************** + * deblock.S: aarch64 deblocking + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: Mans Rullgard <mans@mansr.com> + * Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" + +.macro h264_loop_filter_start + cmp w2, #0 + ldr w6, [x4] + ccmp w3, #0, #0, ne + mov v24.s[0], w6 + and w6, w6, w6, lsl #16 + b.eq 1f + ands w6, w6, w6, lsl #8 + b.ge 2f +1: + ret +2: +.endm + +.macro h264_loop_filter_luma + dup v22.16b, w2 // alpha + uxtl v24.8h, v24.8b + uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0) + uxtl v24.4s, v24.4h + uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0) + sli v24.8h, v24.8h, #8 + uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0) + sli v24.4s, v24.4s, #16 + cmhi v21.16b, v22.16b, v21.16b // < alpha + dup v22.16b, w3 // beta + cmlt v23.16b, v24.16b, #0 + cmhi v28.16b, v22.16b, v28.16b // < beta + cmhi v30.16b, v22.16b, v30.16b // < beta + bic v21.16b, v21.16b, v23.16b + uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0) + and v21.16b, v21.16b, v28.16b + uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0) + cmhi v17.16b, v22.16b, v17.16b // < beta + and v21.16b, v21.16b, v30.16b + cmhi v19.16b, v22.16b, v19.16b // < beta + and v17.16b, v17.16b, v21.16b + and v19.16b, v19.16b, v21.16b + and v24.16b, v24.16b, v21.16b + urhadd v28.16b, v16.16b, v0.16b + sub v21.16b, v24.16b, v17.16b + uqadd v23.16b, v18.16b, v24.16b + uhadd v20.16b, v20.16b, v28.16b + sub v21.16b, v21.16b, v19.16b + uhadd v28.16b, v4.16b, v28.16b + umin v23.16b, v23.16b, v20.16b + uqsub v22.16b, v18.16b, v24.16b + uqadd v4.16b, v2.16b, v24.16b + umax v23.16b, v23.16b, v22.16b + uqsub v22.16b, v2.16b, v24.16b + umin v28.16b, v4.16b, v28.16b + uxtl v4.8h, v0.8b + umax v28.16b, v28.16b, v22.16b + uxtl2 v20.8h, v0.16b + usubw v4.8h, v4.8h, v16.8b + usubw2 v20.8h, v20.8h, v16.16b + shl v4.8h, v4.8h, #2 + shl v20.8h, v20.8h, #2 + uaddw v4.8h, v4.8h, v18.8b + uaddw2 v20.8h, v20.8h, v18.16b + usubw v4.8h, v4.8h, v2.8b + usubw2 v20.8h, v20.8h, v2.16b + rshrn v4.8b, v4.8h, #3 + rshrn2 v4.16b, v20.8h, #3 + bsl v17.16b, v23.16b, v18.16b + bsl v19.16b, v28.16b, v2.16b + neg v23.16b, v21.16b + uxtl v28.8h, v16.8b + smin v4.16b, v4.16b, v21.16b + uxtl2 v21.8h, v16.16b + smax v4.16b, v4.16b, v23.16b + uxtl v22.8h, v0.8b + uxtl2 v24.8h, v0.16b + saddw v28.8h, v28.8h, v4.8b + saddw2 v21.8h, v21.8h, v4.16b + ssubw v22.8h, v22.8h, v4.8b + ssubw2 v24.8h, v24.8h, v4.16b + sqxtun v16.8b, v28.8h + sqxtun2 v16.16b, v21.8h + sqxtun v0.8b, v22.8h 
+ sqxtun2 v0.16b, v24.8h +.endm + +function x264_deblock_v_luma_neon, export=1 + h264_loop_filter_start + + ld1 {v0.16b}, [x0], x1 + ld1 {v2.16b}, [x0], x1 + ld1 {v4.16b}, [x0], x1 + sub x0, x0, x1, lsl #2 + sub x0, x0, x1, lsl #1 + ld1 {v20.16b}, [x0], x1 + ld1 {v18.16b}, [x0], x1 + ld1 {v16.16b}, [x0], x1 + + h264_loop_filter_luma + + sub x0, x0, x1, lsl #1 + st1 {v17.16b}, [x0], x1 + st1 {v16.16b}, [x0], x1 + st1 {v0.16b}, [x0], x1 + st1 {v19.16b}, [x0] + + ret +endfunc + +function x264_deblock_h_luma_neon, export=1 + h264_loop_filter_start + + sub x0, x0, #4 + ld1 {v6.8b}, [x0], x1 + ld1 {v20.8b}, [x0], x1 + ld1 {v18.8b}, [x0], x1 + ld1 {v16.8b}, [x0], x1 + ld1 {v0.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + ld1 {v4.8b}, [x0], x1 + ld1 {v26.8b}, [x0], x1 + ld1 {v6.d}[1], [x0], x1 + ld1 {v20.d}[1], [x0], x1 + ld1 {v18.d}[1], [x0], x1 + ld1 {v16.d}[1], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v2.d}[1], [x0], x1 + ld1 {v4.d}[1], [x0], x1 + ld1 {v26.d}[1], [x0], x1 + + transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23 + + h264_loop_filter_luma + + transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27 + + sub x0, x0, x1, lsl #4 + add x0, x0, #2 + st1 {v17.s}[0], [x0], x1 + st1 {v16.s}[0], [x0], x1 + st1 {v0.s}[0], [x0], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v17.s}[1], [x0], x1 + st1 {v16.s}[1], [x0], x1 + st1 {v0.s}[1], [x0], x1 + st1 {v19.s}[1], [x0], x1 + st1 {v17.s}[2], [x0], x1 + st1 {v16.s}[2], [x0], x1 + st1 {v0.s}[2], [x0], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v17.s}[3], [x0], x1 + st1 {v16.s}[3], [x0], x1 + st1 {v0.s}[3], [x0], x1 + st1 {v19.s}[3], [x0], x1 + + ret +endfunc + +.macro h264_loop_filter_chroma + dup v22.16b, w2 // alpha + uxtl v24.8h, v24.8b + uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0) + uxtl v4.8h, v0.8b + uxtl2 v5.8h, v0.16b + uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0) + usubw v4.8h, v4.8h, v16.8b + usubw2 v5.8h, v5.8h, v16.16b + sli v24.8h, v24.8h, #8 + shl v4.8h, v4.8h, #2 + shl v5.8h, v5.8h, #2 + uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0) + uxtl v24.4s, v24.4h + uaddw v4.8h, v4.8h, v18.8b + uaddw2 v5.8h, v5.8h, v18.16b + cmhi v26.16b, v22.16b, v26.16b // < alpha + usubw v4.8h, v4.8h, v2.8b + usubw2 v5.8h, v5.8h, v2.16b + sli v24.4s, v24.4s, #16 + dup v22.16b, w3 // beta + rshrn v4.8b, v4.8h, #3 + rshrn2 v4.16b, v5.8h, #3 + cmhi v28.16b, v22.16b, v28.16b // < beta + cmhi v30.16b, v22.16b, v30.16b // < beta + smin v4.16b, v4.16b, v24.16b + neg v25.16b, v24.16b + and v26.16b, v26.16b, v28.16b + smax v4.16b, v4.16b, v25.16b + and v26.16b, v26.16b, v30.16b + uxtl v22.8h, v0.8b + uxtl2 v23.8h, v0.16b + and v4.16b, v4.16b, v26.16b + uxtl v28.8h, v16.8b + uxtl2 v29.8h, v16.16b + saddw v28.8h, v28.8h, v4.8b + saddw2 v29.8h, v29.8h, v4.16b + ssubw v22.8h, v22.8h, v4.8b + ssubw2 v23.8h, v23.8h, v4.16b + sqxtun v16.8b, v28.8h + sqxtun v0.8b, v22.8h + sqxtun2 v16.16b, v29.8h + sqxtun2 v0.16b, v23.8h +.endm + +function x264_deblock_v_chroma_neon, export=1 + h264_loop_filter_start + + sub x0, x0, x1, lsl #1 + ld1 {v18.16b}, [x0], x1 + ld1 {v16.16b}, [x0], x1 + ld1 {v0.16b}, [x0], x1 + ld1 {v2.16b}, [x0] + + h264_loop_filter_chroma + + sub x0, x0, x1, lsl #1 + st1 {v16.16b}, [x0], x1 + st1 {v0.16b}, [x0], x1 + + ret +endfunc + +function x264_deblock_h_chroma_neon, export=1 + h264_loop_filter_start + + sub x0, x0, #4 + ld1 {v18.d}[0], [x0], x1 + ld1 {v16.d}[0], [x0], x1 + ld1 {v0.d}[0], [x0], x1 + ld1 {v2.d}[0], [x0], x1 + ld1 {v18.d}[1], [x0], x1 + ld1 {v16.d}[1], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v2.d}[1], [x0], x1 + + transpose4x8.h v18, v16, v0, v2, 
v28, v29, v30, v31 + + h264_loop_filter_chroma + + transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31 + + sub x0, x0, x1, lsl #3 + st1 {v18.d}[0], [x0], x1 + st1 {v16.d}[0], [x0], x1 + st1 {v0.d}[0], [x0], x1 + st1 {v2.d}[0], [x0], x1 + st1 {v18.d}[1], [x0], x1 + st1 {v16.d}[1], [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v2.d}[1], [x0], x1 + + ret +endfunc + + +//static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], +// int8_t ref[2][X264_SCAN8_LUMA_SIZE], +// int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], +// uint8_t bs[2][8][4], int mvy_limit, +// int bframe ) +function x264_deblock_strength_neon, export=1 + movi v4.16b, #0 + lsl w4, w4, #8 + add x3, x3, #32 + sub w4, w4, #(1<<8)-3 + movi v5.16b, #0 + dup v6.8h, w4 + mov x6, #-32 + +bframe: + // load bytes ref + add x2, x2, #16 + ld1 {v31.d}[1], [x1], #8 + ld1 {v1.16b}, [x1], #16 + movi v0.16b, #0 + ld1 {v2.16b}, [x1], #16 + ext v3.16b, v0.16b, v1.16b, #15 + ext v0.16b, v0.16b, v2.16b, #15 + unzip v21.4s, v22.4s, v1.4s, v2.4s + unzip v23.4s, v20.4s, v3.4s, v0.4s + ext v21.16b, v31.16b, v22.16b, #12 + + eor v0.16b, v20.16b, v22.16b + eor v1.16b, v21.16b, v22.16b + orr v4.16b, v4.16b, v0.16b + orr v5.16b, v5.16b, v1.16b + + ld1 {v21.8h}, [x2], #16 // mv + 0x10 + ld1 {v19.8h}, [x2], #16 // mv + 0x20 + ld1 {v22.8h}, [x2], #16 // mv + 0x30 + ld1 {v18.8h}, [x2], #16 // mv + 0x40 + ld1 {v23.8h}, [x2], #16 // mv + 0x50 + ext v19.16b, v19.16b, v22.16b, #12 + ext v18.16b, v18.16b, v23.16b, #12 + sabd v0.8h, v22.8h, v19.8h + ld1 {v19.8h}, [x2], #16 // mv + 0x60 + sabd v1.8h, v23.8h, v18.8h + ld1 {v24.8h}, [x2], #16 // mv + 0x70 + uqxtn v0.8b, v0.8h + ld1 {v18.8h}, [x2], #16 // mv + 0x80 + ld1 {v25.8h}, [x2], #16 // mv + 0x90 + uqxtn2 v0.16b, v1.8h + ext v19.16b, v19.16b, v24.16b, #12 + ext v18.16b, v18.16b, v25.16b, #12 + sabd v1.8h, v24.8h, v19.8h + sabd v2.8h, v25.8h, v18.8h + uqxtn v1.8b, v1.8h + uqxtn2 v1.16b, v2.8h + + uqsub v0.16b, v0.16b, v6.16b + uqsub v1.16b, v1.16b, v6.16b + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h + + sabd v1.8h, v22.8h, v23.8h + orr v4.16b, v4.16b, v0.16b + + sabd v0.8h, v21.8h, v22.8h + sabd v2.8h, v23.8h, v24.8h + sabd v3.8h, v24.8h, v25.8h + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h + uqxtn v1.8b, v2.8h + uqxtn2 v1.16b, v3.8h + + uqsub v0.16b, v0.16b, v6.16b + uqsub v1.16b, v1.16b, v6.16b + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h + subs w5, w5, #1 + orr v5.16b, v5.16b, v0.16b + b.eq bframe + + movi v6.16b, #1 + // load bytes nnz + ld1 {v31.d}[1], [x0], #8 + ld1 {v1.16b}, [x0], #16 + movi v0.16b, #0 + ld1 {v2.16b}, [x0], #16 + ext v3.16b, v0.16b, v1.16b, #15 + ext v0.16b, v0.16b, v2.16b, #15 + unzip v21.4s, v22.4s, v1.4s, v2.4s + unzip v23.4s, v20.4s, v3.4s, v0.4s + ext v21.16b, v31.16b, v22.16b, #12 + + movrel x7, transpose_table + ld1 {v7.16b}, [x7] + orr v0.16b, v20.16b, v22.16b + orr v1.16b, v21.16b, v22.16b + umin v0.16b, v0.16b, v6.16b + umin v1.16b, v1.16b, v6.16b + umin v4.16b, v4.16b, v6.16b // mv ? 1 : 0 + umin v5.16b, v5.16b, v6.16b + add v0.16b, v0.16b, v0.16b // nnz ? 2 : 0 + add v1.16b, v1.16b, v1.16b + umax v4.16b, v4.16b, v0.16b + umax v5.16b, v5.16b, v1.16b + tbl v6.16b, {v4.16b}, v7.16b + st1 {v5.16b}, [x3], x6 // bs[1] + st1 {v6.16b}, [x3] // bs[0] + ret +endfunc + +const transpose_table + .byte 0, 4, 8, 12 + .byte 1, 5, 9, 13 + .byte 2, 6, 10, 14 + .byte 3, 7, 11, 15 +endconst
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/mc-a.S
Added
@@ -0,0 +1,1365 @@ +/***************************************************************************** + * mc.S: aarch64 motion compensation + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> + * Mans Rullgard <mans@mansr.com> + * Stefan Groenroos <stefan.gronroos@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" + +// note: prefetch stuff assumes 64-byte cacheline + +// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity ) +function x264_prefetch_ref_aarch64, export=1 + cmp w2, #1 + csel x2, xzr, x1, eq + add x0, x0, #64 + add x0, x0, x2, lsl #3 + + lsl x2, x1, #1 + add x3, x1, x1, lsl #1 + add x4, x0, x1, lsl #2 + + prfm pldl1strm, [x0] + prfm pldl1strm, [x0, x1] + prfm pldl1strm, [x0, x2] + prfm pldl1strm, [x0, x3] + prfm pldl1strm, [x4] + prfm pldl1strm, [x4, x1] + prfm pldl1strm, [x4, x2] + prfm pldl1strm, [x4, x3] + ret +endfunc + +// void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y, +// uint8_t *pix_uv, intptr_t stride_uv, int mb_x ) +.macro x264_prefetch_fenc sub +function x264_prefetch_fenc_\sub\()_aarch64, export=1 + and w6, w5, #3 + and w7, w5, #3 + mul x6, x6, x1 + mul x7, x7, x3 + add x0, x0, #64 + add x2, x2, #64 + + add x0, x0, x6, lsl #2 + add x6, x0, x1, lsl #1 + prfm pldl1strm, [x0] + prfm pldl1strm, [x0, x1] + prfm pldl1strm, [x6] + prfm pldl1strm, [x6, x1] + + add x2, x2, x7, lsl #1 + prfm pldl1strm, [x2] + prfm pldl1strm, [x2, x3] +.ifc \sub, 422 + add x7, x2, x3, lsl #1 + prfm pldl1strm, [x7] + prfm pldl1strm, [x7, x3] +.endif + ret +endfunc +.endm + +x264_prefetch_fenc 420 +x264_prefetch_fenc 422 + +// void pixel_avg( uint8_t *dst, intptr_t dst_stride, +// uint8_t *src1, intptr_t src1_stride, +// uint8_t *src2, intptr_t src2_stride, int weight ); +.macro AVGH w h +function x264_pixel_avg_\w\()x\h\()_neon, export=1 + mov w10, #64 + cmp w6, #32 + mov w9, #\h + b.eq pixel_avg_w\w\()_neon + subs w7, w10, w6 + b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64 + cmp w6, #0 + b.ge pixel_avg_weight_w\w\()_add_add_neon + b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0 +endfunc +.endm + +AVGH 4, 2 +AVGH 4, 4 +AVGH 4, 8 +AVGH 4, 16 +AVGH 8, 4 +AVGH 8, 8 +AVGH 8, 16 +AVGH 16, 8 +AVGH 16, 16 + +// 0 < weight < 64 +.macro load_weights_add_add + mov w6, w6 +.endm +.macro weight_add_add dst, s1, s2, h= +.ifc \h, 2 + umull2 \dst, \s1, v30.16b + umlal2 \dst, \s2, v31.16b +.else + umull \dst, \s1, v30.8b + umlal \dst, \s2, v31.8b +.endif +.endm + +// weight > 64 +.macro load_weights_add_sub + 
neg w7, w7 +.endm +.macro weight_add_sub dst, s1, s2, h= +.ifc \h, 2 + umull2 \dst, \s1, v30.16b + umlsl2 \dst, \s2, v31.16b +.else + umull \dst, \s1, v30.8b + umlsl \dst, \s2, v31.8b +.endif +.endm + +// weight < 0 +.macro load_weights_sub_add + neg w6, w6 +.endm +.macro weight_sub_add dst, s1, s2, h= +.ifc \h, 2 + umull2 \dst, \s2, v31.16b + umlsl2 \dst, \s1, v30.16b +.else + umull \dst, \s2, v31.8b + umlsl \dst, \s1, v30.8b +.endif +.endm + +.macro AVG_WEIGHT ext +function pixel_avg_weight_w4_\ext\()_neon + load_weights_\ext + dup v30.8b, w6 + dup v31.8b, w7 +1: // height loop + subs w9, w9, #2 + ld1 {v0.s}[0], [x2], x3 + ld1 {v1.s}[0], [x4], x5 + weight_\ext v4.8h, v0.8b, v1.8b + ld1 {v2.s}[0], [x2], x3 + ld1 {v3.s}[0], [x4], x5 + sqrshrun v0.8b, v4.8h, #6 + weight_\ext v5.8h, v2.8b, v3.8b + st1 {v0.s}[0], [x0], x1 + sqrshrun v1.8b, v5.8h, #6 + st1 {v1.s}[0], [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg_weight_w8_\ext\()_neon + load_weights_\ext + dup v30.8b, w6 + dup v31.8b, w7 +1: // height loop + subs w9, w9, #4 + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x4], x5 + weight_\ext v16.8h, v0.8b, v1.8b + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x4], x5 + weight_\ext v17.8h, v2.8b, v3.8b + ld1 {v4.8b}, [x2], x3 + ld1 {v5.8b}, [x4], x5 + weight_\ext v18.8h, v4.8b, v5.8b + ld1 {v6.8b}, [x2], x3 + ld1 {v7.8b}, [x4], x5 + weight_\ext v19.8h, v6.8b, v7.8b + sqrshrun v0.8b, v16.8h, #6 + sqrshrun v1.8b, v17.8h, #6 + sqrshrun v2.8b, v18.8h, #6 + sqrshrun v3.8b, v19.8h, #6 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg_weight_w16_\ext\()_neon + load_weights_\ext + dup v30.16b, w6 + dup v31.16b, w7 +1: // height loop + subs w9, w9, #2 + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x4], x5 + weight_\ext v16.8h, v0.8b, v1.8b + weight_\ext v17.8h, v0.16b, v1.16b, 2 + ld1 {v2.16b}, [x2], x3 + ld1 {v3.16b}, [x4], x5 + weight_\ext v18.8h, v2.8b, v3.8b + weight_\ext v19.8h, v2.16b, v3.16b, 2 + sqrshrun v0.8b, v16.8h, #6 + sqrshrun v1.8b, v18.8h, #6 + sqrshrun2 v0.16b, v17.8h, #6 + sqrshrun2 v1.16b, v19.8h, #6 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x0], x1 + b.gt 1b + ret +endfunc +.endm + +AVG_WEIGHT add_add +AVG_WEIGHT add_sub +AVG_WEIGHT sub_add + +function pixel_avg_w4_neon +1: subs w9, w9, #2 + ld1 {v0.s}[0], [x2], x3 + ld1 {v2.s}[0], [x4], x5 + urhadd v0.8b, v0.8b, v2.8b + ld1 {v1.s}[0], [x2], x3 + ld1 {v3.s}[0], [x4], x5 + urhadd v1.8b, v1.8b, v3.8b + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg_w8_neon +1: subs w9, w9, #4 + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x4], x5 + ld1 {v2.8b}, [x2], x3 + urhadd v0.8b, v0.8b, v1.8b + ld1 {v3.8b}, [x4], x5 + st1 {v0.8b}, [x0], x1 + ld1 {v4.8b}, [x2], x3 + urhadd v1.8b, v2.8b, v3.8b + ld1 {v5.8b}, [x4], x5 + st1 {v1.8b}, [x0], x1 + ld1 {v6.8b}, [x2], x3 + ld1 {v7.8b}, [x4], x5 + urhadd v2.8b, v4.8b, v5.8b + urhadd v3.8b, v6.8b, v7.8b + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg_w16_neon +1: subs w9, w9, #4 + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x4], x5 + ld1 {v2.16b}, [x2], x3 + urhadd v0.16b, v0.16b, v1.16b + ld1 {v3.16b}, [x4], x5 + st1 {v0.16b}, [x0], x1 + ld1 {v4.16b}, [x2], x3 + urhadd v1.16b, v2.16b, v3.16b + ld1 {v5.16b}, [x4], x5 + st1 {v1.16b}, [x0], x1 + ld1 {v6.16b}, [x2], x3 + ld1 {v7.16b}, [x4], x5 + urhadd v2.16b, v4.16b, v5.16b + urhadd v3.16b, v6.16b, v7.16b + st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function 
x264_pixel_avg2_w4_neon, export=1 +1: + subs w5, w5, #2 + ld1 {v0.s}[0], [x2], x3 + ld1 {v2.s}[0], [x4], x3 + urhadd v0.8b, v0.8b, v2.8b + ld1 {v1.s}[0], [x2], x3 + ld1 {v3.s}[0], [x4], x3 + urhadd v1.8b, v1.8b, v3.8b + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 + b.gt 1b + ret +endfunc + +function x264_pixel_avg2_w8_neon, export=1 +1: + subs w5, w5, #2 + ld1 {v0.8b}, [x2], x3 + ld1 {v2.8b}, [x4], x3 + urhadd v0.8b, v0.8b, v2.8b + ld1 {v1.8b}, [x2], x3 + ld1 {v3.8b}, [x4], x3 + urhadd v1.8b, v1.8b, v3.8b + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_pixel_avg2_w16_neon, export=1 +1: + subs w5, w5, #2 + ld1 {v0.16b}, [x2], x3 + ld1 {v2.16b}, [x4], x3 + urhadd v0.16b, v0.16b, v2.16b + ld1 {v1.16b}, [x2], x3 + ld1 {v3.16b}, [x4], x3 + urhadd v1.16b, v1.16b, v3.16b + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_pixel_avg2_w20_neon, export=1 + sub x1, x1, #16 +1: + subs w5, w5, #2 + ld1 {v0.16b,v1.16b}, [x2], x3 + ld1 {v2.16b,v3.16b}, [x4], x3 + urhadd v0.16b, v0.16b, v2.16b + urhadd v1.8b, v1.8b, v3.8b + ld1 {v4.16b,v5.16b}, [x2], x3 + ld1 {v6.16b,v7.16b}, [x4], x3 + urhadd v4.16b, v4.16b, v6.16b + urhadd v5.8b, v5.8b, v7.8b + st1 {v0.16b}, [x0], #16 + st1 {v1.s}[0], [x0], x1 + st1 {v4.16b}, [x0], #16 + st1 {v5.s}[0], [x0], x1 + b.gt 1b + ret +endfunc + +.macro weight_prologue type + mov w9, w5 // height +.ifc \type, full + ldr w12, [x4, #32] // denom +.endif + ldp w4, w5, [x4, #32+4] // scale, offset + dup v0.16b, w4 + dup v1.8h, w5 +.ifc \type, full + neg w12, w12 + dup v2.8h, w12 +.endif +.endm + +// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst, +// intptr_t dst_stride, const x264_weight_t *weight, int h ) +function x264_mc_weight_w20_neon, export=1 + weight_prologue full + sub x1, x1, #16 +1: + subs w9, w9, #2 + ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3 + ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3 + umull v22.8h, v16.8b, v0.8b + umull v23.8h, v17.8b, v0.8b + zip1 v18.2s, v18.2s, v21.2s + umull v25.8h, v19.8b, v0.8b + umull v26.8h, v20.8b, v0.8b + umull v24.8h, v18.8b, v0.8b + srshl v22.8h, v22.8h, v2.8h + srshl v23.8h, v23.8h, v2.8h + srshl v24.8h, v24.8h, v2.8h + srshl v25.8h, v25.8h, v2.8h + srshl v26.8h, v26.8h, v2.8h + add v22.8h, v22.8h, v1.8h + add v23.8h, v23.8h, v1.8h + add v24.8h, v24.8h, v1.8h + add v25.8h, v25.8h, v1.8h + add v26.8h, v26.8h, v1.8h + sqxtun v4.8b, v22.8h + sqxtun2 v4.16b, v23.8h + sqxtun v6.8b, v24.8h + sqxtun v5.8b, v25.8h + sqxtun2 v5.16b, v26.8h + st1 {v4.16b}, [x0], #16 + st1 {v6.s}[0], [x0], x1 + st1 {v5.16b}, [x0], #16 + st1 {v6.s}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w16_neon, export=1 + weight_prologue full +weight16_loop: +1: + subs w9, w9, #2 + ld1 {v4.16b}, [x2], x3 + ld1 {v5.16b}, [x2], x3 + umull v22.8h, v4.8b, v0.8b + umull2 v23.8h, v4.16b, v0.16b + umull v24.8h, v5.8b, v0.8b + umull2 v25.8h, v5.16b, v0.16b + srshl v22.8h, v22.8h, v2.8h + srshl v23.8h, v23.8h, v2.8h + srshl v24.8h, v24.8h, v2.8h + srshl v25.8h, v25.8h, v2.8h + add v22.8h, v22.8h, v1.8h + add v23.8h, v23.8h, v1.8h + add v24.8h, v24.8h, v1.8h + add v25.8h, v25.8h, v1.8h + sqxtun v4.8b, v22.8h + sqxtun2 v4.16b, v23.8h + sqxtun v5.8b, v24.8h + sqxtun2 v5.16b, v25.8h + st1 {v4.16b}, [x0], x1 + st1 {v5.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w8_neon, export=1 + weight_prologue full +1: + subs w9, w9, #2 + ld1 {v16.8b}, [x2], x3 + ld1 {v17.8b}, [x2], x3 + umull v4.8h, v16.8b, v0.8b + umull v5.8h, v17.8b, v0.8b + srshl v4.8h, v4.8h, 
v2.8h + srshl v5.8h, v5.8h, v2.8h + add v4.8h, v4.8h, v1.8h + add v5.8h, v5.8h, v1.8h + sqxtun v16.8b, v4.8h + sqxtun v17.8b, v5.8h + st1 {v16.8b}, [x0], x1 + st1 {v17.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w4_neon, export=1 + weight_prologue full +1: + subs w9, w9, #2 + ld1 {v16.s}[0], [x2], x3 + ld1 {v16.s}[1], [x2], x3 + umull v4.8h, v16.8b, v0.8b + srshl v4.8h, v4.8h, v2.8h + add v4.8h, v4.8h, v1.8h + sqxtun v16.8b, v4.8h + st1 {v16.s}[0], [x0], x1 + st1 {v16.s}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w20_nodenom_neon, export=1 + weight_prologue nodenom + sub x1, x1, #16 +1: + subs w9, w9, #2 + ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3 + mov v27.16b, v1.16b + mov v28.16b, v1.16b + ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3 + mov v31.16b, v1.16b + mov v29.16b, v1.16b + mov v30.16b, v1.16b + zip1 v18.2s, v18.2s, v21.2s + umlal v27.8h, v16.8b, v0.8b + umlal v28.8h, v17.8b, v0.8b + umlal v31.8h, v18.8b, v0.8b + umlal v29.8h, v19.8b, v0.8b + umlal v30.8h, v20.8b, v0.8b + sqxtun v4.8b, v27.8h + sqxtun2 v4.16b, v28.8h + sqxtun v5.8b, v29.8h + sqxtun2 v5.16b, v30.8h + sqxtun v6.8b, v31.8h + st1 {v4.16b}, [x0], #16 + st1 {v6.s}[0], [x0], x1 + st1 {v5.16b}, [x0], #16 + st1 {v6.s}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w16_nodenom_neon, export=1 + weight_prologue nodenom +1: + subs w9, w9, #2 + ld1 {v6.16b}, [x2], x3 + mov v27.16b, v1.16b + mov v28.16b, v1.16b + ld1 {v7.16b}, [x2], x3 + mov v29.16b, v1.16b + mov v30.16b, v1.16b + umlal v27.8h, v6.8b, v0.8b + umlal2 v28.8h, v6.16b, v0.16b + umlal v29.8h, v7.8b, v0.8b + umlal2 v30.8h, v7.16b, v0.16b + sqxtun v4.8b, v27.8h + sqxtun2 v4.16b, v28.8h + sqxtun v5.8b, v29.8h + sqxtun2 v5.16b, v30.8h + st1 {v4.16b}, [x0], x1 + st1 {v5.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w8_nodenom_neon, export=1 + weight_prologue nodenom +1: + subs w9, w9, #2 + ld1 {v16.8b}, [x2], x3 + mov v27.16b, v1.16b + ld1 {v17.8b}, [x2], x3 + mov v29.16b, v1.16b + umlal v27.8h, v16.8b, v0.8b + umlal v29.8h, v17.8b, v0.8b + sqxtun v4.8b, v27.8h + sqxtun v5.8b, v29.8h + st1 {v4.8b}, [x0], x1 + st1 {v5.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w4_nodenom_neon, export=1 + weight_prologue nodenom +1: + subs w9, w9, #2 + ld1 {v16.s}[0], [x2], x3 + ld1 {v16.s}[1], [x2], x3 + mov v27.16b, v1.16b + umlal v27.8h, v16.8b, v0.8b + sqxtun v4.8b, v27.8h + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x0], x1 + b.gt 1b + ret +endfunc + +.macro weight_simple_prologue + ldr w6, [x4] // offset + dup v1.16b, w6 +.endm + +.macro weight_simple name op +function x264_mc_weight_w20_\name\()_neon, export=1 + weight_simple_prologue +1: + subs w5, w5, #2 + ldr s18, [x2, #16] + ld1 {v16.16b}, [x2], x3 + ldr s19, [x2, #16] + ld1 {v17.16b}, [x2], x3 + \op v18.8b, v18.8b, v1.8b + \op v16.16b, v16.16b, v1.16b + \op v19.8b, v19.8b, v1.8b + \op v17.16b, v17.16b, v1.16b + str s18, [x0, #16] + st1 {v16.16b}, [x0], x1 + str s19, [x0, #16] + st1 {v17.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w16_\name\()_neon, export=1 + weight_simple_prologue +1: + subs w5, w5, #2 + ld1 {v16.16b}, [x2], x3 + ld1 {v17.16b}, [x2], x3 + \op v16.16b, v16.16b, v1.16b + \op v17.16b, v17.16b, v1.16b + st1 {v16.16b}, [x0], x1 + st1 {v17.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w8_\name\()_neon, export=1 + weight_simple_prologue +1: + subs w5, w5, #2 + ld1 {v16.8b}, [x2], x3 + ld1 {v17.8b}, [x2], x3 + \op v16.8b, v16.8b, v1.8b + \op v17.8b, v17.8b, v1.8b + st1 {v16.8b}, [x0], 
x1 + st1 {v17.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_weight_w4_\name\()_neon, export=1 + weight_simple_prologue +1: + subs w5, w5, #2 + ld1 {v16.s}[0], [x2], x3 + ld1 {v16.s}[1], [x2], x3 + \op v16.8b, v16.8b, v1.8b + st1 {v16.s}[0], [x0], x1 + st1 {v16.s}[1], [x0], x1 + b.gt 1b + ret +endfunc +.endm + +weight_simple offsetadd, uqadd +weight_simple offsetsub, uqsub + + +// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height ) +function x264_mc_copy_w4_neon, export=1 +1: + subs w4, w4, #4 + ld1 {v0.s}[0], [x2], x3 + ld1 {v1.s}[0], [x2], x3 + ld1 {v2.s}[0], [x2], x3 + ld1 {v3.s}[0], [x2], x3 + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_copy_w8_neon, export=1 +1: subs w4, w4, #4 + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x2], x3 + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x2], x3 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function x264_mc_copy_w16_neon, export=1 +1: subs w4, w4, #4 + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x2], x3 + ld1 {v2.16b}, [x2], x3 + ld1 {v3.16b}, [x2], x3 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x0], x1 + st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +// void x264_mc_chroma_neon( uint8_t *dst_u, uint8_t *dst_v, +// intptr_t i_dst_stride, +// uint8_t *src, intptr_t i_src_stride, +// int dx, int dy, int i_width, int i_height ); +function x264_mc_chroma_neon, export=1 + ldr w15, [sp] // height + sbfx x12, x6, #3, #29 // asr(3) and sign extend + sbfx x11, x5, #3, #29 // asr(3) and sign extend + cmp w7, #4 + mul x12, x12, x4 + add x3, x3, x11, lsl #1 + + and w5, w5, #7 + and w6, w6, #7 + + add x3, x3, x12 + + //pld [x3] + //pld [x3, x4] + + b.gt mc_chroma_w8_neon + b.eq mc_chroma_w4_neon +endfunc + +.macro CHROMA_MC_START r00, r01, r10, r11 + mul w12, w5, w6 // cD = d8x *d8y + lsl w13, w5, #3 + add w9, w12, #64 + lsl w14, w6, #3 + tst w12, w12 + sub w9, w9, w13 + sub w10, w13, w12 // cB = d8x *(8-d8y); + sub w11, w14, w12 // cC = (8-d8x)*d8y + sub w9, w9, w14 // cA = (8-d8x)*(8-d8y); +.endm + +.macro CHROMA_MC width, vsize +function mc_chroma_w\width\()_neon +// since the element size varies, there's a different index for the 2nd store +.if \width == 4 + .set st2, 1 +.else + .set st2, 2 +.endif + CHROMA_MC_START + b.eq 2f + + ld2 {v28.8b,v29.8b}, [x3], x4 + dup v0.8b, w9 // cA + dup v1.8b, w10 // cB + + ext v6.8b, v28.8b, v6.8b, #1 + ext v7.8b, v29.8b, v7.8b, #1 + + ld2 {v30.8b,v31.8b}, [x3], x4 + dup v2.8b, w11 // cC + dup v3.8b, w12 // cD + + ext v22.8b, v30.8b, v22.8b, #1 + ext v23.8b, v31.8b, v23.8b, #1 + + trn1 v0.2s, v0.2s, v1.2s + trn1 v2.2s, v2.2s, v3.2s + + trn1 v4.2s, v28.2s, v6.2s + trn1 v5.2s, v29.2s, v7.2s + trn1 v20.2s, v30.2s, v22.2s + trn1 v21.2s, v31.2s, v23.2s +1: // height loop, interpolate xy + subs w15, w15, #2 + umull v16.8h, v4.8b, v0.8b + umlal v16.8h, v20.8b, v2.8b + umull v17.8h, v5.8b, v0.8b + umlal v17.8h, v21.8b, v2.8b + + ld2 {v28.8b,v29.8b}, [x3], x4 + transpose v24.2d, v25.2d, v16.2d, v17.2d + + ext v6.8b, v28.8b, v6.8b, #1 + ext v7.8b, v29.8b, v7.8b, #1 + + trn1 v4.2s, v28.2s, v6.2s + trn1 v5.2s, v29.2s, v7.2s + + add v16.8h, v24.8h, v25.8h + + umull v18.8h, v20.8b, v0.8b + umlal v18.8h, v4.8b, v2.8b + umull v19.8h, v21.8b, v0.8b + umlal v19.8h, v5.8b, v2.8b + + ld2 {v30.8b,v31.8b}, [x3], x4 + transpose v26.2d, v27.2d, v18.2d, v19.2d + + ext v22.8b, v30.8b, v22.8b, #1 + ext 
v23.8b, v31.8b, v23.8b, #1 + trn1 v20.2s, v30.2s, v22.2s + trn1 v21.2s, v31.2s, v23.2s + + add v17.8h, v26.8h, v27.8h + + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + + //pld [x3] + //pld [x3, x4] + + st1 {v16.\vsize}[0], [x0], x2 + st1 {v16.\vsize}[st2], [x1], x2 + st1 {v17.\vsize}[0], [x0], x2 + st1 {v17.\vsize}[st2], [x1], x2 + b.gt 1b + + ret +2: // dx or dy are 0 + tst w11, w11 + add w10, w10, w11 + dup v0.8b, w9 + dup v1.8b, w10 + + b.eq 4f + + ld1 {v4.8b}, [x3], x4 + ld1 {v6.8b}, [x3], x4 +3: // vertical interpolation loop + subs w15, w15, #2 + umull v16.8h, v4.8b, v0.8b + ld1 {v4.8b}, [x3], x4 + umlal v16.8h, v6.8b, v1.8b + umull v17.8h, v6.8b, v0.8b + ld1 {v6.8b}, [x3], x4 + umlal v17.8h, v4.8b, v1.8b + + rshrn v20.8b, v16.8h, #6 // uvuvuvuv + rshrn v21.8b, v17.8h, #6 // uvuvuvuv + + uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv + uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv + + //pld [x3] + //pld [x3, x4] + + st1 {v16.\vsize}[0], [x0], x2 + st1 {v16.\vsize}[st2], [x0], x2 + st1 {v17.\vsize}[0], [x1], x2 + st1 {v17.\vsize}[st2], [x1], x2 + b.gt 3b + + ret + +4: // dy is 0 + ld1 {v4.8b,v5.8b}, [x3], x4 + ld1 {v6.8b,v7.8b}, [x3], x4 + + ext v5.8b, v4.8b, v5.8b, #2 + ext v7.8b, v6.8b, v7.8b, #2 +5: // horizontal interpolation loop + subs w15, w15, #2 + umull v16.8h, v4.8b, v0.8b + umlal v16.8h, v5.8b, v1.8b + umull v17.8h, v6.8b, v0.8b + umlal v17.8h, v7.8b, v1.8b + + ld1 {v4.8b,v5.8b}, [x3], x4 + ld1 {v6.8b,v7.8b}, [x3], x4 + rshrn v20.8b, v16.8h, #6 + rshrn v21.8b, v17.8h, #6 + ext v5.8b, v4.8b, v5.8b, #2 + ext v7.8b, v6.8b, v7.8b, #2 + uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv + uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv + + //pld [x3] + //pld [x3, x4] + + st1 {v16.\vsize}[0], [x0], x2 + st1 {v16.\vsize}[st2], [x0], x2 + st1 {v17.\vsize}[0], [x1], x2 + st1 {v17.\vsize}[st2], [x1], x2 + b.gt 5b + + ret +endfunc +.endm + + CHROMA_MC 2, h + CHROMA_MC 4, s + +function mc_chroma_w8_neon + CHROMA_MC_START + b.eq 2f + ld2 {v4.16b,v5.16b}, [x3], x4 + ld2 {v20.16b,v21.16b}, [x3], x4 + dup v0.8b, w9 // cA + dup v1.8b, w10 // cB + + ext v6.16b, v4.16b, v4.16b, #1 + ext v7.16b, v5.16b, v5.16b, #1 + + dup v2.8b, w11 // cC + dup v3.8b, w12 // cD + + ext v22.16b, v20.16b, v20.16b, #1 + ext v23.16b, v21.16b, v21.16b, #1 + +1: // height loop, interpolate xy + subs w15, w15, #2 + umull v16.8h, v4.8b, v0.8b + umlal v16.8h, v6.8b, v1.8b + umlal v16.8h, v20.8b, v2.8b + umlal v16.8h, v22.8b, v3.8b + + umull v17.8h, v5.8b, v0.8b + umlal v17.8h, v7.8b, v1.8b + umlal v17.8h, v21.8b, v2.8b + umlal v17.8h, v23.8b, v3.8b + + ld2 {v4.16b,v5.16b}, [x3], x4 + + ext v6.16b, v4.16b, v4.16b, #1 + ext v7.16b, v5.16b, v5.16b, #1 + + umull v18.8h, v20.8b, v0.8b + umlal v18.8h, v22.8b, v1.8b + umlal v18.8h, v4.8b, v2.8b + umlal v18.8h, v6.8b, v3.8b + + umull v19.8h, v21.8b, v0.8b + umlal v19.8h, v23.8b, v1.8b + umlal v19.8h, v5.8b, v2.8b + umlal v19.8h, v7.8b, v3.8b + + ld2 {v20.16b,v21.16b}, [x3], x4 + + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + rshrn v18.8b, v18.8h, #6 + rshrn v19.8b, v19.8h, #6 + + ext v22.16b, v20.16b, v20.16b, #1 + ext v23.16b, v21.16b, v21.16b, #1 + + //pld [x3] + //pld [x3, x4] + + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x1], x2 + st1 {v18.8b}, [x0], x2 + st1 {v19.8b}, [x1], x2 + b.gt 1b + + ret +2: // dx or dy are 0 + tst w11, w11 + add w10, w10, w11 + dup v0.8b, w9 + dup v1.8b, w10 + + b.eq 4f + + ld2 {v4.8b,v5.8b}, [x3], x4 + ld2 {v6.8b,v7.8b}, [x3], x4 +3: // vertical interpolation loop + subs w15, w15, #2 
+ umull v16.8h, v4.8b, v0.8b //U + umlal v16.8h, v6.8b, v1.8b + umull v17.8h, v5.8b, v0.8b //V + umlal v17.8h, v7.8b, v1.8b + + ld2 {v4.8b,v5.8b}, [x3], x4 + + umull v18.8h, v6.8b, v0.8b + umlal v18.8h, v4.8b, v1.8b + umull v19.8h, v7.8b, v0.8b + umlal v19.8h, v5.8b, v1.8b + + ld2 {v6.8b,v7.8b}, [x3], x4 + + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + rshrn v18.8b, v18.8h, #6 + rshrn v19.8b, v19.8h, #6 + + //pld [x3] + //pld [x3, x4] + + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x1], x2 + st1 {v18.8b}, [x0], x2 + st1 {v19.8b}, [x1], x2 + b.gt 3b + + ret +4: // dy is 0 + ld2 {v4.16b,v5.16b}, [x3], x4 + ext v6.16b, v4.16b, v4.16b, #1 + ext v7.16b, v5.16b, v5.16b, #1 + ld2 {v20.16b,v21.16b}, [x3], x4 + ext v22.16b, v20.16b, v20.16b, #1 + ext v23.16b, v21.16b, v21.16b, #1 +5: // horizontal interpolation loop + subs w15, w15, #2 + umull v16.8h, v4.8b, v0.8b //U + umlal v16.8h, v6.8b, v1.8b + umull v17.8h, v5.8b, v0.8b //V + umlal v17.8h, v7.8b, v1.8b + + ld2 {v4.16b,v5.16b}, [x3], x4 + + umull v18.8h, v20.8b, v0.8b + umlal v18.8h, v22.8b, v1.8b + umull v19.8h, v21.8b, v0.8b + umlal v19.8h, v23.8b, v1.8b + + ld2 {v20.16b,v21.16b}, [x3], x4 + + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + rshrn v18.8b, v18.8h, #6 + rshrn v19.8b, v19.8h, #6 + + ext v6.16b, v4.16b, v4.16b, #1 + ext v7.16b, v5.16b, v5.16b, #1 + ext v22.16b, v20.16b, v20.16b, #1 + ext v23.16b, v21.16b, v21.16b, #1 + + //pld [x3] + //pld [x3, x4] + + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x1], x2 + st1 {v18.8b}, [x0], x2 + st1 {v19.8b}, [x1], x2 + b.gt 5b + + ret +endfunc + +//void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, +// intptr_t stride, int width, int height, int16_t *buf ) +function x264_hpel_filter_neon, export=1 + ubfm x9, x3, #0, #3 + add w15, w5, w9 + sub x13, x3, x9 // align src + sub x10, x0, x9 + sub x11, x1, x9 + sub x12, x2, x9 + movi v30.16b, #5 + movi v31.16b, #20 +1: // line start + mov x3, x13 + mov x2, x12 + mov x1, x11 + mov x0, x10 + add x7, x3, #16 // src pointer next 16b for horiz filter + mov x5, x15 // restore width + sub x3, x3, x4, lsl #1 // src - 2*stride + ld1 {v28.16b}, [x7], #16 // src[16:31] + + add x9, x3, x5 // holds src - 2*stride + width + + ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15] + ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15] + ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15] + ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15] + ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15] + ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15] + + ext v22.16b, v7.16b, v18.16b, #14 + uaddl v1.8h, v16.8b, v21.8b + ext v26.16b, v18.16b, v28.16b, #3 + umlsl v1.8h, v17.8b, v30.8b + ext v23.16b, v7.16b, v18.16b, #15 + umlal v1.8h, v18.8b, v31.8b + ext v24.16b, v18.16b, v28.16b, #1 + umlal v1.8h, v19.8b, v31.8b + ext v25.16b, v18.16b, v28.16b, #2 + umlsl v1.8h, v20.8b, v30.8b +2: // next 16 pixel of line + subs x5, x5, #16 + sub x3, x9, x5 // src - 2*stride += 16 + + uaddl v4.8h, v22.8b, v26.8b + uaddl2 v5.8h, v22.16b, v26.16b + sqrshrun v6.8b, v1.8h, #5 + umlsl v4.8h, v23.8b, v30.8b + umlsl2 v5.8h, v23.16b, v30.16b + umlal v4.8h, v18.8b, v31.8b + umlal2 v5.8h, v18.16b, v31.16b + umlal v4.8h, v24.8b, v31.8b + umlal2 v5.8h, v24.16b, v31.16b + umlsl v4.8h, v25.8b, v30.8b + umlsl2 v5.8h, v25.16b, v30.16b + + uaddl2 v2.8h, v16.16b, v21.16b + sqrshrun v4.8b, v4.8h, #5 + mov v7.16b, v18.16b + sqrshrun2 v4.16b, v5.8h, #5 + + umlsl2 v2.8h, v17.16b, v30.16b + ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15] + umlal2 v2.8h, v18.16b, v31.16b + ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15] + umlal2 
v2.8h, v19.16b, v31.16b + ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15] + umlsl2 v2.8h, v20.16b, v30.16b + ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15] + st1 {v4.16b}, [x0], #16 + sqrshrun2 v6.16b, v2.8h, #5 + ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15] + ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15] + + ext v22.16b, v0.16b, v1.16b, #12 + ext v26.16b, v1.16b, v2.16b, #6 + ext v23.16b, v0.16b, v1.16b, #14 + st1 {v6.16b}, [x1], #16 + uaddl v3.8h, v16.8b, v21.8b + ext v25.16b, v1.16b, v2.16b, #4 + umlsl v3.8h, v17.8b, v30.8b + ext v24.16b, v1.16b, v2.16b, #2 + + umlal v3.8h, v18.8b, v31.8b + add v4.8h, v22.8h, v26.8h + umlal v3.8h, v19.8b, v31.8b + add v5.8h, v23.8h, v25.8h + umlsl v3.8h, v20.8b, v30.8b + add v6.8h, v24.8h, v1.8h + + ext v22.16b, v1.16b, v2.16b, #12 + ext v26.16b, v2.16b, v3.16b, #6 + ext v23.16b, v1.16b, v2.16b, #14 + ext v25.16b, v2.16b, v3.16b, #4 + ext v24.16b, v2.16b, v3.16b, #2 + + add v22.8h, v22.8h, v26.8h + add v23.8h, v23.8h, v25.8h + add v24.8h, v24.8h, v2.8h + + sub v4.8h, v4.8h, v5.8h // a-b + sub v5.8h, v5.8h, v6.8h // b-c + + sub v22.8h, v22.8h, v23.8h // a-b + sub v23.8h, v23.8h, v24.8h // b-c + + sshr v4.8h, v4.8h, #2 // (a-b)/4 + sshr v22.8h, v22.8h, #2 // (a-b)/4 + sub v4.8h, v4.8h, v5.8h // (a-b)/4-b+c + sub v22.8h, v22.8h, v23.8h // (a-b)/4-b+c + sshr v4.8h, v4.8h, #2 // ((a-b)/4-b+c)/4 + sshr v22.8h, v22.8h, #2 // ((a-b)/4-b+c)/4 + add v4.8h, v4.8h, v6.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + add v22.8h, v22.8h, v24.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + + sqrshrun v4.8b, v4.8h, #6 + ld1 {v28.16b}, [x7], #16 // src[16:31] + mov v0.16b, v2.16b + ext v23.16b, v7.16b, v18.16b, #15 + sqrshrun2 v4.16b, v22.8h, #6 + mov v1.16b, v3.16b + ext v22.16b, v7.16b, v18.16b, #14 + ext v24.16b, v18.16b, v28.16b, #1 + ext v25.16b, v18.16b, v28.16b, #2 + ext v26.16b, v18.16b, v28.16b, #3 + + st1 {v4.16b}, [x2], #16 + b.gt 2b + + subs w6, w6, #1 + add x10, x10, x4 + add x11, x11, x4 + add x12, x12, x4 + add x13, x13, x4 + b.gt 1b + + ret +endfunc + +// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, +// uint8_t *dstv, uint8_t *dstc, intptr_t src_stride, +// intptr_t dst_stride, int width, int height ) +function x264_frame_init_lowres_core_neon, export=1 + ldr w8, [sp] + sub x10, x6, w7, uxtw // dst_stride - width + and x10, x10, #~15 + +1: + mov w9, w7 // width + mov x11, x0 // src0 + add x12, x0, x5 // src1 = src0 + src_stride + add x13, x0, x5, lsl #1 // src2 = src1 + src_stride + + ld2 {v0.16b,v1.16b}, [x11], #32 + ld2 {v2.16b,v3.16b}, [x12], #32 + ld2 {v4.16b,v5.16b}, [x13], #32 + + urhadd v20.16b, v0.16b, v2.16b // s0[2x] + s1[2x] + urhadd v22.16b, v2.16b, v4.16b // s1[2x] + s2[2x] +2: + subs w9, w9, #16 + urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1] + urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1] + + ld2 {v0.16b,v1.16b}, [x11], #32 + ld2 {v2.16b,v3.16b}, [x12], #32 + ld2 {v4.16b,v5.16b}, [x13], #32 + urhadd v30.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x] + urhadd v31.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x] + ext v24.16b, v20.16b, v30.16b, #1 // s0[2x+2] + s1[2x+2] + ext v25.16b, v22.16b, v31.16b, #1 // s1[2x+2] + s2[2x+2] + + urhadd v16.16b, v20.16b, v21.16b + urhadd v18.16b, v22.16b, v23.16b + urhadd v17.16b, v21.16b, v24.16b + urhadd v19.16b, v23.16b, v25.16b + + st1 {v16.16b}, [x1], #16 + st1 {v18.16b}, [x3], #16 + st1 {v17.16b}, [x2], #16 + st1 {v19.16b}, [x4], #16 + b.le 3f + + subs w9, w9, #16 + urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1] + urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1] + + 
ld2 {v0.16b,v1.16b}, [x11], #32 + ld2 {v2.16b,v3.16b}, [x12], #32 + ld2 {v4.16b,v5.16b}, [x13], #32 + urhadd v20.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x] + urhadd v22.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x] + ext v24.16b, v30.16b, v20.16b, #1 // s0[2x+2] + s1[2x+2] + ext v25.16b, v31.16b, v22.16b, #1 // s1[2x+2] + s2[2x+2] + + urhadd v16.16b, v30.16b, v21.16b + urhadd v18.16b, v31.16b, v23.16b + urhadd v17.16b, v21.16b, v24.16b + urhadd v19.16b, v23.16b, v25.16b + + st1 {v16.16b}, [x1], #16 + st1 {v18.16b}, [x3], #16 + st1 {v17.16b}, [x2], #16 + st1 {v19.16b}, [x4], #16 + b.gt 2b +3: + subs w8, w8, #1 + add x0, x0, x5, lsl #1 + add x1, x1, x10 + add x2, x2, x10 + add x3, x3, x10 + add x4, x4, x10 + b.gt 1b + + ret +endfunc + +function x264_load_deinterleave_chroma_fenc_neon, export=1 + mov x4, #FENC_STRIDE/2 + b load_deinterleave_chroma +endfunc + +function x264_load_deinterleave_chroma_fdec_neon, export=1 + mov x4, #FDEC_STRIDE/2 +load_deinterleave_chroma: + ld2 {v0.8b,v1.8b}, [x1], x2 + ld2 {v2.8b,v3.8b}, [x1], x2 + subs w3, w3, #2 + st1 {v0.8b}, [x0], x4 + st1 {v1.8b}, [x0], x4 + st1 {v2.8b}, [x0], x4 + st1 {v3.8b}, [x0], x4 + b.gt load_deinterleave_chroma + + ret +endfunc + +function x264_plane_copy_deinterleave_neon, export=1 + add w9, w6, #15 + and w9, w9, #0xfffffff0 + sub x1, x1, x9 + sub x3, x3, x9 + sub x5, x5, x9, lsl #1 +1: + ld2 {v0.16b,v1.16b}, [x4], #32 + subs w9, w9, #16 + st1 {v0.16b}, [x0], #16 + st1 {v1.16b}, [x2], #16 + b.gt 1b + + add x4, x4, x5 + subs w7, w7, #1 + add x0, x0, x1 + add x2, x2, x3 + mov w9, w6 + b.gt 1b + + ret +endfunc + +.macro deinterleave_rgb + subs x11, x11, #8 + st1 {v0.8b}, [x0], #8 + st1 {v1.8b}, [x2], #8 + st1 {v2.8b}, [x4], #8 + b.gt 1b + + subs w10, w10, #1 + add x0, x0, x1 + add x2, x2, x3 + add x4, x4, x5 + add x6, x6, x7 + mov x11, x9 + b.gt 1b +.endm + +function x264_plane_copy_deinterleave_rgb_neon, export=1 +#if SYS_MACOSX + ldr w8, [sp] + ldp w9, w10, [sp, #4] +#else + ldr x8, [sp] + ldp x9, x10, [sp, #8] +#endif + cmp w8, #3 + uxtw x9, w9 + add x11, x9, #7 + and x11, x11, #~7 + sub x1, x1, x11 + sub x3, x3, x11 + sub x5, x5, x11 + b.ne 4f + sub x7, x7, x11, lsl #1 + sub x7, x7, x11 +1: + ld3 {v0.8b,v1.8b,v2.8b}, [x6], #24 + deinterleave_rgb + + ret +4: + sub x7, x7, x11, lsl #2 +1: + ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32 + deinterleave_rgb + + ret +endfunc + +function x264_plane_copy_interleave_neon, export=1 + add w9, w6, #15 + and w9, w9, #0xfffffff0 + sub x1, x1, x9, lsl #1 + sub x3, x3, x9 + sub x5, x5, x9 +1: + ld1 {v0.16b}, [x2], #16 + ld1 {v1.16b}, [x4], #16 + subs w9, w9, #16 + st2 {v0.16b,v1.16b}, [x0], #32 + b.gt 1b + + subs w7, w7, #1 + add x0, x0, x1 + add x2, x2, x3 + add x4, x4, x5 + mov w9, w6 + b.gt 1b + + ret +endfunc + +function x264_store_interleave_chroma_neon, export=1 + mov x5, #FDEC_STRIDE +1: + ld1 {v0.8b}, [x2], x5 + ld1 {v1.8b}, [x3], x5 + ld1 {v2.8b}, [x2], x5 + ld1 {v3.8b}, [x3], x5 + subs w4, w4, #2 + zip1 v4.16b, v0.16b, v1.16b + zip1 v5.16b, v2.16b, v3.16b + st1 {v4.16b}, [x0], x1 + st1 {v5.16b}, [x0], x1 + b.gt 1b + + ret +endfunc
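A note on the centre-tap arithmetic in x264_hpel_filter_neon above: the staged form in the comments is an algebraic factoring of the 6-tap weight pattern, ((a-b)/4 - b + c)/4 + c = (a-b)/16 - b/4 + c/4 + c = (a - 5*b + 20*c)/16, which builds the (1,-5,20,20,-5,1) response from subtractions and two arithmetic right shifts (sshr #2) instead of wide multiplies; the final sqrshrun #6 then completes the normalisation of the two 6-tap passes with rounding and unsigned clipping.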
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/mc-c.c
Added
@@ -0,0 +1,249 @@ +/***************************************************************************** + * mc-c.c: aarch64 motion compensation + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad <lessen42@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "mc.h" + +void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int ); +void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_prefetch_fenc_422_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + +void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n ); +void x264_memzero_aligned_neon( void *dst, size_t n ); + +void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + +void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); + +void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, + pixel *dstv, intptr_t i_dstv, + pixel *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta, + pixel *dstb, intptr_t i_dstb, + pixel *dstc, intptr_t i_dstc, + pixel *src, intptr_t i_src, int pw, int w, int h ); +void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); + +void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel 
*srcv, int height ); +void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); + +#define MC_WEIGHT(func)\ +void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ +\ +static void (* x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\ +{\ + x264_mc_weight_w4##func##_neon,\ + x264_mc_weight_w4##func##_neon,\ + x264_mc_weight_w8##func##_neon,\ + x264_mc_weight_w16##func##_neon,\ + x264_mc_weight_w16##func##_neon,\ + x264_mc_weight_w20##func##_neon,\ +}; + +MC_WEIGHT() +MC_WEIGHT(_nodenom) +MC_WEIGHT(_offsetadd) +MC_WEIGHT(_offsetsub) + +void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + +void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int ); +void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int ); + +#if !HIGH_BIT_DEPTH +static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) +{ + if( w->i_scale == 1<<w->i_denom ) + { + if( w->i_offset < 0 ) + { + w->weightfn = x264_mc_offsetsub_wtab_neon; + w->cachea[0] = -w->i_offset; + } + else + { + w->weightfn = x264_mc_offsetadd_wtab_neon; + w->cachea[0] = w->i_offset; + } + } + else if( !w->i_denom ) + w->weightfn = x264_mc_nodenom_wtab_neon; + else + w->weightfn = x264_mc_wtab_neon; +} + +static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) = +{ + NULL, + x264_pixel_avg2_w4_neon, + x264_pixel_avg2_w8_neon, + x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function + x264_pixel_avg2_w16_neon, + x264_pixel_avg2_w20_neon, +}; + +static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) = +{ + NULL, + x264_mc_copy_w4_neon, + x264_mc_copy_w8_neon, + NULL, + x264_mc_copy_w16_neon, +}; + +static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; +static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; + +static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride, + uint8_t *src[4], intptr_t i_src_stride, + int mvx, int mvy, + int i_width, int i_height, const x264_weight_t *weight ) +{ + int qpel_idx = ((mvy&3)<<2) + (mvx&3); + intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); + uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + if ( (mvy&3) == 3 ) // explict if() to force conditional add + src1 += i_src_stride; + + if( qpel_idx & 5 ) /* qpel interpolation needed */ + { + uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + x264_pixel_avg_wtab_neon[i_width>>2]( + dst, i_dst_stride, src1, i_src_stride, + src2, i_height ); + if( weight->weightfn ) + weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height ); + } + else if( weight->weightfn ) + weight->weightfn[i_width>>2]( dst, 
i_dst_stride, src1, i_src_stride, weight, i_height ); + else + x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height ); +} + +static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride, + uint8_t *src[4], intptr_t i_src_stride, + int mvx, int mvy, + int i_width, int i_height, const x264_weight_t *weight ) +{ + int qpel_idx = ((mvy&3)<<2) + (mvx&3); + intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); + uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + if ( (mvy&3) == 3 ) // explict if() to force conditional add + src1 += i_src_stride; + + if( qpel_idx & 5 ) /* qpel interpolation needed */ + { + uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + x264_pixel_avg_wtab_neon[i_width>>2]( + dst, *i_dst_stride, src1, i_src_stride, + src2, i_height ); + if( weight->weightfn ) + weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height ); + return dst; + } + else if( weight->weightfn ) + { + weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height ); + return dst; + } + else + { + *i_dst_stride = i_src_stride; + return src1; + } +} + +void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, + uint8_t *src, intptr_t stride, int width, + int height, int16_t *buf ); +#endif // !HIGH_BIT_DEPTH + +void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf ) +{ +#if !HIGH_BIT_DEPTH + if( cpu&X264_CPU_ARMV8 ) + { + pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64; + pf->prefetch_fenc_422 = x264_prefetch_fenc_422_aarch64; + pf->prefetch_ref = x264_prefetch_ref_aarch64; + } + + if( !(cpu&X264_CPU_NEON) ) + return; + + pf->copy_16x16_unaligned = x264_mc_copy_w16_neon; + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_neon; + pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; + pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon; + + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; + pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; + pf->plane_copy_interleave = x264_plane_copy_interleave_neon; + + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon; + pf->store_interleave_chroma = x264_store_interleave_chroma_neon; + + pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon; + pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon; + pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon; + pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon; + pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_neon; + pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon; + pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon; + pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon; + + pf->weight = x264_mc_wtab_neon; + pf->offsetadd = x264_mc_offsetadd_wtab_neon; + pf->offsetsub = x264_mc_offsetsub_wtab_neon; + pf->weight_cache = x264_weight_cache_neon; + + pf->mc_chroma = x264_mc_chroma_neon; + pf->mc_luma = mc_luma_neon; + pf->get_ref = get_ref_neon; + pf->hpel_filter = x264_hpel_filter_neon; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon; +#endif // !HIGH_BIT_DEPTH +}
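For reference, the quarter-pel dispatch in mc_luma_neon/get_ref_neon above can be traced with a few lines of plain C. The tables and the index formula are copied verbatim from the hunk; the plane numbering 0..3 (full-pel, half-H, half-V, half-HV) is the usual x264 convention for src[4] and is an assumption here, not something stated in this diff.

#include <stdio.h>

static const unsigned char hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const unsigned char hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};

int main(void)
{
    int mvx = 5, mvy = 3;                           /* example quarter-pel motion vector */
    int qpel_idx = ((mvy & 3) << 2) + (mvx & 3);    /* same formula as mc_luma_neon */
    printf( "qpel_idx=%d, src1 = plane %d\n", qpel_idx, hpel_ref0[qpel_idx] );
    if( qpel_idx & 5 )                              /* off the half-pel grid: average two planes */
        printf( "src2 = plane %d, pixel_avg2 path\n", hpel_ref1[qpel_idx] );
    else                                            /* on the half-pel grid: copy / weight path */
        printf( "single-plane copy or weight path\n" );
    return 0;
}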
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/mc.h
Added
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * mc.h: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_MC_H
+#define X264_AARCH64_MC_H
+
+void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf );
+
+#endif
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/pixel-a.S
Added
@@ -0,0 +1,1153 @@ +/***************************************************************************** + * pixel.S: aarch64 pixel metrics + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" + +const mask +.rept 16 +.byte 0xff +.endr +.rept 16 +.byte 0x00 +.endr +endconst + +const mask_ac_4_8 +.short 0, -1, -1, -1, 0, -1, -1, -1 +.short 0, -1, -1, -1, -1, -1, -1, -1 +endconst + +.macro SAD_START_4 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + uabdl v16.8h, v0.8b, v1.8b +.endm + +.macro SAD_4 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + uabal v16.8h, v0.8b, v1.8b +.endm + +.macro SAD_START_8 + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + uabdl v16.8h, v0.8b, v1.8b + uabdl v17.8h, v2.8b, v3.8b +.endm + +.macro SAD_8 + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + uabal v16.8h, v0.8b, v1.8b + uabal v17.8h, v2.8b, v3.8b +.endm + +.macro SAD_START_16 + ld1 {v1.16b}, [x2], x3 + ld1 {v0.16b}, [x0], x1 + ld1 {v3.16b}, [x2], x3 + ld1 {v2.16b}, [x0], x1 + uabdl v16.8h, v0.8b, v1.8b + uabdl2 v17.8h, v0.16b, v1.16b + uabal v16.8h, v2.8b, v3.8b + uabal2 v17.8h, v2.16b, v3.16b +.endm + +.macro SAD_16 + ld1 {v1.16b}, [x2], x3 + ld1 {v0.16b}, [x0], x1 + ld1 {v3.16b}, [x2], x3 + ld1 {v2.16b}, [x0], x1 + uabal v16.8h, v0.8b, v1.8b + uabal2 v17.8h, v0.16b, v1.16b + uabal v16.8h, v2.8b, v3.8b + uabal2 v17.8h, v2.16b, v3.16b +.endm + +.macro SAD_FUNC w, h, name +function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1 + SAD_START_\w + +.rept \h / 2 - 1 + SAD_\w +.endr +.if \w > 4 + add v16.8h, v16.8h, v17.8h +.endif + uaddlv s0, v16.8h + fmov w0, s0 + ret +endfunc +.endm + +SAD_FUNC 4, 4 +SAD_FUNC 4, 8 +SAD_FUNC 8, 4 +SAD_FUNC 8, 8 +SAD_FUNC 8, 16 +SAD_FUNC 16, 8 +SAD_FUNC 16, 16 + +.macro SAD_X_4 x, first=uabal + ld1 {v0.s}[0], [x0], x7 + ld1 {v1.s}[0], [x1], x5 + ld1 {v0.s}[1], [x0], x7 + ld1 {v1.s}[1], [x1], x5 + \first v16.8h, v1.8b, v0.8b + ld1 {v2.s}[0], [x2], x5 + ld1 {v2.s}[1], [x2], x5 + \first v17.8h, v2.8b, v0.8b + ld1 {v3.s}[0], [x3], x5 + ld1 {v3.s}[1], [x3], x5 + \first v18.8h, v3.8b, v0.8b +.if \x == 4 + ld1 {v4.s}[0], [x4], x5 + ld1 {v4.s}[1], [x4], x5 + \first v19.8h, v4.8b, v0.8b +.endif +.endm + +.macro SAD_X_8 x, first=uabal + ld1 {v0.8b}, [x0], x7 + ld1 {v1.8b}, 
[x1], x5 + \first v16.8h, v1.8b, v0.8b + ld1 {v2.8b}, [x2], x5 + ld1 {v5.8b}, [x0], x7 + \first v17.8h, v2.8b, v0.8b + ld1 {v3.8b}, [x3], x5 + ld1 {v1.8b}, [x1], x5 + \first v18.8h, v3.8b, v0.8b + uabal v16.8h, v1.8b, v5.8b + ld1 {v2.8b}, [x2], x5 + ld1 {v3.8b}, [x3], x5 + uabal v17.8h, v2.8b, v5.8b + uabal v18.8h, v3.8b, v5.8b +.if \x == 4 + ld1 {v4.8b}, [x4], x5 + \first v19.8h, v4.8b, v0.8b + ld1 {v4.8b}, [x4], x5 + uabal v19.8h, v4.8b, v5.8b +.endif +.endm + +.macro SAD_X_16 x, first=uabal + ld1 {v0.16b}, [x0], x7 + ld1 {v1.16b}, [x1], x5 + \first v16.8h, v1.8b, v0.8b + \first\()2 v20.8h, v1.16b, v0.16b + ld1 {v2.16b}, [x2], x5 + ld1 {v5.16b}, [x0], x7 + \first v17.8h, v2.8b, v0.8b + \first\()2 v21.8h, v2.16b, v0.16b + ld1 {v3.16b}, [x3], x5 + ld1 {v1.16b}, [x1], x5 + \first v18.8h, v3.8b, v0.8b + \first\()2 v22.8h, v3.16b, v0.16b + uabal v16.8h, v1.8b, v5.8b + uabal2 v20.8h, v1.16b, v5.16b + ld1 {v2.16b}, [x2], x5 + ld1 {v3.16b}, [x3], x5 + uabal v17.8h, v2.8b, v5.8b + uabal2 v21.8h, v2.16b, v5.16b + uabal v18.8h, v3.8b, v5.8b + uabal2 v22.8h, v3.16b, v5.16b +.if \x == 4 + ld1 {v4.16b}, [x4], x5 + \first v19.8h, v4.8b, v0.8b + \first\()2 v23.8h, v4.16b, v0.16b + ld1 {v4.16b}, [x4], x5 + uabal v19.8h, v4.8b, v5.8b + uabal2 v23.8h, v4.16b, v5.16b +.endif +.endm + +.macro SAD_X_FUNC x, w, h +function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1 +.if \x == 3 + mov x6, x5 + mov x5, x4 +.endif + mov x7, #FENC_STRIDE + + SAD_X_\w \x, uabdl + +.rept \h / 2 - 1 + SAD_X_\w \x +.endr + +.if \w > 8 + add v16.8h, v16.8h, v20.8h + add v17.8h, v17.8h, v21.8h + add v18.8h, v18.8h, v22.8h +.if \x == 4 + add v19.8h, v19.8h, v23.8h +.endif +.endif +// add up the sads + uaddlv s0, v16.8h + uaddlv s1, v17.8h + uaddlv s2, v18.8h + + stp s0, s1, [x6], #8 +.if \x == 3 + str s2, [x6] +.else + uaddlv s3, v19.8h + stp s2, s3, [x6] +.endif + ret +endfunc +.endm + +SAD_X_FUNC 3, 4, 4 +SAD_X_FUNC 3, 4, 8 +SAD_X_FUNC 3, 8, 4 +SAD_X_FUNC 3, 8, 8 +SAD_X_FUNC 3, 8, 16 +SAD_X_FUNC 3, 16, 8 +SAD_X_FUNC 3, 16, 16 + +SAD_X_FUNC 4, 4, 4 +SAD_X_FUNC 4, 4, 8 +SAD_X_FUNC 4, 8, 4 +SAD_X_FUNC 4, 8, 8 +SAD_X_FUNC 4, 8, 16 +SAD_X_FUNC 4, 16, 8 +SAD_X_FUNC 4, 16, 16 + + +.macro SSD_START_4 + ld1 {v16.s}[0], [x0], x1 + ld1 {v17.s}[0], [x2], x3 + usubl v2.8h, v16.8b, v17.8b + ld1 {v16.s}[0], [x0], x1 + ld1 {v17.s}[0], [x2], x3 + smull v0.4s, v2.4h, v2.4h +.endm + +.macro SSD_4 + usubl v2.8h, v16.8b, v17.8b + ld1 {v16.s}[0], [x0], x1 + ld1 {v17.s}[0], [x2], x3 + smlal v0.4s, v2.4h, v2.4h +.endm + +.macro SSD_END_4 + usubl v2.8h, v16.8b, v17.8b + smlal v0.4s, v2.4h, v2.4h +.endm + +.macro SSD_START_8 + ld1 {v16.8b}, [x0], x1 + ld1 {v17.8b}, [x2], x3 + usubl v2.8h, v16.8b, v17.8b + ld1 {v16.8b}, [x0], x1 + smull v0.4s, v2.4h, v2.4h + ld1 {v17.8b}, [x2], x3 + smlal2 v0.4s, v2.8h, v2.8h +.endm + +.macro SSD_8 + usubl v2.8h, v16.8b, v17.8b + ld1 {v16.8b}, [x0], x1 + smlal v0.4s, v2.4h, v2.4h + ld1 {v17.8b}, [x2], x3 + smlal2 v0.4s, v2.8h, v2.8h +.endm + +.macro SSD_END_8 + usubl v2.8h, v16.8b, v17.8b + smlal v0.4s, v2.4h, v2.4h + smlal2 v0.4s, v2.8h, v2.8h +.endm + +.macro SSD_START_16 + ld1 {v16.16b}, [x0], x1 + ld1 {v17.16b}, [x2], x3 + usubl v2.8h, v16.8b, v17.8b + usubl2 v3.8h, v16.16b, v17.16b + ld1 {v16.16b}, [x0], x1 + smull v0.4s, v2.4h, v2.4h + smull2 v1.4s, v2.8h, v2.8h + ld1 {v17.16b}, [x2], x3 + smlal v0.4s, v3.4h, v3.4h + smlal2 v1.4s, v3.8h, v3.8h +.endm + +.macro SSD_16 + usubl v2.8h, v16.8b, v17.8b + usubl2 v3.8h, v16.16b, v17.16b + ld1 {v16.16b}, [x0], x1 + smlal v0.4s, v2.4h, v2.4h + smlal2 v1.4s, v2.8h, v2.8h + ld1 
{v17.16b}, [x2], x3 + smlal v0.4s, v3.4h, v3.4h + smlal2 v1.4s, v3.8h, v3.8h +.endm + +.macro SSD_END_16 + usubl v2.8h, v16.8b, v17.8b + usubl2 v3.8h, v16.16b, v17.16b + smlal v0.4s, v2.4h, v2.4h + smlal2 v1.4s, v2.8h, v2.8h + smlal v0.4s, v3.4h, v3.4h + smlal2 v1.4s, v3.8h, v3.8h + add v0.4s, v0.4s, v1.4s +.endm + +.macro SSD_FUNC w h +function x264_pixel_ssd_\w\()x\h\()_neon, export=1 + SSD_START_\w +.rept \h-2 + SSD_\w +.endr + SSD_END_\w + + addv s0, v0.4s + mov w0, v0.s[0] + ret +endfunc +.endm + +SSD_FUNC 4, 4 +SSD_FUNC 4, 8 +SSD_FUNC 8, 4 +SSD_FUNC 8, 8 +SSD_FUNC 8, 16 +SSD_FUNC 16, 8 +SSD_FUNC 16, 16 + +.macro pixel_var_8 h +function x264_pixel_var_8x\h\()_neon, export=1 + ld1 {v16.8b}, [x0], x1 + ld1 {v17.8b}, [x0], x1 + mov x2, \h - 4 + umull v1.8h, v16.8b, v16.8b + uxtl v0.8h, v16.8b + umull v2.8h, v17.8b, v17.8b + uaddw v0.8h, v0.8h, v17.8b + ld1 {v18.8b}, [x0], x1 + uaddlp v1.4s, v1.8h + uaddlp v2.4s, v2.8h + ld1 {v19.8b}, [x0], x1 + +1: subs x2, x2, #4 + uaddw v0.8h, v0.8h, v18.8b + umull v24.8h, v18.8b, v18.8b + ld1 {v20.8b}, [x0], x1 + uaddw v0.8h, v0.8h, v19.8b + umull v25.8h, v19.8b, v19.8b + uadalp v1.4s, v24.8h + ld1 {v21.8b}, [x0], x1 + uaddw v0.8h, v0.8h, v20.8b + umull v26.8h, v20.8b, v20.8b + uadalp v2.4s, v25.8h + ld1 {v18.8b}, [x0], x1 + uaddw v0.8h, v0.8h, v21.8b + umull v27.8h, v21.8b, v21.8b + uadalp v1.4s, v26.8h + ld1 {v19.8b}, [x0], x1 + uadalp v2.4s, v27.8h + b.gt 1b + + uaddw v0.8h, v0.8h, v18.8b + umull v28.8h, v18.8b, v18.8b + uaddw v0.8h, v0.8h, v19.8b + umull v29.8h, v19.8b, v19.8b + uadalp v1.4s, v28.8h + uadalp v2.4s, v29.8h + + b x264_var_end +endfunc +.endm + +pixel_var_8 8 +pixel_var_8 16 + +function x264_pixel_var_16x16_neon, export=1 + ld1 {v16.16b}, [x0], x1 + ld1 {v17.16b}, [x0], x1 + mov x2, #14 + umull v1.8h, v16.8b, v16.8b + umull2 v2.8h, v16.16b, v16.16b + uxtl v0.8h, v16.8b + uaddlp v1.4s, v1.8h + uaddlp v2.4s, v2.8h + uaddw2 v0.8h, v0.8h, v16.16b + +1: subs x2, x2, #2 + ld1 {v18.16b}, [x0], x1 + uaddw v0.8h, v0.8h, v17.8b + umull v3.8h, v17.8b, v17.8b + uaddw2 v0.8h, v0.8h, v17.16b + umull2 v4.8h, v17.16b, v17.16b + uadalp v1.4s, v3.8h + uadalp v2.4s, v4.8h + + ld1 {v17.16b}, [x0], x1 + uaddw v0.8h, v0.8h, v18.8b + umull v5.8h, v18.8b, v18.8b + uaddw2 v0.8h, v0.8h, v18.16b + umull2 v6.8h, v18.16b, v18.16b + uadalp v1.4s, v5.8h + uadalp v2.4s, v6.8h + b.gt 1b + + uaddw v0.8h, v0.8h, v17.8b + umull v3.8h, v17.8b, v17.8b + uaddw2 v0.8h, v0.8h, v17.16b + umull2 v4.8h, v17.16b, v17.16b + uadalp v1.4s, v3.8h + uadalp v2.4s, v4.8h +endfunc + +function x264_var_end + add v1.4s, v1.4s, v2.4s + uaddlv s0, v0.8h + uaddlv d1, v1.4s + mov w0, v0.s[0] + mov x1, v1.d[0] + orr x0, x0, x1, lsl #32 + ret +endfunc + + +.macro pixel_var2_8 h +function x264_pixel_var2_8x\h\()_neon, export=1 + ld1 {v16.8b}, [x0], x1 + ld1 {v18.8b}, [x2], x3 + ld1 {v17.8b}, [x0], x1 + ld1 {v19.8b}, [x2], x3 + mov x5, \h - 4 + usubl v6.8h, v16.8b, v18.8b + usubl v7.8h, v17.8b, v19.8b + ld1 {v16.8b}, [x0], x1 + ld1 {v18.8b}, [x2], x3 + smull v2.4s, v6.4h, v6.4h + smull2 v3.4s, v6.8h, v6.8h + add v0.8h, v6.8h, v7.8h + smlal v2.4s, v7.4h, v7.4h + smlal2 v3.4s, v7.8h, v7.8h + + usubl v6.8h, v16.8b, v18.8b + +1: subs x5, x5, #2 + ld1 {v17.8b}, [x0], x1 + ld1 {v19.8b}, [x2], x3 + smlal v2.4s, v6.4h, v6.4h + smlal2 v3.4s, v6.8h, v6.8h + usubl v7.8h, v17.8b, v19.8b + add v0.8h, v0.8h, v6.8h + ld1 {v16.8b}, [x0], x1 + ld1 {v18.8b}, [x2], x3 + smlal v2.4s, v7.4h, v7.4h + smlal2 v3.4s, v7.8h, v7.8h + usubl v6.8h, v16.8b, v18.8b + add v0.8h, v0.8h, v7.8h + b.gt 1b + + ld1 {v17.8b}, [x0], 
x1 + ld1 {v19.8b}, [x2], x3 + smlal v2.4s, v6.4h, v6.4h + smlal2 v3.4s, v6.8h, v6.8h + usubl v7.8h, v17.8b, v19.8b + add v0.8h, v0.8h, v6.8h + smlal v2.4s, v7.4h, v7.4h + add v0.8h, v0.8h, v7.8h + smlal2 v3.4s, v7.8h, v7.8h + + saddlv s0, v0.8h + add v2.4s, v2.4s, v3.4s + mov w0, v0.s[0] + addv s1, v2.4s + sxtw x0, w0 + mov w1, v1.s[0] + mul x0, x0, x0 + str w1, [x4] + sub x0, x1, x0, lsr # 6 + (\h >> 4) + + ret +endfunc +.endm + +pixel_var2_8 8 +pixel_var2_8 16 + + +function x264_pixel_satd_4x4_neon, export=1 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 + + usubl v0.8h, v0.8b, v1.8b + usubl v1.8h, v2.8b, v3.8b + SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h + + zip1 v0.2d, v2.2d, v3.2d + zip2 v1.2d, v2.2d, v3.2d + SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h + + trn1 v0.8h, v2.8h, v3.8h + trn2 v1.8h, v2.8h, v3.8h + SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h + + trn1 v0.4s, v2.4s, v3.4s + trn2 v1.4s, v2.4s, v3.4s + abs v0.8h, v0.8h + abs v1.8h, v1.8h + umax v0.8h, v0.8h, v1.8h + + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret +endfunc + +function x264_pixel_satd_4x8_neon, export=1 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v5.s}[0], [x2], x3 + ld1 {v4.s}[0], [x0], x1 + ld1 {v7.s}[0], [x2], x3 + ld1 {v6.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 + ld1 {v5.s}[1], [x2], x3 + ld1 {v4.s}[1], [x0], x1 + ld1 {v7.s}[1], [x2], x3 + ld1 {v6.s}[1], [x0], x1 + b x264_satd_4x8_8x4_end_neon +endfunc + +function x264_pixel_satd_8x4_neon, export=1 + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + ld1 {v5.8b}, [x2], x3 + ld1 {v4.8b}, [x0], x1 + ld1 {v7.8b}, [x2], x3 + ld1 {v6.8b}, [x0], x1 +endfunc + +function x264_satd_4x8_8x4_end_neon + usubl v0.8h, v0.8b, v1.8b + usubl v1.8h, v2.8b, v3.8b + usubl v2.8h, v4.8b, v5.8b + usubl v3.8h, v6.8b, v7.8b + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + + SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h + SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h + + trn1 v0.8h, v4.8h, v5.8h + trn2 v1.8h, v4.8h, v5.8h + trn1 v2.8h, v6.8h, v7.8h + trn2 v3.8h, v6.8h, v7.8h + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + + trn1 v0.4s, v16.4s, v18.4s + trn2 v1.4s, v16.4s, v18.4s + trn1 v2.4s, v17.4s, v19.4s + trn2 v3.4s, v17.4s, v19.4s + abs v0.8h, v0.8h + abs v1.8h, v1.8h + abs v2.8h, v2.8h + abs v3.8h, v3.8h + umax v0.8h, v0.8h, v1.8h + umax v1.8h, v2.8h, v3.8h + add v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret +endfunc + +function x264_pixel_satd_8x8_neon, export=1 + mov x4, x30 + + bl x264_satd_8x8_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +function x264_pixel_satd_8x16_neon, export=1 + mov x4, x30 + + bl x264_satd_8x8_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v0.8h, v1.8h + + bl x264_satd_8x8_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v31.8h, v0.8h, v1.8h + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +.macro SUMSUBL_AB sum, sub, a, b + uaddl \sum, \a, \b + usubl \sub, \a, \b +.endm + +.macro load_diff_fly_8x8 + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, 
[x0], x1 + usubl v16.8h, v0.8b, v1.8b + ld1 {v5.8b}, [x2], x3 + ld1 {v4.8b}, [x0], x1 + usubl v17.8h, v2.8b, v3.8b + ld1 {v7.8b}, [x2], x3 + ld1 {v6.8b}, [x0], x1 + usubl v18.8h, v4.8b, v5.8b + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + usubl v19.8h, v6.8b, v7.8b + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + usubl v20.8h, v0.8b, v1.8b + ld1 {v5.8b}, [x2], x3 + ld1 {v4.8b}, [x0], x1 + usubl v21.8h, v2.8b, v3.8b + ld1 {v7.8b}, [x2], x3 + ld1 {v6.8b}, [x0], x1 + + SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h + SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h + + usubl v22.8h, v4.8b, v5.8b + usubl v23.8h, v6.8b, v7.8b +.endm + +.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d + SUMSUB_AB \s1, \d1, \a, \b + SUMSUB_AB \s2, \d2, \c, \d +.endm + +.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4 + SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4 + SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4 +.endm + +function x264_satd_8x8_neon + load_diff_fly_8x8 +endfunc + +// one vertical hadamard pass and two horizontal +function x264_satd_8x4v_8x8h_neon + SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h + SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h + + HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h + + transpose v0.8h, v1.8h, v16.8h, v17.8h + transpose v2.8h, v3.8h, v18.8h, v19.8h + transpose v4.8h, v5.8h, v20.8h, v21.8h + transpose v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h + SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h + + transpose v0.4s, v2.4s, v16.4s, v18.4s + transpose v1.4s, v3.4s, v17.4s, v19.4s + transpose v4.4s, v6.4s, v20.4s, v22.4s + transpose v5.4s, v7.4s, v21.4s, v23.4s + + abs v0.8h, v0.8h + abs v1.8h, v1.8h + abs v2.8h, v2.8h + abs v3.8h, v3.8h + abs v4.8h, v4.8h + abs v5.8h, v5.8h + abs v6.8h, v6.8h + abs v7.8h, v7.8h + + umax v0.8h, v0.8h, v2.8h + umax v1.8h, v1.8h, v3.8h + umax v2.8h, v4.8h, v6.8h + umax v3.8h, v5.8h, v7.8h + + ret +endfunc + +function x264_pixel_satd_16x8_neon, export=1 + mov x4, x30 + + bl x264_satd_16x4_neon + add v30.8h, v0.8h, v1.8h + add v31.8h, v2.8h, v3.8h + + bl x264_satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v30.8h, v0.8h + add v31.8h, v31.8h, v1.8h + + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +function x264_pixel_satd_16x16_neon, export=1 + mov x4, x30 + + bl x264_satd_16x4_neon + add v30.8h, v0.8h, v1.8h + add v31.8h, v2.8h, v3.8h + + bl x264_satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v30.8h, v0.8h + add v31.8h, v31.8h, v1.8h + + bl x264_satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v30.8h, v0.8h + add v31.8h, v31.8h, v1.8h + + bl x264_satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v30.8h, v0.8h + add v31.8h, v31.8h, v1.8h + + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +function x264_satd_16x4_neon + ld1 {v1.16b}, [x2], x3 + ld1 {v0.16b}, [x0], x1 + ld1 {v3.16b}, [x2], x3 + ld1 {v2.16b}, [x0], x1 + usubl v16.8h, v0.8b, v1.8b + usubl2 v20.8h, v0.16b, v1.16b + ld1 {v5.16b}, [x2], x3 + ld1 {v4.16b}, [x0], x1 + usubl v17.8h, v2.8b, v3.8b + usubl2 v21.8h, v2.16b, v3.16b + ld1 {v7.16b}, [x2], x3 + ld1 {v6.16b}, [x0], x1 + + usubl v18.8h, v4.8b, v5.8b + usubl2 v22.8h, v4.16b, v5.16b + usubl v19.8h, v6.8b, v7.8b + usubl2 v23.8h, v6.16b, v7.16b + + SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h + SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h + + b 
x264_satd_8x4v_8x8h_neon +endfunc + + +function x264_pixel_sa8d_8x8_neon, export=1 + mov x4, x30 + bl x264_sa8d_8x8_neon + add v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + add w0, w0, #1 + lsr w0, w0, #1 + ret x4 +endfunc + +function x264_pixel_sa8d_16x16_neon, export=1 + mov x4, x30 + bl x264_sa8d_8x8_neon + uaddlp v30.4s, v0.8h + uaddlp v31.4s, v1.8h + bl x264_sa8d_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + sub x0, x0, x1, lsl #4 + sub x2, x2, x3, lsl #4 + add x0, x0, #8 + add x2, x2, #8 + bl x264_sa8d_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + bl x264_sa8d_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + add v0.4s, v30.4s, v31.4s + addv s0, v0.4s + mov w0, v0.s[0] + add w0, w0, #1 + lsr w0, w0, #1 + ret x4 +endfunc + +function x264_sa8d_8x8_neon + load_diff_fly_8x8 + + SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h + SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h + + HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h + SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h + SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h + SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h + SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h + + transpose v20.8h, v21.8h, v16.8h, v17.8h + transpose v4.8h, v5.8h, v0.8h, v1.8h + transpose v22.8h, v23.8h, v18.8h, v19.8h + transpose v6.8h, v7.8h, v2.8h, v3.8h + + SUMSUB_AB v28.8h, v29.8h, v20.8h, v21.8h + SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h + SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h + SUMSUB_AB v26.8h, v27.8h, v6.8h, v7.8h + + transpose v20.4s, v22.4s, v28.4s, v0.4s + transpose v21.4s, v23.4s, v29.4s, v1.4s + transpose v16.4s, v18.4s, v24.4s, v26.4s + transpose v17.4s, v19.4s, v25.4s, v27.4s + + SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h + SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h + SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h + SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h + + transpose v16.2d, v20.2d, v0.2d, v4.2d + transpose v17.2d, v21.2d, v1.2d, v5.2d + transpose v18.2d, v22.2d, v2.2d, v6.2d + transpose v19.2d, v23.2d, v3.2d, v7.2d + + abs v16.8h, v16.8h + abs v20.8h, v20.8h + abs v17.8h, v17.8h + abs v21.8h, v21.8h + abs v18.8h, v18.8h + abs v22.8h, v22.8h + abs v19.8h, v19.8h + abs v23.8h, v23.8h + + umax v16.8h, v16.8h, v20.8h + umax v17.8h, v17.8h, v21.8h + umax v18.8h, v18.8h, v22.8h + umax v19.8h, v19.8h, v23.8h + + add v0.8h, v16.8h, v17.8h + add v1.8h, v18.8h, v19.8h + + ret +endfunc + + +.macro HADAMARD_AC w h +function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1 + movrel x5, mask_ac_4_8 + mov x4, x30 + ld1 {v30.8h,v31.8h}, [x5] + movi v28.16b, #0 + movi v29.16b, #0 + + bl x264_hadamard_ac_8x8_neon +.if \h > 8 + bl x264_hadamard_ac_8x8_neon +.endif +.if \w > 8 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + bl x264_hadamard_ac_8x8_neon +.endif +.if \w * \h == 256 + sub x0, x0, x1, lsl #4 + bl x264_hadamard_ac_8x8_neon +.endif + + addv s1, v29.4s + addv s0, v28.4s + mov w1, v1.s[0] + mov w0, v0.s[0] + lsr w1, w1, #2 + lsr w0, w0, #1 + orr x0, x0, x1, lsl #32 + ret x4 +endfunc +.endm + +HADAMARD_AC 8, 8 +HADAMARD_AC 8, 16 +HADAMARD_AC 16, 8 +HADAMARD_AC 16, 16 + +// v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8 +function x264_hadamard_ac_8x8_neon + ld1 {v16.8b}, [x0], x1 + ld1 {v17.8b}, [x0], x1 + ld1 {v18.8b}, [x0], x1 + ld1 {v19.8b}, [x0], x1 + SUMSUBL_AB v0.8h, v1.8h, v16.8b, v17.8b + ld1 {v20.8b}, [x0], x1 + ld1 {v21.8b}, [x0], x1 + SUMSUBL_AB v2.8h, v3.8h, v18.8b, v19.8b + ld1 {v22.8b}, [x0], x1 + ld1 {v23.8b}, [x0], x1 + SUMSUBL_AB v4.8h, v5.8h, v20.8b, v21.8b + SUMSUBL_AB v6.8h, v7.8h, v22.8b, v23.8b + + SUMSUB_ABCD v16.8h, v18.8h, v17.8h, 
v19.8h, v0.8h, v2.8h, v1.8h, v3.8h + SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h + + transpose v0.8h, v1.8h, v16.8h, v17.8h + transpose v2.8h, v3.8h, v18.8h, v19.8h + transpose v4.8h, v5.8h, v20.8h, v21.8h + transpose v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h + SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h + + transpose v0.4s, v2.4s, v16.4s, v18.4s + transpose v1.4s, v3.4s, v17.4s, v19.4s + transpose v4.4s, v6.4s, v20.4s, v22.4s + transpose v5.4s, v7.4s, v21.4s, v23.4s + + SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h + SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h + SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h + + abs v0.8h, v16.8h + abs v4.8h, v20.8h + abs v1.8h, v17.8h + abs v5.8h, v21.8h + abs v2.8h, v18.8h + abs v6.8h, v22.8h + abs v3.8h, v19.8h + abs v7.8h, v23.8h + + add v0.8h, v0.8h, v4.8h + add v1.8h, v1.8h, v5.8h + and v0.16b, v0.16b, v30.16b + add v2.8h, v2.8h, v6.8h + add v3.8h, v3.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + uadalp v28.4s, v0.8h + uadalp v28.4s, v1.8h + + SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h + SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h + SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h + SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h + + transpose v16.2d, v17.2d, v6.2d, v7.2d + transpose v18.2d, v19.2d, v4.2d, v5.2d + transpose v20.2d, v21.2d, v2.2d, v3.2d + + abs v16.8h, v16.8h + abs v17.8h, v17.8h + abs v18.8h, v18.8h + abs v19.8h, v19.8h + abs v20.8h, v20.8h + abs v21.8h, v21.8h + + transpose v7.2d, v6.2d, v1.2d, v0.2d + + umax v3.8h, v16.8h, v17.8h + umax v2.8h, v18.8h, v19.8h + umax v1.8h, v20.8h, v21.8h + + SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h + + add v2.8h, v2.8h, v3.8h + add v2.8h, v2.8h, v1.8h + and v4.16b, v4.16b, v31.16b + add v2.8h, v2.8h, v2.8h + abs v5.8h, v5.8h + abs v4.8h, v4.8h + add v2.8h, v2.8h, v5.8h + add v2.8h, v2.8h, v4.8h + uadalp v29.4s, v2.8h + ret +endfunc + + +function x264_pixel_ssim_4x4x2_core_neon, export=1 + ld1 {v0.8b}, [x0], x1 + ld1 {v2.8b}, [x2], x3 + umull v16.8h, v0.8b, v0.8b + umull v17.8h, v0.8b, v2.8b + umull v18.8h, v2.8b, v2.8b + + ld1 {v28.8b}, [x0], x1 + ld1 {v29.8b}, [x2], x3 + umull v20.8h, v28.8b, v28.8b + umull v21.8h, v28.8b, v29.8b + umull v22.8h, v29.8b, v29.8b + + uaddlp v16.4s, v16.8h + uaddlp v17.4s, v17.8h + uaddl v0.8h, v0.8b, v28.8b + uadalp v16.4s, v18.8h + uaddl v1.8h, v2.8b, v29.8b + + ld1 {v26.8b}, [x0], x1 + ld1 {v27.8b}, [x2], x3 + umull v23.8h, v26.8b, v26.8b + umull v24.8h, v26.8b, v27.8b + umull v25.8h, v27.8b, v27.8b + + uadalp v16.4s, v20.8h + uaddw v0.8h, v0.8h, v26.8b + uadalp v17.4s, v21.8h + uaddw v1.8h, v1.8h, v27.8b + uadalp v16.4s, v22.8h + + ld1 {v28.8b}, [x0], x1 + ld1 {v29.8b}, [x2], x3 + umull v20.8h, v28.8b, v28.8b + umull v21.8h, v28.8b, v29.8b + umull v22.8h, v29.8b, v29.8b + + uadalp v16.4s, v23.8h + uaddw v0.8h, v0.8h, v28.8b + uadalp v17.4s, v24.8h + uaddw v1.8h, v1.8h, v29.8b + uadalp v16.4s, v25.8h + + uadalp v16.4s, v20.8h + uadalp v17.4s, v21.8h + uadalp v16.4s, v22.8h + + uaddlp v0.4s, v0.8h + uaddlp v1.4s, v1.8h + + addp v0.4s, v0.4s, v0.4s + addp v1.4s, v1.4s, v1.4s + addp v2.4s, v16.4s, v16.4s + addp v3.4s, v17.4s, v17.4s + + st4 {v0.2s,v1.2s,v2.2s,v3.2s}, [x4] + ret +endfunc + +function x264_pixel_ssim_end4_neon, export=1 + mov x5, #4 + ld1 {v16.4s,v17.4s}, [x0], #32 + ld1 {v18.4s,v19.4s}, [x1], #32 + mov w4, #0x99bb + subs x2, x5, w2, uxtw + mov w3, #416 // ssim_c1 = .01*.01*255*255*64 + movk w4, #0x03, lsl #16 // ssim_c2 = 
.03*.03*255*255*64*63 + add v0.4s, v16.4s, v18.4s + add v1.4s, v17.4s, v19.4s + add v0.4s, v0.4s, v1.4s + ld1 {v20.4s,v21.4s}, [x0], #32 + ld1 {v22.4s,v23.4s}, [x1], #32 + add v2.4s, v20.4s, v22.4s + add v3.4s, v21.4s, v23.4s + add v1.4s, v1.4s, v2.4s + ld1 {v16.4s}, [x0], #16 + ld1 {v18.4s}, [x1], #16 + add v16.4s, v16.4s, v18.4s + add v2.4s, v2.4s, v3.4s + add v3.4s, v3.4s, v16.4s + + dup v30.4s, w3 + dup v31.4s, w4 + + transpose v4.4s, v5.4s, v0.4s, v1.4s + transpose v6.4s, v7.4s, v2.4s, v3.4s + transpose v0.2d, v2.2d, v4.2d, v6.2d + transpose v1.2d, v3.2d, v5.2d, v7.2d + + mul v16.4s, v0.4s, v1.4s // s1*s2 + mul v0.4s, v0.4s, v0.4s + mla v0.4s, v1.4s, v1.4s // s1*s1 + s2*s2 + + shl v3.4s, v3.4s, #7 + shl v2.4s, v2.4s, #6 + add v1.4s, v16.4s, v16.4s + + sub v2.4s, v2.4s, v0.4s // vars + sub v3.4s, v3.4s, v1.4s // covar*2 + add v0.4s, v0.4s, v30.4s + add v2.4s, v2.4s, v31.4s + add v1.4s, v1.4s, v30.4s + add v3.4s, v3.4s, v31.4s + + scvtf v0.4s, v0.4s + scvtf v2.4s, v2.4s + scvtf v1.4s, v1.4s + scvtf v3.4s, v3.4s + + fmul v0.4s, v0.4s, v2.4s + fmul v1.4s, v1.4s, v3.4s + + fdiv v0.4s, v1.4s, v0.4s + + b.eq 1f + movrel x3, mask + add x3, x3, x2, lsl #2 + ld1 {v29.4s}, [x3] + and v0.16b, v0.16b, v29.16b +1: + faddp v0.4s, v0.4s, v0.4s + faddp s0, v0.2s + ret +endfunc
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/pixel.h
Added
@@ -0,0 +1,69 @@ +/***************************************************************************** + * pixel.h: aarch64 pixel metrics + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad <lessen42@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_AARCH64_PIXEL_H +#define X264_AARCH64_PIXEL_H + +#define DECL_PIXELS( ret, name, suffix, args ) \ + ret x264_pixel_##name##_16x16_##suffix args;\ + ret x264_pixel_##name##_16x8_##suffix args;\ + ret x264_pixel_##name##_8x16_##suffix args;\ + ret x264_pixel_##name##_8x8_##suffix args;\ + ret x264_pixel_##name##_8x4_##suffix args;\ + ret x264_pixel_##name##_4x8_##suffix args;\ + ret x264_pixel_##name##_4x4_##suffix args;\ + +#define DECL_X1( name, suffix ) \ + DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) ) + +#define DECL_X4( name, suffix ) \ + DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\ + DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) ) + +DECL_X1( sad, neon ) +DECL_X4( sad, neon ) +DECL_X1( satd, neon ) +DECL_X1( ssd, neon ) + +int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); + +uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t ); +int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); + +uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t ); + +void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t, + const uint8_t *, intptr_t, + int sums[2][4] ); +float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width ); + +#endif
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/predict-a.S
Added
@@ -0,0 +1,661 @@ +/***************************************************************************** + * predict.S: aarch64 intra prediction + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad <lessen42@gmail.com> + * Mans Rullgard <mans@mansr.com> + * Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" + +const p8weight, align=4 + .short 1, 2, 3, 4, 1, 2, 3, 4 +endconst +const p16weight, align=4 + .short 1, 2, 3, 4, 5, 6, 7, 8 +endconst + +.macro ldcol.8 vd, xn, xm, n=8, hi=0 +.if \n == 8 || \hi == 0 + ld1 {\vd\().b}[0], [\xn], \xm + ld1 {\vd\().b}[1], [\xn], \xm + ld1 {\vd\().b}[2], [\xn], \xm + ld1 {\vd\().b}[3], [\xn], \xm +.endif +.if \n == 8 || \hi == 1 + ld1 {\vd\().b}[4], [\xn], \xm + ld1 {\vd\().b}[5], [\xn], \xm + ld1 {\vd\().b}[6], [\xn], \xm + ld1 {\vd\().b}[7], [\xn], \xm +.endif +.endm + +.macro ldcol.16 vd, xn, xm + ldcol.8 \vd, \xn, \xm + ld1 {\vd\().b}[ 8], [\xn], \xm + ld1 {\vd\().b}[ 9], [\xn], \xm + ld1 {\vd\().b}[10], [\xn], \xm + ld1 {\vd\().b}[11], [\xn], \xm + ld1 {\vd\().b}[12], [\xn], \xm + ld1 {\vd\().b}[13], [\xn], \xm + ld1 {\vd\().b}[14], [\xn], \xm + ld1 {\vd\().b}[15], [\xn], \xm +.endm + + +function x264_predict_4x4_h_aarch64, export=1 + ldrb w1, [x0, #0*FDEC_STRIDE-1] + ldrb w2, [x0, #1*FDEC_STRIDE-1] + ldrb w3, [x0, #2*FDEC_STRIDE-1] + ldrb w4, [x0, #3*FDEC_STRIDE-1] + add w1, w1, w1, lsl #8 + add w2, w2, w2, lsl #8 + add w3, w3, w3, lsl #8 + add w4, w4, w4, lsl #8 + add w1, w1, w1, lsl #16 + str w1, [x0, #0*FDEC_STRIDE] + add w2, w2, w2, lsl #16 + str w2, [x0, #1*FDEC_STRIDE] + add w3, w3, w3, lsl #16 + str w3, [x0, #2*FDEC_STRIDE] + add w4, w4, w4, lsl #16 + str w4, [x0, #3*FDEC_STRIDE] + ret +endfunc + +function x264_predict_4x4_v_aarch64, export=1 + ldr w1, [x0, #0 - 1 * FDEC_STRIDE] + str w1, [x0, #0 + 0 * FDEC_STRIDE] + str w1, [x0, #0 + 1 * FDEC_STRIDE] + str w1, [x0, #0 + 2 * FDEC_STRIDE] + str w1, [x0, #0 + 3 * FDEC_STRIDE] + ret +endfunc + +function x264_predict_4x4_dc_neon, export=1 + sub x1, x0, #FDEC_STRIDE + sub x2, x0, #1 + mov x7, #FDEC_STRIDE + ld1 {v0.8b}, [x1] + ld1r {v1.8b}, [x2], x7 + ld1r {v2.8b}, [x2], x7 + ld1r {v3.8b}, [x2], x7 + ld1r {v4.8b}, [x2], x7 + uaddlp v0.4h, v0.8b + uaddl v1.8h, v1.8b, v2.8b + uaddl v2.8h, v3.8b, v4.8b + addp v0.4h, v0.4h, v0.4h + add v1.4h, v1.4h, v2.4h + dup v0.4h, v0.h[0] + add v0.4h, v0.4h, v1.4h + rshrn v0.8b, v0.8h, #3 + str s0, [x0], #FDEC_STRIDE + str s0, [x0], #FDEC_STRIDE + str s0, [x0], #FDEC_STRIDE + str s0, [x0] + ret +endfunc + +function 
x264_predict_4x4_dc_top_neon, export=1 + sub x1, x0, #FDEC_STRIDE + mov x7, #FDEC_STRIDE + ld1 {v0.8b}, [x1] + uaddlp v0.4h, v0.8b + addp v0.4h, v0.4h, v0.4h + dup v0.4h, v0.h[0] + rshrn v0.8b, v0.8h, #2 + str s0, [x0], #FDEC_STRIDE + str s0, [x0], #FDEC_STRIDE + str s0, [x0], #FDEC_STRIDE + str s0, [x0] + ret +endfunc + +function x264_predict_4x4_ddr_neon, export=1 + sub x1, x0, #FDEC_STRIDE+1 + mov x7, #FDEC_STRIDE + ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1 + ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1 + ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1 + ext v0.8b, v1.8b, v0.8b, #7 + ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1 + ext v0.8b, v2.8b, v0.8b, #7 // a + ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1 + ext v1.8b, v3.8b, v0.8b, #7 // b + ext v2.8b, v4.8b, v1.8b, #7 // c + uaddl v0.8h, v0.8b, v1.8b + uaddl v1.8h, v1.8b, v2.8b + add v0.8h, v0.8h, v1.8h + rshrn v0.8b, v0.8h, #2 + + ext v3.8b, v0.8b, v0.8b, #3 + ext v2.8b, v0.8b, v0.8b, #2 + ext v1.8b, v0.8b, v0.8b, #1 + + str s3, [x0], #FDEC_STRIDE + str s2, [x0], #FDEC_STRIDE + str s1, [x0], #FDEC_STRIDE + str s0, [x0] + ret +endfunc + +function x264_predict_4x4_ddl_neon, export=1 + sub x0, x0, #FDEC_STRIDE + mov x7, #FDEC_STRIDE + ld1 {v0.8b}, [x0], x7 + dup v3.8b, v0.b[7] + ext v1.8b, v0.8b, v0.8b, #1 + ext v2.8b, v0.8b, v3.8b, #2 + uhadd v0.8b, v0.8b, v2.8b + urhadd v0.8b, v0.8b, v1.8b + str s0, [x0], #FDEC_STRIDE + ext v1.8b, v0.8b, v0.8b, #1 + ext v2.8b, v0.8b, v0.8b, #2 + str s1, [x0], #FDEC_STRIDE + ext v3.8b, v0.8b, v0.8b, #3 + str s2, [x0], #FDEC_STRIDE + str s3, [x0] + ret +endfunc + +function x264_predict_8x8_dc_neon, export=1 + mov x7, #FDEC_STRIDE + ld1 {v0.16b}, [x1], #16 + ld1 {v1.8b}, [x1] + ext v0.16b, v0.16b, v0.16b, #7 + uaddlv h1, v1.8b + uaddlv h0, v0.8b + add v0.8h, v0.8h, v1.8h + dup v0.8h, v0.h[0] + rshrn v0.8b, v0.8h, #4 +.rept 8 + st1 {v0.8b}, [x0], x7 +.endr + ret +endfunc + +function x264_predict_8x8_h_neon, export=1 + mov x7, #FDEC_STRIDE + ld1 {v16.16b}, [x1] + dup v0.8b, v16.b[14] + dup v1.8b, v16.b[13] + st1 {v0.8b}, [x0], x7 + dup v2.8b, v16.b[12] + st1 {v1.8b}, [x0], x7 + dup v3.8b, v16.b[11] + st1 {v2.8b}, [x0], x7 + dup v4.8b, v16.b[10] + st1 {v3.8b}, [x0], x7 + dup v5.8b, v16.b[9] + st1 {v4.8b}, [x0], x7 + dup v6.8b, v16.b[8] + st1 {v5.8b}, [x0], x7 + dup v7.8b, v16.b[7] + st1 {v6.8b}, [x0], x7 + st1 {v7.8b}, [x0], x7 + ret +endfunc + +function x264_predict_8x8_v_neon, export=1 + add x1, x1, #16 + mov x7, #FDEC_STRIDE + ld1 {v0.8b}, [x1] +.rept 8 + st1 {v0.8b}, [x0], x7 +.endr + ret +endfunc + +function x264_predict_8x8_ddl_neon, export=1 + add x1, x1, #16 + mov x7, #FDEC_STRIDE + ld1 {v0.16b}, [x1] + movi v3.16b, #0 + dup v2.16b, v0.b[15] + ext v4.16b, v3.16b, v0.16b, #15 + ext v2.16b, v0.16b, v2.16b, #1 + uhadd v4.16b, v4.16b, v2.16b + urhadd v0.16b, v0.16b, v4.16b + ext v1.16b, v0.16b, v0.16b, #1 + ext v2.16b, v0.16b, v0.16b, #2 + st1 {v1.8b}, [x0], x7 + ext v3.16b, v0.16b, v0.16b, #3 + st1 {v2.8b}, [x0], x7 + ext v4.16b, v0.16b, v0.16b, #4 + st1 {v3.8b}, [x0], x7 + ext v5.16b, v0.16b, v0.16b, #5 + st1 {v4.8b}, [x0], x7 + ext v6.16b, v0.16b, v0.16b, #6 + st1 {v5.8b}, [x0], x7 + ext v7.16b, v0.16b, v0.16b, #7 + st1 {v6.8b}, [x0], x7 + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v7.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 + ret +endfunc + +function x264_predict_8x8_ddr_neon, export=1 + ld1 {v0.16b,v1.16b}, [x1] + ext v2.16b, v0.16b, v1.16b, #7 + ext v4.16b, v0.16b, v1.16b, #9 + ext v3.16b, v0.16b, v1.16b, #8 + + uhadd v2.16b, v2.16b, v4.16b + urhadd v7.16b, v3.16b, v2.16b + + add x0, x0, #7*FDEC_STRIDE + mov x7, 
#-1*FDEC_STRIDE + + ext v6.16b, v7.16b, v7.16b, #1 + st1 {v7.8b}, [x0], x7 + ext v5.16b, v7.16b, v7.16b, #2 + st1 {v6.8b}, [x0], x7 + ext v4.16b, v7.16b, v7.16b, #3 + st1 {v5.8b}, [x0], x7 + ext v3.16b, v7.16b, v7.16b, #4 + st1 {v4.8b}, [x0], x7 + ext v2.16b, v7.16b, v7.16b, #5 + st1 {v3.8b}, [x0], x7 + ext v1.16b, v7.16b, v7.16b, #6 + st1 {v2.8b}, [x0], x7 + ext v0.16b, v7.16b, v7.16b, #7 + st1 {v1.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 + ret +endfunc + +function x264_predict_8x8_vl_neon, export=1 + add x1, x1, #16 + mov x7, #FDEC_STRIDE + + ld1 {v0.16b}, [x1] + ext v1.16b, v1.16b, v0.16b, #15 + ext v2.16b, v0.16b, v2.16b, #1 + + uhadd v1.16b, v1.16b, v2.16b + urhadd v3.16b, v0.16b, v2.16b + + urhadd v0.16b, v0.16b, v1.16b + + ext v4.16b, v0.16b, v0.16b, #1 + st1 {v3.8b}, [x0], x7 + ext v5.16b, v3.16b, v3.16b, #1 + st1 {v4.8b}, [x0], x7 + ext v6.16b, v0.16b, v0.16b, #2 + st1 {v5.8b}, [x0], x7 + ext v7.16b, v3.16b, v3.16b, #2 + st1 {v6.8b}, [x0], x7 + ext v4.16b, v0.16b, v0.16b, #3 + st1 {v7.8b}, [x0], x7 + ext v5.16b, v3.16b, v3.16b, #3 + st1 {v4.8b}, [x0], x7 + ext v6.16b, v0.16b, v0.16b, #4 + st1 {v5.8b}, [x0], x7 + st1 {v6.8b}, [x0], x7 + ret +endfunc + +function x264_predict_8x8_vr_neon, export=1 + add x1, x1, #8 + mov x7, #FDEC_STRIDE + ld1 {v2.16b}, [x1] + + ext v1.16b, v2.16b, v2.16b, #14 + ext v0.16b, v2.16b, v2.16b, #15 + + uhadd v3.16b, v2.16b, v1.16b + urhadd v2.16b, v2.16b, v0.16b + urhadd v0.16b, v0.16b, v3.16b + + ext v1.16b, v2.16b, v2.16b, #8 + uzp1 v2.8b, v0.8b, v0.8b + uzp2 v3.8b, v0.8b, v0.8b + ext v0.16b, v0.16b, v0.16b, #8 + + st1 {v1.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 + ext v4.8b, v3.8b, v1.8b, #7 + ext v5.8b, v2.8b, v0.8b, #7 + st1 {v4.8b}, [x0], x7 + st1 {v5.8b}, [x0], x7 + ext v6.8b, v3.8b, v1.8b, #6 + ext v7.8b, v2.8b, v0.8b, #6 + st1 {v6.8b}, [x0], x7 + st1 {v7.8b}, [x0], x7 + ext v1.8b, v3.8b, v1.8b, #5 + ext v0.8b, v2.8b, v0.8b, #5 + st1 {v1.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 + ret +endfunc + +function x264_predict_8x8_hd_neon, export=1 + add x1, x1, #7 + mov x7, #FDEC_STRIDE + + ld1 {v1.16b}, [x1] + ext v3.16b, v1.16b, v1.16b, #1 + ext v2.16b, v1.16b, v1.16b, #2 + + urhadd v4.16b, v1.16b, v3.16b + + uhadd v1.16b, v1.16b, v2.16b + urhadd v0.16b, v1.16b, v3.16b + + zip1 v16.8b, v4.8b, v0.8b + zip2 v17.8b, v4.8b, v0.8b + ext v7.16b, v0.16b, v0.16b, #8 + + ext v0.8b, v17.8b, v7.8b, #6 + ext v1.8b, v17.8b, v7.8b, #4 + st1 {v0.8b}, [x0], x7 + ext v2.8b, v17.8b, v7.8b, #2 + st1 {v1.8b}, [x0], x7 + st1 {v2.8b}, [x0], x7 + ext v3.8b, v16.8b, v17.8b, #6 + st1 {v17.8b}, [x0], x7 + ext v4.8b, v16.8b, v17.8b, #4 + st1 {v3.8b}, [x0], x7 + ext v5.8b, v16.8b, v17.8b, #2 + st1 {v4.8b}, [x0], x7 + st1 {v5.8b}, [x0], x7 + st1 {v16.8b}, [x0], x7 + + ret +endfunc + +function x264_predict_8x8_hu_neon, export=1 + add x1, x1, #7 + mov x7, #FDEC_STRIDE + ld1 {v7.8b}, [x1] + dup v6.8b, v7.b[0] + rev64 v7.8b, v7.8b + + ext v4.8b, v7.8b, v6.8b, #2 + ext v2.8b, v7.8b, v6.8b, #1 + + uhadd v5.8b, v7.8b, v4.8b + urhadd v0.8b, v2.8b, v7.8b + urhadd v1.8b, v5.8b, v2.8b + + zip1 v16.8b, v0.8b, v1.8b + zip2 v17.8b, v0.8b, v1.8b + + dup v18.4h, v17.h[3] + + ext v0.8b, v16.8b, v17.8b, #2 + ext v1.8b, v16.8b, v17.8b, #4 + ext v2.8b, v16.8b, v17.8b, #6 + st1 {v16.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x0], x7 + st1 {v2.8b}, [x0], x7 + + ext v4.8b, v17.8b, v18.8b, #2 + ext v5.8b, v17.8b, v18.8b, #4 + ext v6.8b, v17.8b, v18.8b, #6 + st1 {v17.8b}, [x0], x7 + st1 {v4.8b}, [x0], x7 + st1 {v5.8b}, [x0], x7 + st1 {v6.8b}, [x0] + ret +endfunc + + +function x264_predict_8x8c_dc_top_neon, 
export=1 + sub x2, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + ld1 {v0.8b}, [x2] + uaddlp v0.4h, v0.8b + addp v0.4h, v0.4h, v0.4h + rshrn v0.8b, v0.8h, #2 + dup v3.8b, v0.b[1] + dup v2.8b, v0.b[0] + transpose v0.2s, v1.2s, v2.2s, v3.2s + b pred8x8c_dc_end +endfunc + +function x264_predict_8x8c_dc_left_neon, export=1 + sub x2, x0, #1 + mov x1, #FDEC_STRIDE + ldcol.8 v0, x2, x1 + uaddlp v0.4h, v0.8b + addp v0.4h, v0.4h, v0.4h + rshrn v0.8b, v0.8h, #2 + dup v1.8b, v0.b[1] + dup v0.8b, v0.b[0] + b pred8x8c_dc_end +endfunc + +function x264_predict_8x8c_dc_neon, export=1 + sub x2, x0, #FDEC_STRIDE + sub x3, x0, #1 + mov x1, #FDEC_STRIDE + ld1 {v2.8b}, [x2] + ldcol.8 v3, x3, x1 + transpose v0.2s, v1.2s, v2.2s, v3.2s + uaddlp v0.4h, v0.8b // s0, s2 + uaddlp v1.4h, v1.8b // s1, s3 + addp v0.4h, v0.4h, v1.4h // s0, s2, s1, s3 + addp v1.4h, v0.4h, v0.4h + rshrn v2.8b, v0.8h, #2 + rshrn v3.8b, v1.8h, #3 + dup v5.8b, v2.b[2] // dc1 + dup v6.8b, v3.b[1] // dc2 + dup v4.8b, v3.b[0] // dc0 + dup v7.8b, v2.b[3] // dc3 + trn1 v0.2s, v4.2s, v5.2s + trn1 v1.2s, v7.2s, v6.2s +pred8x8c_dc_end: + add x2, x0, x1, lsl #2 +.rept 4 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x2], x1 +.endr + ret +endfunc + +function x264_predict_8x8c_h_neon, export=1 + sub x1, x0, #1 + mov x7, #FDEC_STRIDE +.rept 4 + ld1r {v0.8b}, [x1], x7 + ld1r {v1.8b}, [x1], x7 + st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x0], x7 +.endr + ret +endfunc + +function x264_predict_8x8c_v_neon, export=1 + sub x0, x0, #FDEC_STRIDE + mov x7, #FDEC_STRIDE + ld1 {v0.8b}, [x0], x7 +.rept 8 + st1 {v0.8b}, [x0], x7 +.endr + ret +endfunc + +function x264_predict_8x8c_p_neon, export=1 + sub x3, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + add x2, x3, #4 + sub x3, x3, #1 + ld1 {v0.s}[0], [x3] + ld1 {v2.s}[0], [x2], x1 + ldcol.8 v0, x3, x1, 4, hi=1 + add x3, x3, x1 + ldcol.8 v3, x3, x1, 4 + movrel x4, p8weight + movrel x5, p16weight + uaddl v4.8h, v2.8b, v3.8b + rev32 v0.8b, v0.8b + trn1 v2.2s, v2.2s, v3.2s + ld1 {v7.8h}, [x4] + usubl v2.8h, v2.8b, v0.8b + mul v2.8h, v2.8h, v7.8h + ld1 {v0.8h}, [x5] + saddlp v2.4s, v2.8h + addp v2.4s, v2.4s, v2.4s + shl v3.2s, v2.2s, #4 + add v2.2s, v2.2s, v3.2s + rshrn v5.4h, v2.4s, #5 // b, c, x, x + addp v2.4h, v5.4h, v5.4h + shl v3.4h, v2.4h, #2 + sub v3.4h, v3.4h, v2.4h // 3 * (b + c) + rev64 v4.4h, v4.4h + add v4.4h, v4.4h, v0.4h + shl v2.4h, v4.4h, #4 // a + sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16 + ext v0.16b, v0.16b, v0.16b, #14 + sub v6.4h, v5.4h, v3.4h + mov v0.h[0], wzr + mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b + dup v1.8h, v2.h[0] // pix + dup v2.8h, v5.h[1] // c + add v1.8h, v1.8h, v0.8h // pix + x*b + mov x3, #8 +1: + subs x3, x3, #1 + sqshrun v0.8b, v1.8h, #5 + add v1.8h, v1.8h, v2.8h + st1 {v0.8b}, [x0], x1 + b.ne 1b + ret +endfunc + + +function x264_predict_16x16_dc_top_neon, export=1 + sub x2, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + ld1 {v0.16b}, [x2] + uaddlv h0, v0.16b + rshrn v0.8b, v0.8h, #4 + dup v0.16b, v0.b[0] + b pred16x16_dc_end +endfunc + +function x264_predict_16x16_dc_left_neon, export=1 + sub x2, x0, #1 + mov x1, #FDEC_STRIDE + ldcol.16 v0, x2, x1 + uaddlv h0, v0.16b + rshrn v0.8b, v0.8h, #4 + dup v0.16b, v0.b[0] + b pred16x16_dc_end +endfunc + +function x264_predict_16x16_dc_neon, export=1 + sub x3, x0, #FDEC_STRIDE + sub x2, x0, #1 + mov x1, #FDEC_STRIDE + ld1 {v0.16b}, [x3] + ldcol.16 v1, x2, x1 + uaddlv h0, v0.16b + uaddlv h1, v1.16b + add v0.4h, v0.4h, v1.4h + rshrn v0.8b, v0.8h, #5 + dup v0.16b, v0.b[0] +pred16x16_dc_end: +.rept 16 + st1 {v0.16b}, [x0], x1 +.endr + ret +endfunc + 
+function x264_predict_16x16_h_neon, export=1 + sub x1, x0, #1 + mov x7, #FDEC_STRIDE +.rept 8 + ld1r {v0.16b}, [x1], x7 + ld1r {v1.16b}, [x1], x7 + st1 {v0.16b}, [x0], x7 + st1 {v1.16b}, [x0], x7 +.endr + ret +endfunc + +function x264_predict_16x16_v_neon, export=1 + sub x0, x0, #FDEC_STRIDE + mov x7, #FDEC_STRIDE + ld1 {v0.16b}, [x0], x7 +.rept 16 + st1 {v0.16b}, [x0], x7 +.endr + ret +endfunc + +function x264_predict_16x16_p_neon, export=1 + sub x3, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + add x2, x3, #8 + sub x3, x3, #1 + ld1 {v0.8b}, [x3] + ld1 {v2.8b}, [x2], x1 + ldcol.8 v1, x3, x1 + add x3, x3, x1 + ldcol.8 v3, x3, x1 + rev64 v0.8b, v0.8b + rev64 v1.8b, v1.8b + movrel x4, p16weight + uaddl v4.8h, v2.8b, v3.8b + ld1 {v7.8h}, [x4] + usubl v2.8h, v2.8b, v0.8b + usubl v3.8h, v3.8b, v1.8b + mul v2.8h, v2.8h, v7.8h + mul v3.8h, v3.8h, v7.8h + saddlp v2.4s, v2.8h + saddlp v3.4s, v3.8h + addp v2.4s, v2.4s, v3.4s + addp v2.4s, v2.4s, v2.4s + shl v3.2s, v2.2s, #2 + add v2.2s, v2.2s, v3.2s + rshrn v5.4h, v2.4s, #6 // b, c, x, x + addp v2.4h, v5.4h, v5.4h + shl v3.4h, v2.4h, #3 + sub v3.4h, v3.4h, v2.4h // 7 * (b + c) + ext v4.16b, v4.16b, v4.16b, #14 + add v4.4h, v4.4h, v7.4h + shl v2.4h, v4.4h, #4 // a + sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16 + ext v7.16b, v7.16b, v7.16b, #14 + mov v7.h[0], wzr + dup v3.8h, v5.h[0] + mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b + dup v1.8h, v2.h[0] // pix + dup v2.8h, v5.h[1] // c + shl v3.8h, v3.8h, #3 + add v1.8h, v1.8h, v0.8h // pix + x*b + add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b + mov x3, #16 +1: + subs x3, x3, #1 + sqshrun v0.8b, v1.8h, #5 + add v1.8h, v1.8h, v2.8h + sqshrun2 v0.16b, v3.8h, #5 + add v3.8h, v3.8h, v2.8h + st1 {v0.16b}, [x0], x1 + b.ne 1b + ret +endfunc
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/predict-c.c
Added
@@ -0,0 +1,114 @@ +/***************************************************************************** + * predict.c: aarch64 intra prediction + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad <lessen42@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "predict.h" +#include "pixel.h" + +void x264_predict_4x4_dc_top_neon( uint8_t *src ); +void x264_predict_4x4_ddr_neon( uint8_t *src ); +void x264_predict_4x4_ddl_neon( uint8_t *src ); + +void x264_predict_8x8c_dc_top_neon( uint8_t *src ); +void x264_predict_8x8c_dc_left_neon( uint8_t *src ); +void x264_predict_8x8c_p_neon( uint8_t *src ); + +void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] ); + +void x264_predict_16x16_dc_top_neon( uint8_t *src ); +void x264_predict_16x16_dc_left_neon( uint8_t *src ); +void x264_predict_16x16_p_neon( uint8_t *src ); + +void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] ) +{ +#if !HIGH_BIT_DEPTH + if (cpu&X264_CPU_ARMV8) + { + pf[I_PRED_4x4_H] = x264_predict_4x4_h_aarch64; + pf[I_PRED_4x4_V] = x264_predict_4x4_v_aarch64; + } + + if (cpu&X264_CPU_NEON) + { + pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_neon; + pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon; + pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon; + pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_neon; + } +#endif // !HIGH_BIT_DEPTH +} + +void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] ) +{ + if (!(cpu&X264_CPU_NEON)) + return; + +#if !HIGH_BIT_DEPTH + pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_neon; + pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_neon; + pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon; + pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon; + pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon; + pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon; +#endif // !HIGH_BIT_DEPTH +} + +void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) +{ + if (!(cpu&X264_CPU_NEON)) + return; + +#if !HIGH_BIT_DEPTH + pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon; + pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_neon; + pf[I_PRED_8x8_VR] = 
x264_predict_8x8_vr_neon; + pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon; + pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon; + pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_neon; + pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_neon; + pf[I_PRED_8x8_V] = x264_predict_8x8_v_neon; +#endif // !HIGH_BIT_DEPTH +} + +void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] ) +{ + if (!(cpu&X264_CPU_NEON)) + return; + +#if !HIGH_BIT_DEPTH + pf[I_PRED_16x16_DC ] = x264_predict_16x16_dc_neon; + pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon; + pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon; + pf[I_PRED_16x16_H ] = x264_predict_16x16_h_neon; + pf[I_PRED_16x16_V ] = x264_predict_16x16_v_neon; + pf[I_PRED_16x16_P ] = x264_predict_16x16_p_neon; +#endif // !HIGH_BIT_DEPTH +}
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/predict.h
Added
@@ -0,0 +1,52 @@ +/***************************************************************************** + * predict.h: aarch64 intra prediction + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad <lessen42@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_AARCH64_PREDICT_H +#define X264_AARCH64_PREDICT_H + +void x264_predict_4x4_h_aarch64( uint8_t *src ); +void x264_predict_4x4_v_aarch64( uint8_t *src ); + +// for the merged 4x4 intra sad/satd which expects unified suffix +#define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64 +#define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64 + +void x264_predict_4x4_dc_neon( uint8_t *src ); +void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8c_dc_neon( uint8_t *src ); +void x264_predict_8x8c_h_neon( uint8_t *src ); +void x264_predict_8x8c_v_neon( uint8_t *src ); +void x264_predict_16x16_v_neon( uint8_t *src ); +void x264_predict_16x16_h_neon( uint8_t *src ); +void x264_predict_16x16_dc_neon( uint8_t *src ); + +void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] ); +void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); +void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] ); +void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] ); + +#endif /* X264_AARCH64_PREDICT_H */
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/quant-a.S
Added
@@ -0,0 +1,386 @@ +/**************************************************************************** + * quant.S: arm quantization and level-run + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad <lessen42@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" + +.macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask + add v18.8h, v18.8h, \bias0 + add v19.8h, v19.8h, \bias1 + umull v20.4s, v18.4h, \mf0_1\().4h + umull2 v21.4s, v18.8h, \mf0_1\().8h + umull v22.4s, v19.4h, \mf2_3\().4h + umull2 v23.4s, v19.8h, \mf2_3\().8h + sshr v16.8h, v16.8h, #15 + sshr v17.8h, v17.8h, #15 + shrn v18.4h, v20.4s, #16 + shrn2 v18.8h, v21.4s, #16 + shrn v19.4h, v22.4s, #16 + shrn2 v19.8h, v23.4s, #16 + eor v18.16b, v18.16b, v16.16b + eor v19.16b, v19.16b, v17.16b + sub v18.8h, v18.8h, v16.8h + sub v19.8h, v19.8h, v17.8h + orr \mask, v18.16b, v19.16b + st1 {v18.8h,v19.8h}, [x0], #32 +.endm + +.macro QUANT_END d + fmov x2, \d + mov w0, #0 + tst x2, x2 + cinc w0, w0, ne + ret +.endm + +// quant_2x2_dc( int16_t dct[4], int mf, int bias ) +function x264_quant_2x2_dc_neon, export=1 + ld1 {v0.4h}, [x0] + dup v2.4h, w2 + dup v1.4h, w1 + abs v3.4h, v0.4h + add v3.4h, v3.4h, v2.4h + umull v3.4s, v3.4h, v1.4h + sshr v0.4h, v0.4h, #15 + shrn v3.4h, v3.4s, #16 + eor v3.8b, v3.8b, v0.8b + sub v3.4h, v3.4h, v0.4h + st1 {v3.4h}, [x0] + QUANT_END d3 +endfunc + +// quant_4x4_dc( int16_t dct[16], int mf, int bias ) +function x264_quant_4x4_dc_neon, export=1 + ld1 {v16.8h,v17.8h}, [x0] + abs v18.8h, v16.8h + abs v19.8h, v17.8h + dup v0.8h, w2 + dup v2.8h, w1 + QUANT_TWO v0.8h, v0.8h, v2, v2, v0.16b + uqxtn v0.8b, v0.8h + QUANT_END d0 +endfunc + +// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) +function x264_quant_4x4_neon, export=1 + ld1 {v16.8h,v17.8h}, [x0] + abs v18.8h, v16.8h + abs v19.8h, v17.8h + ld1 {v0.8h,v1.8h}, [x2] + ld1 {v2.8h,v3.8h}, [x1] + QUANT_TWO v0.8h, v1.8h, v2, v3, v0.16b + uqxtn v0.8b, v0.8h + QUANT_END d0 +endfunc + +// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ) +function x264_quant_4x4x4_neon, export=1 + ld1 {v16.8h,v17.8h}, [x0] + abs v18.8h, v16.8h + abs v19.8h, v17.8h + ld1 {v0.8h,v1.8h}, [x2] + ld1 {v2.8h,v3.8h}, [x1] + QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b + ld1 {v16.8h,v17.8h}, [x0] + abs v18.8h, v16.8h + abs v19.8h, v17.8h + QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b + ld1 {v16.8h,v17.8h}, [x0] + abs v18.8h, v16.8h + abs v19.8h, v17.8h + QUANT_TWO v0.8h, v1.8h, v2, v3, v6.16b + ld1 {v16.8h,v17.8h}, [x0] + abs v18.8h, v16.8h + abs v19.8h, v17.8h + QUANT_TWO v0.8h, 
v1.8h, v2, v3, v7.16b + uqxtn v4.8b, v4.8h + uqxtn v7.8b, v7.8h + uqxtn v6.8b, v6.8h + uqxtn v5.8b, v5.8h + fmov x7, d7 + fmov x6, d6 + fmov x5, d5 + fmov x4, d4 + mov w0, #0 + tst x7, x7 + cinc w0, w0, ne + lsl w0, w0, #1 + tst x6, x6 + cinc w0, w0, ne + lsl w0, w0, #1 + tst x5, x5 + cinc w0, w0, ne + lsl w0, w0, #1 + tst x4, x4 + cinc w0, w0, ne + ret +endfunc + +// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) +function x264_quant_8x8_neon, export=1 + ld1 {v16.8h,v17.8h}, [x0] + abs v18.8h, v16.8h + abs v19.8h, v17.8h + ld1 {v0.8h,v1.8h}, [x2], #32 + ld1 {v2.8h,v3.8h}, [x1], #32 + QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b +.rept 3 + ld1 {v16.8h,v17.8h}, [x0] + abs v18.8h, v16.8h + abs v19.8h, v17.8h + ld1 {v0.8h,v1.8h}, [x2], #32 + ld1 {v2.8h,v3.8h}, [x1], #32 + QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b + orr v4.16b, v4.16b, v5.16b +.endr + uqxtn v0.8b, v4.8h + QUANT_END d0 +endfunc + +.macro DEQUANT_START mf_size offset dc=no + mov w3, #0x2b + mul w3, w3, w2 + lsr w3, w3, #8 // i_qbits = i_qp / 6 + add w5, w3, w3, lsl #1 + sub w2, w2, w5, lsl #1 // i_mf = i_qp % 6 + lsl w2, w2, #\mf_size +.ifc \dc,no + add x1, x1, w2, sxtw // dequant_mf[i_mf] +.else + ldr x1, [x1, w2, sxtw] // dequant_mf[i_mf][0][0] +.endif + subs w3, w3, #\offset // 6 for 8x8 +.endm + +// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp ) +.macro DEQUANT size bits +function x264_dequant_\size\()_neon, export=1 + DEQUANT_START \bits+2, \bits +.ifc \size, 8x8 + mov w2, #4 +.endif + b.lt dequant_\size\()_rshift + + dup v31.8h, w3 +dequant_\size\()_lshift_loop: +.ifc \size, 8x8 + subs w2, w2, #1 +.endif + ld1 {v16.4s}, [x1], #16 + ld1 {v17.4s}, [x1], #16 + sqxtn v2.4h, v16.4s + ld1 {v18.4s}, [x1], #16 + sqxtn2 v2.8h, v17.4s + ld1 {v19.4s}, [x1], #16 + sqxtn v3.4h, v18.4s + ld1 {v0.8h,v1.8h}, [x0] + sqxtn2 v3.8h, v19.4s + mul v0.8h, v0.8h, v2.8h + mul v1.8h, v1.8h, v3.8h + sshl v0.8h, v0.8h, v31.8h + sshl v1.8h, v1.8h, v31.8h + st1 {v0.8h,v1.8h}, [x0], #32 +.ifc \size, 8x8 + b.gt dequant_\size\()_lshift_loop +.endif + ret + +dequant_\size\()_rshift: + dup v31.4s, w3 + neg w3, w3 + mov w5, #1 + sub w3, w3, #1 + lsl w5, w5, w3 + +.ifc \size, 8x8 +dequant_\size\()_rshift_loop: + subs w2, w2, #1 +.endif + ld1 {v16.4s}, [x1], #16 + ld1 {v17.4s}, [x1], #16 + sqxtn v2.4h, v16.4s + ld1 {v18.4s}, [x1], #16 + dup v16.4s, w5 + sqxtn2 v2.8h, v17.4s + ld1 {v19.4s}, [x1], #16 + dup v17.4s, w5 + sqxtn v3.4h, v18.4s + ld1 {v0.8h,v1.8h}, [x0] + dup v18.4s, w5 + sqxtn2 v3.8h, v19.4s + dup v19.4s, w5 + + smlal v16.4s, v0.4h, v2.4h + smlal2 v17.4s, v0.8h, v2.8h + smlal v18.4s, v1.4h, v3.4h + smlal2 v19.4s, v1.8h, v3.8h + sshl v16.4s, v16.4s, v31.4s + sshl v17.4s, v17.4s, v31.4s + sshl v18.4s, v18.4s, v31.4s + sshl v19.4s, v19.4s, v31.4s + + sqxtn v0.4h, v16.4s + sqxtn2 v0.8h, v17.4s + sqxtn v1.4h, v18.4s + sqxtn2 v1.8h, v19.4s + st1 {v0.8h,v1.8h}, [x0], #32 +.ifc \size, 8x8 + b.gt dequant_\size\()_rshift_loop +.endif + ret +endfunc +.endm + +DEQUANT 4x4, 4 +DEQUANT 8x8, 6 + +// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp ) +function x264_dequant_4x4_dc_neon, export=1 + DEQUANT_START 6, 6, yes + b.lt dequant_4x4_dc_rshift + + lsl w1, w1, w3 + dup v2.8h, w1 + ld1 {v0.8h,v1.8h}, [x0] + + mul v0.8h, v0.8h, v2.8h + mul v1.8h, v1.8h, v2.8h + st1 {v0.8h,v1.8h}, [x0] + ret + +dequant_4x4_dc_rshift: + dup v4.8h, w1 + dup v3.4s, w3 + neg w3, w3 + mov w5, #1 + sub w3, w3, #1 + lsl w5, w5, w3 + + dup v16.4s, w5 + dup v17.4s, w5 + ld1 {v0.8h,v1.8h}, [x0] + dup v18.4s, w5 + dup v19.4s, w5 + + smlal v16.4s, 
v0.4h, v4.4h + smlal2 v17.4s, v0.8h, v4.8h + smlal v18.4s, v1.4h, v4.4h + smlal2 v19.4s, v1.8h, v4.8h + sshl v16.4s, v16.4s, v3.4s + sshl v17.4s, v17.4s, v3.4s + sshl v18.4s, v18.4s, v3.4s + sshl v19.4s, v19.4s, v3.4s + + sqxtn v0.4h, v16.4s + sqxtn2 v0.8h, v17.4s + sqxtn v1.4h, v18.4s + sqxtn2 v1.8h, v19.4s + st1 {v0.8h,v1.8h}, [x0] + ret +endfunc + +// int coeff_last( int16_t *l ) +function x264_coeff_last4_aarch64, export=1 + ldr x2, [x0] + mov w4, #3 + clz x0, x2 + sub w0, w4, w0, lsr #4 + ret +endfunc + +function x264_coeff_last8_aarch64, export=1 + ldr x3, [x0, #8] + mov w4, #7 + clz x2, x3 + cmp w2, #64 + b.ne 1f + ldr x3, [x0] + sub w4, w4, #4 + clz x2, x3 +1: + sub w0, w4, w2, lsr #4 + ret +endfunc + +.macro COEFF_LAST_1x size +function x264_coeff_last\size\()_neon, export=1 +.if \size == 15 + sub x0, x0, #2 +.endif + ld1 {v0.8h,v1.8h}, [x0] + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h + cmtst v0.16b, v0.16b, v0.16b + shrn v0.8b, v0.8h, #4 + fmov x1, d0 + mov w3, #\size - 1 + clz x2, x1 + sub w0, w3, w2, lsr #2 + ret +endfunc +.endm + +COEFF_LAST_1x 15 +COEFF_LAST_1x 16 + +function x264_coeff_last64_neon, export=1 + ld1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64 + movi v31.8h, #8 + movi v30.8h, #1 + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h + ld1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64 + uqxtn v1.8b, v2.8h + uqxtn2 v1.16b, v3.8h + uqxtn v2.8b, v4.8h + uqxtn2 v2.16b, v5.8h + uqxtn v3.8b, v6.8h + uqxtn2 v3.16b, v7.8h + + cmtst v0.16b, v0.16b, v0.16b + cmtst v1.16b, v1.16b, v1.16b + cmtst v2.16b, v2.16b, v2.16b + cmtst v3.16b, v3.16b, v3.16b + + shrn v0.8b, v0.8h, #4 + shrn2 v0.16b, v1.8h, #4 + shrn v1.8b, v2.8h, #4 + shrn2 v1.16b, v3.8h, #4 + + clz v0.4s, v0.4s + clz v1.4s, v1.4s + + shrn v0.4h, v0.4s, #2 + shrn2 v0.8h, v1.4s, #2 + + sub v0.8h, v31.8h, v0.8h + sshl v0.8h, v30.8h, v0.8h + shrn v0.8b, v0.8h, #1 + + fmov x2, d0 + mov w3, #63 + clz x2, x2 + sub w0, w3, w2 + ret +endfunc
View file
x264-snapshot-20141104-2245.tar.bz2/common/aarch64/quant.h
Added
@@ -0,0 +1,47 @@ +/***************************************************************************** + * quant.h: arm quantization and level-run + ***************************************************************************** + * Copyright (C) 2005-2014 x264 project + * + * Authors: David Conrad <lessen42@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_AARCH64_QUANT_H +#define X264_AARCH64_QUANT_H + +int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias ); + +int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias ); +int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias ); +int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ); +int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ); +int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ); + +void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp ); + +int x264_coeff_last4_aarch64( int16_t * ); +int x264_coeff_last8_aarch64( int16_t * ); +int x264_coeff_last15_neon( int16_t * ); +int x264_coeff_last16_neon( int16_t * ); +int x264_coeff_last64_neon( int16_t * ); + +#endif
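These declarations follow the usual x264 quantization contract. As a hedged scalar sketch (the function name and integer widths below are illustrative, not the project's C reference), each quant_* entry point applies sign(coef) * ((|coef| + bias) * mf >> 16) per coefficient and reports whether any nonzero level remains:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative per-coefficient quantization rule behind the quant_* functions. */
static int quant_4x4_scalar( int16_t dct[16], const uint16_t mf[16],
                             const uint16_t bias[16] )
{
    uint32_t nz = 0;
    for( int i = 0; i < 16; i++ )
    {
        int coef = dct[i];
        uint32_t level = ( (uint32_t)( abs( coef ) + bias[i] ) * mf[i] ) >> 16;
        dct[i] = coef < 0 ? -(int)level : (int)level;
        nz |= level;
    }
    return !!nz;
}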
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/asm.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/asm.S
Changed
@@ -28,6 +28,16 @@ .syntax unified +#if HAVE_NEON + .arch armv7-a +#elif HAVE_ARMV6T2 + .arch armv6t2 +#elif HAVE_ARMV6 + .arch armv6 +#endif + +.fpu neon + #ifdef PREFIX # define EXTERN_ASM _ #else @@ -40,32 +50,49 @@ # define ELF @ #endif - .macro require8, val=1 +#if HAVE_AS_FUNC +# define FUNC +#else +# define FUNC @ +#endif + +.macro require8, val=1 ELF .eabi_attribute 24, \val - .endm +.endm - .macro preserve8, val=1 +.macro preserve8, val=1 ELF .eabi_attribute 25, \val - .endm +.endm - .macro function name - .global EXTERN_ASM\name +.macro function name, export=1 + .macro endfunc +ELF .size \name, . - \name +FUNC .endfunc + .purgem endfunc + .endm .align 2 +.if \export == 1 + .global EXTERN_ASM\name +ELF .hidden EXTERN_ASM\name +ELF .type EXTERN_ASM\name, %function +FUNC .func EXTERN_ASM\name EXTERN_ASM\name: +.else ELF .hidden \name ELF .type \name, %function - .func \name +FUNC .func \name \name: - .endm +.endif +.endm - .macro movrel rd, val +.macro movrel rd, val #if HAVE_ARMV6T2 && !defined(PIC) movw \rd, #:lower16:\val movt \rd, #:upper16:\val #else ldr \rd, =\val #endif - .endm +.endm .macro movconst rd, val #if HAVE_ARMV6T2 @@ -78,6 +105,10 @@ #endif .endm +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) + #define FENC_STRIDE 16 #define FDEC_STRIDE 32
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/cpu-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/cpu-a.S
Changed
@@ -25,7 +25,6 @@ #include "asm.S" -.fpu neon .align 2 // done in gas because .fpu neon overrides the refusal to assemble @@ -33,12 +32,12 @@ function x264_cpu_neon_test vadd.i16 q0, q0, q0 bx lr -.endfunc +endfunc // return: 0 on success // 1 if counters were already enabled // 9 if lo-res counters were already enabled -function x264_cpu_enable_armv7_counter +function x264_cpu_enable_armv7_counter, export=0 mrc p15, 0, r2, c9, c12, 0 // read PMNC ands r0, r2, #1 andne r0, r2, #9 @@ -49,14 +48,14 @@ mov r2, #1 << 31 // enable cycle counter mcr p15, 0, r2, c9, c12, 1 // write CNTENS bx lr -.endfunc +endfunc -function x264_cpu_disable_armv7_counter +function x264_cpu_disable_armv7_counter, export=0 mrc p15, 0, r0, c9, c12, 0 // read PMNC bic r0, r0, #1 // disable counters mcr p15, 0, r0, c9, c12, 0 // write PMNC bx lr -.endfunc +endfunc .macro READ_TIME r @@ -106,4 +105,4 @@ cmp r0, #10 movgt r0, #0 pop {r4-r6,pc} -.endfunc +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/dct-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/dct-a.S
Changed
@@ -25,8 +25,6 @@ #include "asm.S" -.fpu neon - .section .rodata .align 4 @@ -82,7 +80,7 @@ vrhadd.s16 d3, d6, d7 vst1.64 {d0-d3}, [r0,:128] bx lr -.endfunc +endfunc function x264_idct4x4dc_neon vld1.64 {d0-d3}, [r0,:128] @@ -94,7 +92,7 @@ HADAMARD 2, sumsub, d3, d2, d6, d7 vst1.64 {d0-d3}, [r0,:128] bx lr -.endfunc +endfunc .macro DCT_1D d0 d1 d2 d3 d4 d5 d6 d7 @@ -129,9 +127,9 @@ DCT_1D d4, d5, d6, d7, d0, d1, d2, d3 vst1.64 {d4-d7}, [r0,:128] bx lr -.endfunc +endfunc -function x264_sub8x4_dct_neon +function x264_sub8x4_dct_neon, export=0 vld1.64 {d0}, [r1,:64], r3 vld1.64 {d1}, [r2,:64], ip vsubl.u8 q8, d0, d1 @@ -165,7 +163,7 @@ vst1.64 {d4-d5}, [r0,:128]! vst1.64 {d6-d7}, [r0,:128]! bx lr -.endfunc +endfunc function x264_sub8x8_dct_neon push {lr} @@ -174,7 +172,7 @@ bl x264_sub8x4_dct_neon pop {lr} b x264_sub8x4_dct_neon -.endfunc +endfunc function x264_sub16x16_dct_neon push {lr} @@ -195,7 +193,7 @@ bl x264_sub8x4_dct_neon pop {lr} b x264_sub8x4_dct_neon -.endfunc +endfunc .macro DCT8_1D type @@ -279,22 +277,22 @@ vst1.64 {d24-d27}, [r0,:128]! vst1.64 {d28-d31}, [r0,:128]! bx lr -.endfunc +endfunc function x264_sub16x16_dct8_neon push {lr} - bl x264_sub8x8_dct8_neon + bl X(x264_sub8x8_dct8_neon) sub r1, r1, #FENC_STRIDE*8 - 8 sub r2, r2, #FDEC_STRIDE*8 - 8 - bl x264_sub8x8_dct8_neon + bl X(x264_sub8x8_dct8_neon) sub r1, r1, #8 sub r2, r2, #8 - bl x264_sub8x8_dct8_neon + bl X(x264_sub8x8_dct8_neon) pop {lr} sub r1, r1, #FENC_STRIDE*8 - 8 sub r2, r2, #FDEC_STRIDE*8 - 8 - b x264_sub8x8_dct8_neon -.endfunc + b X(x264_sub8x8_dct8_neon) +endfunc // First part of IDCT (minus final SUMSUB_BA) @@ -336,9 +334,9 @@ vst1.32 {d2[1]}, [r0,:32], r2 vst1.32 {d2[0]}, [r0,:32], r2 bx lr -.endfunc +endfunc -function x264_add8x4_idct_neon +function x264_add8x4_idct_neon, export=0 vld1.64 {d0-d3}, [r1,:128]! IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3 vld1.64 {d4-d7}, [r1,:128]! @@ -376,7 +374,7 @@ vst1.32 {d2}, [r0,:64], r2 vst1.32 {d3}, [r0,:64], r2 bx lr -.endfunc +endfunc function x264_add8x8_idct_neon mov r2, #FDEC_STRIDE @@ -384,7 +382,7 @@ bl x264_add8x4_idct_neon mov lr, ip b x264_add8x4_idct_neon -.endfunc +endfunc function x264_add16x16_idct_neon mov r2, #FDEC_STRIDE @@ -401,7 +399,7 @@ bl x264_add8x4_idct_neon mov lr, ip b x264_add8x4_idct_neon -.endfunc +endfunc .macro IDCT8_1D type @@ -498,19 +496,19 @@ vst1.64 {d6}, [r0,:64], r2 vst1.64 {d7}, [r0,:64], r2 bx lr -.endfunc +endfunc function x264_add16x16_idct8_neon mov ip, lr - bl x264_add8x8_idct8_neon + bl X(x264_add8x8_idct8_neon) sub r0, r0, #8*FDEC_STRIDE-8 - bl x264_add8x8_idct8_neon + bl X(x264_add8x8_idct8_neon) sub r0, r0, #8 - bl x264_add8x8_idct8_neon + bl X(x264_add8x8_idct8_neon) sub r0, r0, #8*FDEC_STRIDE-8 mov lr, ip - b x264_add8x8_idct8_neon -.endfunc + b X(x264_add8x8_idct8_neon) +endfunc function x264_add8x8_idct_dc_neon @@ -562,7 +560,7 @@ vst1.64 {d6}, [r0,:64], r2 vst1.64 {d7}, [r0,:64], r2 bx lr -.endfunc +endfunc .macro ADD16x4_IDCT_DC dc vld1.64 {d16-d17}, [r0,:128], r3 @@ -610,7 +608,7 @@ ADD16x4_IDCT_DC d2 ADD16x4_IDCT_DC d3 bx lr -.endfunc +endfunc function x264_sub8x8_dct_dc_neon mov r3, #FENC_STRIDE @@ -658,7 +656,7 @@ vpadd.s16 d0, d0, d1 vst1.64 {d0}, [r0,:64] bx lr -.endfunc +endfunc function x264_zigzag_scan_4x4_frame_neon @@ -671,4 +669,4 @@ vtbl.8 d7, {d2-d3}, d19 vst1.64 {d4-d7}, [r0,:128] bx lr -.endfunc +endfunc
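The DCT_1D and IDCT_1D macros in this file vectorize the standard H.264 4x4 core transform butterflies. A scalar sketch of the forward 1-D pass (illustrative, mirroring the reference transform in spirit) is:

#include <stdint.h>

/* Illustrative 1-D pass of the H.264 4x4 forward core transform applied to
 * each row and column of the residual; the transform matrix is
 * [1 1 1 1; 2 1 -1 -2; 1 -1 -1 1; 1 -2 2 -1]. */
static void dct4x4_1d( int16_t d[4] )
{
    int s03 = d[0] + d[3], d03 = d[0] - d[3];
    int s12 = d[1] + d[2], d12 = d[1] - d[2];
    d[0] = s03 + s12;
    d[2] = s03 - s12;
    d[1] = 2*d03 + d12;
    d[3] = d03 - 2*d12;
}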
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/deblock-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/deblock-a.S
Changed
@@ -25,8 +25,6 @@ #include "asm.S" -.fpu neon - .macro h264_loop_filter_start ldr ip, [sp] ldr ip, [ip] @@ -142,7 +140,7 @@ align_pop_regs bx lr -.endfunc +endfunc function x264_deblock_h_luma_neon h264_loop_filter_start @@ -194,7 +192,7 @@ align_pop_regs bx lr -.endfunc +endfunc .macro h264_loop_filter_chroma vdup.8 q11, r2 // alpha @@ -255,7 +253,7 @@ vst2.8 {d0, d1}, [r0,:128], r1 bx lr -.endfunc +endfunc function x264_deblock_h_chroma_neon h264_loop_filter_start @@ -303,4 +301,110 @@ vst1.8 {d3}, [r0], r1 bx lr -.endfunc +endfunc + +function x264_deblock_strength_neon + ldr ip, [sp] + vmov.i8 q8, #0 + lsl ip, ip, #8 + add r3, r3, #32 + sub ip, ip, #(1<<8)-3 + vmov.i8 q9, #0 + vdup.16 q10, ip + ldr ip, [sp, #4] + +lists: + @ load bytes ref + vld1.8 {d31}, [r1]! + add r2, r2, #16 + vld1.8 {q1}, [r1]! + vmov.i8 q0, #0 + vld1.8 {q2}, [r1]! + vext.8 q3, q0, q1, #15 + vext.8 q0, q0, q2, #15 + vuzp.32 q1, q2 + vuzp.32 q3, q0 + vext.8 q1, q15, q2, #12 + + veor q0, q0, q2 + veor q1, q1, q2 + vorr q8, q8, q0 + vorr q9, q9, q1 + + vld1.16 {q11}, [r2,:128]! @ mv + 0x10 + vld1.16 {q3}, [r2,:128]! @ mv + 0x20 + vld1.16 {q12}, [r2,:128]! @ mv + 0x30 + vld1.16 {q2}, [r2,:128]! @ mv + 0x40 + vld1.16 {q13}, [r2,:128]! @ mv + 0x50 + vext.8 q3, q3, q12, #12 + vext.8 q2, q2, q13, #12 + vabd.s16 q0, q12, q3 + vld1.16 {q3}, [r2,:128]! @ mv + 0x60 + vabd.s16 q1, q13, q2 + vld1.16 {q14}, [r2,:128]! @ mv + 0x70 + vqmovn.u16 d0, q0 + vld1.16 {q2}, [r2,:128]! @ mv + 0x80 + vld1.16 {q15}, [r2,:128]! @ mv + 0x90 + vqmovn.u16 d1, q1 + vext.8 q3, q3, q14, #12 + vext.8 q2, q2, q15, #12 + vabd.s16 q3, q14, q3 + vabd.s16 q2, q15, q2 + vqmovn.u16 d2, q3 + vqmovn.u16 d3, q2 + + vqsub.u8 q0, q0, q10 + vqsub.u8 q1, q1, q10 + vqmovn.u16 d0, q0 + vqmovn.u16 d1, q1 + + vabd.s16 q1, q12, q13 + vorr q8, q8, q0 + + vabd.s16 q0, q11, q12 + vabd.s16 q2, q13, q14 + vabd.s16 q3, q14, q15 + vqmovn.u16 d0, q0 + vqmovn.u16 d1, q1 + vqmovn.u16 d2, q2 + vqmovn.u16 d3, q3 + + vqsub.u8 q0, q0, q10 + vqsub.u8 q1, q1, q10 + vqmovn.u16 d0, q0 + vqmovn.u16 d1, q1 + subs ip, ip, #1 + vorr q9, q9, q0 + beq lists + + mov ip, #-32 + @ load bytes nnz + vld1.8 {d31}, [r0]! + vld1.8 {q1}, [r0]! + vmov.i8 q0, #0 + vld1.8 {q2}, [r0] + vext.8 q3, q0, q1, #15 + vext.8 q0, q0, q2, #15 + vuzp.32 q1, q2 + vuzp.32 q3, q0 + vext.8 q1, q15, q2, #12 + + vorr q0, q0, q2 + vorr q1, q1, q2 + vmov.u8 q10, #1 + vmin.u8 q0, q0, q10 + vmin.u8 q1, q1, q10 + vmin.u8 q8, q8, q10 @ mv ? 1 : 0 + vmin.u8 q9, q9, q10 + vadd.u8 q0, q0, q0 @ nnz ? 2 : 0 + vadd.u8 q1, q1, q1 + vmax.u8 q8, q8, q0 + vmax.u8 q9, q9, q1 + vzip.16 d16, d17 + vst1.8 {q9}, [r3,:128], ip @ bs[1] + vtrn.8 d16, d17 + vtrn.32 d16, d17 + + vst1.8 {q8}, [r3,:128] @ bs[0] + bx lr +endfunc
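The newly added x264_deblock_strength_neon computes per-edge boundary strengths from the cached nnz, ref and mv arrays. A heavily simplified scalar sketch of the rule it vectorizes follows (one edge only; parameter names are illustrative, and the real function fills bs[2][8][4] for both directions at once):

#include <stdlib.h>
#include <stdint.h>

/* Illustrative per-edge rule: 2 if either side has residual coefficients,
 * 1 if the references differ or any MV component differs by 4 or more
 * quarter-pels (mvy_limit for the vertical component), else 0. */
static uint8_t edge_strength( int nnz_p, int nnz_q, int ref_differs,
                              int mvx_p, int mvy_p, int mvx_q, int mvy_q,
                              int mvy_limit )
{
    if( nnz_p || nnz_q )
        return 2;
    if( ref_differs || abs( mvx_p - mvx_q ) >= 4 || abs( mvy_p - mvy_q ) >= mvy_limit )
        return 1;
    return 0;
}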
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/mc-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/mc-a.S
Changed
@@ -27,7 +27,6 @@ #include "asm.S" -.fpu neon .text // note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8 @@ -50,7 +49,7 @@ pld [r3, r1, lsl #1] pld [r3, r2] bx lr -.endfunc +endfunc // void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y, // uint8_t *pix_uv, intptr_t stride_uv, int mb_x ) @@ -76,7 +75,7 @@ pld [ip] pld [ip, r3] pop {pc} -.endfunc +endfunc // void *x264_memcpy_aligned( void *dst, const void *src, size_t n ) @@ -85,10 +84,10 @@ movrel ip, memcpy_table and r3, r3, #0xc ldr pc, [ip, r3] -.endfunc +endfunc .macro MEMCPY_ALIGNED srcalign dstalign -function memcpy_aligned_\dstalign\()_\srcalign\()_neon +function memcpy_aligned_\dstalign\()_\srcalign\()_neon, export=0 mov r3, r0 .if \srcalign == 8 && \dstalign == 8 sub r2, #16 @@ -127,7 +126,7 @@ vst1.64 {d0}, [r3,:64]! .endif bx lr -.endfunc +endfunc .endm MEMCPY_ALIGNED 16, 16 @@ -156,7 +155,7 @@ .endr bgt memzero_loop bx lr -.endfunc +endfunc // void pixel_avg( uint8_t *dst, intptr_t dst_stride, @@ -175,12 +174,13 @@ cmp ip, #0 bge x264_pixel_avg_weight_w\w\()_add_add_neon b x264_pixel_avg_weight_w\w\()_sub_add_neon // weight < 0 -.endfunc +endfunc .endm AVGH 4, 2 AVGH 4, 4 AVGH 4, 8 +AVGH 4, 16 AVGH 8, 4 AVGH 8, 8 AVGH 8, 16 @@ -238,7 +238,7 @@ .endm .macro AVG_WEIGHT ext -function x264_pixel_avg_weight_w4_\ext\()_neon +function x264_pixel_avg_weight_w4_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #2 @@ -252,9 +252,9 @@ vst1.32 {d1[0]}, [r0,:32], r1 bgt 1b pop {r4-r6,pc} -.endfunc +endfunc -function x264_pixel_avg_weight_w8_\ext\()_neon +function x264_pixel_avg_weight_w8_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #4 @@ -276,9 +276,9 @@ vst1.64 {d3}, [r0,:64], r1 bgt 1b pop {r4-r6,pc} -.endfunc +endfunc -function x264_pixel_avg_weight_w16_\ext\()_neon +function x264_pixel_avg_weight_w16_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #2 @@ -296,14 +296,14 @@ vst1.64 {d2-d3}, [r0,:128], r1 bgt 1b pop {r4-r6,pc} -.endfunc +endfunc .endm AVG_WEIGHT add_add AVG_WEIGHT add_sub AVG_WEIGHT sub_add -function x264_pixel_avg_w4_neon +function x264_pixel_avg_w4_neon, export=0 subs lr, lr, #2 vld1.32 {d0[]}, [r2], r3 vld1.32 {d2[]}, [r4], r5 @@ -315,9 +315,9 @@ vst1.32 {d1[0]}, [r0,:32], r1 bgt x264_pixel_avg_w4_neon pop {r4-r6,pc} -.endfunc +endfunc -function x264_pixel_avg_w8_neon +function x264_pixel_avg_w8_neon, export=0 subs lr, lr, #4 vld1.64 {d0}, [r2], r3 vld1.64 {d2}, [r4], r5 @@ -337,9 +337,9 @@ vst1.64 {d3}, [r0,:64], r1 bgt x264_pixel_avg_w8_neon pop {r4-r6,pc} -.endfunc +endfunc -function x264_pixel_avg_w16_neon +function x264_pixel_avg_w16_neon, export=0 subs lr, lr, #4 vld1.64 {d0-d1}, [r2], r3 vld1.64 {d2-d3}, [r4], r5 @@ -359,7 +359,7 @@ vst1.64 {d6-d7}, [r0,:128], r1 bgt x264_pixel_avg_w16_neon pop {r4-r6,pc} -.endfunc +endfunc function x264_pixel_avg2_w4_neon @@ -378,7 +378,7 @@ vst1.32 {d1[0]}, [r0,:32], r1 bgt avg2_w4_loop pop {pc} -.endfunc +endfunc function x264_pixel_avg2_w8_neon ldr ip, [sp, #4] @@ -396,7 +396,7 @@ vst1.64 {d1}, [r0,:64], r1 bgt avg2_w8_loop pop {pc} -.endfunc +endfunc function x264_pixel_avg2_w16_neon ldr ip, [sp, #4] @@ -414,7 +414,7 @@ vst1.64 {d4-d5}, [r0,:128], r1 bgt avg2_w16_loop pop {pc} -.endfunc +endfunc function x264_pixel_avg2_w20_neon ldr ip, [sp, #4] @@ -437,7 +437,7 @@ vst1.32 {d6[0]}, [r0,:32], r1 bgt avg2_w20_loop pop {pc} -.endfunc +endfunc .macro weight_prologue type @@ -448,7 +448,7 @@ ldr lr, [r4, #32] // denom .endif ldrd r4, r5, [r4, #32+4] // scale, offset - vdup.16 q0, r4 + 
vdup.8 d0, r4 vdup.16 q1, r5 .ifc \type, full rsb lr, lr, #0 @@ -464,19 +464,13 @@ weight20_loop: subs ip, #2 vld1.8 {d17-d19}, [r2], r3 - vmovl.u8 q10, d17 - vmovl.u8 q11, d18 - vmovl.u8 q14, d19 + vmull.u8 q10, d17, d0 + vmull.u8 q11, d18, d0 vld1.8 {d16-d18}, [r2], r3 - vmovl.u8 q12, d16 - vmovl.u8 q13, d17 - vmovl.u8 q15, d18 - vmul.s16 q10, q10, q0 - vmul.s16 q11, q11, q0 - vmul.s16 q12, q12, q0 - vmul.s16 q13, q13, q0 - vmul.s16 d28, d28, d0 - vmul.s16 d29, d30, d0 + vmull.u8 q12, d16, d0 + vmull.u8 q13, d17, d0 + vtrn.32 d19, d18 + vmull.u8 q14, d19, d0 vrshl.s16 q10, q10, q2 vrshl.s16 q11, q11, q2 vrshl.s16 q12, q12, q2 @@ -498,7 +492,7 @@ vst1.32 {d20[1]}, [r0,:32], r1 bgt weight20_loop pop {r4-r5,pc} -.endfunc +endfunc function x264_mc_weight_w16_neon weight_prologue full @@ -506,14 +500,10 @@ subs ip, #2 vld1.8 {d16-d17}, [r2], r3 vld1.8 {d18-d19}, [r2], r3 - vmovl.u8 q10, d16 - vmovl.u8 q11, d17 - vmovl.u8 q12, d18 - vmovl.u8 q13, d19 - vmul.s16 q10, q10, q0 - vmul.s16 q11, q11, q0 - vmul.s16 q12, q12, q0 - vmul.s16 q13, q13, q0 + vmull.u8 q10, d16, d0 + vmull.u8 q11, d17, d0 + vmull.u8 q12, d18, d0 + vmull.u8 q13, d19, d0 vrshl.s16 q10, q10, q2 vrshl.s16 q11, q11, q2 vrshl.s16 q12, q12, q2 @@ -530,7 +520,7 @@ vst1.8 {d18-d19}, [r0,:128], r1 bgt weight16_loop pop {r4-r5,pc} -.endfunc +endfunc function x264_mc_weight_w8_neon weight_prologue full @@ -538,10 +528,8 @@ subs ip, #2 vld1.8 {d16}, [r2], r3 vld1.8 {d18}, [r2], r3 - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmul.s16 q8, q8, q0 - vmul.s16 q9, q9, q0 + vmull.u8 q8, d16, d0 + vmull.u8 q9, d18, d0 vrshl.s16 q8, q8, q2 vrshl.s16 q9, q9, q2 vadd.s16 q8, q8, q1 @@ -552,51 +540,42 @@ vst1.8 {d18}, [r0,:64], r1 bgt weight8_loop pop {r4-r5,pc} -.endfunc +endfunc function x264_mc_weight_w4_neon weight_prologue full weight4_loop: subs ip, #2 - vld1.32 {d16[]}, [r2], r3 - vld1.32 {d18[]}, [r2], r3 - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmul.s16 d16, d16, d0 - vmul.s16 d17, d18, d0 + vld1.32 {d16[0]}, [r2], r3 + vld1.32 {d16[1]}, [r2], r3 + vmull.u8 q8, d16, d0 vrshl.s16 q8, q8, q2 vadd.s16 q8, q8, q1 vqmovun.s16 d16, q8 - vst1.32 {d16[0]}, [r0,:32], r1 - vst1.32 {d16[1]}, [r0,:32], r1 + vst1.32 {d16[0]}, [r0], r1 + vst1.32 {d16[1]}, [r0], r1 bgt weight4_loop pop {r4-r5,pc} -.endfunc +endfunc function x264_mc_weight_w20_nodenom_neon weight_prologue nodenom sub r1, #16 weight20_nodenom_loop: subs ip, #2 - vld1.8 {d17-d19}, [r2], r3 - vmovl.u8 q10, d17 - vmovl.u8 q11, d18 - vmovl.u8 q14, d19 - vld1.8 {d16-d18}, [r2], r3 - vmovl.u8 q12, d16 - vmovl.u8 q13, d17 - vmovl.u8 q15, d18 + vld1.8 {d26-d28}, [r2], r3 vmov q8, q1 vmov q9, q1 - vmla.s16 q8, q10, q0 - vmla.s16 q9, q11, q0 + vld1.8 {d29-d31}, [r2], r3 vmov q10, q1 vmov q11, q1 - vmla.s16 q10, q12, q0 - vmla.s16 q11, q13, q0 vmov q12, q1 - vmla.s16 d24, d28, d0 - vmla.s16 d25, d30, d0 + vtrn.32 d28, d31 + vmlal.u8 q8, d26, d0 + vmlal.u8 q9, d27, d0 + vmlal.u8 q10, d29, d0 + vmlal.u8 q11, d30, d0 + vmlal.u8 q12, d28, d0 vqmovun.s16 d16, q8 vqmovun.s16 d17, q9 vqmovun.s16 d18, q10 @@ -608,7 +587,7 @@ vst1.32 {d20[1]}, [r0,:32], r1 bgt weight20_nodenom_loop pop {r4-r5,pc} -.endfunc +endfunc function x264_mc_weight_w16_nodenom_neon weight_prologue nodenom @@ -616,27 +595,23 @@ subs ip, #2 vld1.8 {d16-d17}, [r2], r3 vld1.8 {d18-d19}, [r2], r3 - vmovl.u8 q12, d16 - vmovl.u8 q13, d17 - vmovl.u8 q14, d18 - vmovl.u8 q15, d19 - vmov q8, q1 - vmov q9, q1 - vmov q10, q1 - vmov q11, q1 - vmla.s16 q8, q12, q0 - vmla.s16 q9, q13, q0 - vmla.s16 q10, q14, q0 - vmla.s16 q11, q15, q0 - vqmovun.s16 d16, q8 
- vqmovun.s16 d17, q9 - vqmovun.s16 d18, q10 - vqmovun.s16 d19, q11 + vmov q12, q1 + vmov q13, q1 + vmov q14, q1 + vmov q15, q1 + vmlal.u8 q12, d16, d0 + vmlal.u8 q13, d17, d0 + vmlal.u8 q14, d18, d0 + vmlal.u8 q15, d19, d0 + vqmovun.s16 d16, q12 + vqmovun.s16 d17, q13 + vqmovun.s16 d18, q14 + vqmovun.s16 d19, q15 vst1.8 {d16-d17}, [r0,:128], r1 vst1.8 {d18-d19}, [r0,:128], r1 bgt weight16_nodenom_loop pop {r4-r5,pc} -.endfunc +endfunc function x264_mc_weight_w8_nodenom_neon weight_prologue nodenom @@ -644,37 +619,32 @@ subs ip, #2 vld1.8 {d16}, [r2], r3 vld1.8 {d18}, [r2], r3 - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 vmov q10, q1 vmov q11, q1 - vmla.s16 q10, q8, q0 - vmla.s16 q11, q9, q0 + vmlal.u8 q10, d16, d0 + vmlal.u8 q11, d18, d0 vqmovun.s16 d16, q10 vqmovun.s16 d17, q11 vst1.8 {d16}, [r0,:64], r1 vst1.8 {d17}, [r0,:64], r1 bgt weight8_nodenom_loop pop {r4-r5,pc} -.endfunc +endfunc function x264_mc_weight_w4_nodenom_neon weight_prologue nodenom weight4_nodenom_loop: subs ip, #2 - vld1.32 {d16[]}, [r2], r3 - vld1.32 {d18[]}, [r2], r3 - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 + vld1.32 {d16[0]}, [r2], r3 + vld1.32 {d16[1]}, [r2], r3 vmov q10, q1 - vmla.s16 d20, d16, d0 - vmla.s16 d21, d18, d0 + vmlal.u8 q10, d16, d0 vqmovun.s16 d16, q10 - vst1.32 {d16[0]}, [r0,:32], r1 - vst1.32 {d16[1]}, [r0,:32], r1 + vst1.32 {d16[0]}, [r0], r1 + vst1.32 {d16[1]}, [r0], r1 bgt weight4_nodenom_loop pop {r4-r5,pc} -.endfunc +endfunc .macro weight_simple_prologue push {lr} @@ -698,7 +668,7 @@ vst1.8 {d19-d21}, [r0,:64], r1 bgt weight20_\name\()_loop pop {pc} -.endfunc +endfunc function x264_mc_weight_w16_\name\()_neon weight_simple_prologue @@ -712,7 +682,7 @@ vst1.8 {d18-d19}, [r0,:128], r1 bgt weight16_\name\()_loop pop {pc} -.endfunc +endfunc function x264_mc_weight_w8_\name\()_neon weight_simple_prologue @@ -725,7 +695,7 @@ vst1.8 {d17}, [r0,:64], r1 bgt weight8_\name\()_loop pop {pc} -.endfunc +endfunc function x264_mc_weight_w4_\name\()_neon weight_simple_prologue @@ -734,11 +704,11 @@ vld1.32 {d16[]}, [r2], r3 vld1.32 {d17[]}, [r2], r3 \op q8, q8, q1 - vst1.32 {d16[0]}, [r0,:32], r1 - vst1.32 {d17[0]}, [r0,:32], r1 + vst1.32 {d16[0]}, [r0], r1 + vst1.32 {d17[0]}, [r0], r1 bgt weight4_\name\()_loop pop {pc} -.endfunc +endfunc .endm weight_simple offsetadd, vqadd.u8 @@ -760,7 +730,7 @@ vst1.32 {d3[0]}, [r0,:32], r1 bgt copy_w4_loop bx lr -.endfunc +endfunc function x264_mc_copy_w8_neon ldr ip, [sp] @@ -776,7 +746,7 @@ vst1.32 {d3}, [r0,:64], r1 bgt copy_w8_loop bx lr -.endfunc +endfunc function x264_mc_copy_w16_neon ldr ip, [sp] @@ -792,7 +762,7 @@ vst1.32 {d6-d7}, [r0,:128], r1 bgt copy_w16_loop bx lr -.endfunc +endfunc function x264_mc_copy_w16_aligned_neon ldr ip, [sp] @@ -808,7 +778,7 @@ vst1.32 {d6-d7}, [r0,:128], r1 bgt copy_w16_aligned_loop bx lr -.endfunc +endfunc // void x264_mc_chroma_neon( uint8_t *dst, intptr_t i_dst_stride, @@ -1158,7 +1128,7 @@ vpop {d8-d11} pop {r4-r8, pc} -.endfunc +endfunc // hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, int width ) @@ -1199,7 +1169,7 @@ vst1.64 {d0-d1}, [r0,:128]! bgt filter_v_loop pop {pc} -.endfunc +endfunc // hpel_filter_c( uint8_t *dst, int16_t *buf, int width ); function x264_hpel_filter_c_neon @@ -1284,7 +1254,7 @@ vst1.64 {d30-d31}, [r0,:128]! bgt filter_c_loop bx lr -.endfunc +endfunc // hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); function x264_hpel_filter_h_neon @@ -1371,7 +1341,7 @@ vst1.64 {d6-d7}, [r0,:128]! 
bgt filter_h_loop bx lr -.endfunc +endfunc // frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, @@ -1463,4 +1433,149 @@ vpop {d8-d15} pop {r4-r10,pc} -.endfunc +endfunc + +function x264_load_deinterleave_chroma_fdec_neon + mov ip, #FDEC_STRIDE/2 +1: + vld2.8 {d0-d1}, [r1,:128], r2 + subs r3, r3, #1 + pld [r1] + vst1.8 {d0}, [r0,:64], ip + vst1.8 {d1}, [r0,:64], ip + bgt 1b + + bx lr +endfunc + +function x264_load_deinterleave_chroma_fenc_neon + mov ip, #FENC_STRIDE/2 +1: + vld2.8 {d0-d1}, [r1,:128], r2 + subs r3, r3, #1 + pld [r1] + vst1.8 {d0}, [r0,:64], ip + vst1.8 {d1}, [r0,:64], ip + bgt 1b + + bx lr +endfunc + +function x264_plane_copy_deinterleave_neon + push {r4-r7, lr} + ldrd r6, r7, [sp, #28] + ldrd r4, r5, [sp, #20] + add lr, r6, #15 + bic lr, lr, #15 + sub r1, r1, lr + sub r3, r3, lr + sub r5, r5, lr, lsl #1 +block: + vld2.8 {d0-d3}, [r4,:128]! + subs lr, lr, #16 + vst1.8 {q0}, [r0]! + vst1.8 {q1}, [r2]! + bgt block + + add r4, r4, r5 + subs r7, r7, #1 + add r0, r0, r1 + add r2, r2, r3 + mov lr, r6 + bgt block + + pop {r4-r7, pc} +endfunc + +function x264_plane_copy_deinterleave_rgb_neon + push {r4-r8, r10, r11, lr} + ldrd r4, r5, [sp, #32] + ldrd r6, r7, [sp, #40] + ldr r8, [sp, #48] + ldrd r10, r11, [sp, #52] + add lr, r10, #7 + subs r8, r8, #3 + bic lr, lr, #7 + sub r7, r7, lr, lsl #1 + sub r1, r1, lr + sub r3, r3, lr + sub r5, r5, lr + subne r7, r7, lr, lsl #1 + subeq r7, r7, lr + bne block4 +block3: + vld3.8 {d0,d1,d2}, [r6]! + subs lr, lr, #8 + vst1.8 {d0}, [r0]! + vst1.8 {d1}, [r2]! + vst1.8 {d2}, [r4]! + bgt block3 + + subs r11, r11, #1 + add r0, r0, r1 + add r2, r2, r3 + add r4, r4, r5 + add r6, r6, r7 + mov lr, r10 + bgt block3 + + pop {r4-r8, r10, r11, pc} +block4: + vld4.8 {d0,d1,d2,d3}, [r6]! + subs lr, lr, #8 + vst1.8 {d0}, [r0]! + vst1.8 {d1}, [r2]! + vst1.8 {d2}, [r4]! + bgt block4 + + subs r11, r11, #1 + add r0, r0, r1 + add r2, r2, r3 + add r4, r4, r5 + add r6, r6, r7 + mov lr, r10 + bgt block4 + + pop {r4-r8, r10, r11, pc} +endfunc + +function x264_plane_copy_interleave_neon + push {r4-r7, lr} + ldrd r6, r7, [sp, #28] + ldrd r4, r5, [sp, #20] + add lr, r6, #15 + bic lr, lr, #15 + sub r1, r1, lr, lsl #1 + sub r3, r3, lr + sub r5, r5, lr +blocki: + vld1.8 {q0}, [r2]! + vld1.8 {q1}, [r4]! + subs lr, lr, #16 + vst2.8 {d0,d2}, [r0]! + vst2.8 {d1,d3}, [r0]! + bgt blocki + + subs r7, r7, #1 + add r0, r0, r1 + add r2, r2, r3 + add r4, r4, r5 + mov lr, r6 + bgt blocki + + pop {r4-r7, pc} +endfunc + +function x264_store_interleave_chroma_neon + push {lr} + ldr lr, [sp, #4] + mov ip, #FDEC_STRIDE +1: + vld1.8 {d0}, [r2], ip + vld1.8 {d1}, [r3], ip + subs lr, lr, #1 + vst2.8 {d0,d1}, [r0,:128], r1 + bgt 1b + + pop {pc} +endfunc
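Among the additions here, the plane-copy and chroma load/store helpers split or merge interleaved (NV12-style) chroma. A scalar sketch of the deinterleave case (illustrative only; the NEON version above processes 16 pixels per iteration with vld2/vst1):

#include <stdint.h>

/* Illustrative scalar form of plane_copy_deinterleave: split an interleaved
 * UVUV... source plane into separate U and V planes. */
static void plane_copy_deinterleave_scalar( uint8_t *dstu, intptr_t i_dstu,
                                            uint8_t *dstv, intptr_t i_dstv,
                                            const uint8_t *src, intptr_t i_src,
                                            int w, int h )
{
    for( int y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src )
        for( int x = 0; x < w; x++ )
        {
            dstu[x] = src[2*x];
            dstv[x] = src[2*x+1];
        }
}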
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/mc-c.c -> x264-snapshot-20141104-2245.tar.bz2/common/arm/mc-c.c
Changed
@@ -37,6 +37,7 @@ void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); @@ -46,13 +47,28 @@ void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, + pixel *dstv, intptr_t i_dstv, + pixel *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta, + pixel *dstb, intptr_t i_dstb, + pixel *dstc, intptr_t i_dstc, + pixel *src, intptr_t i_src, int pw, int w, int h ); +void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); + +void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); + #define MC_WEIGHT(func)\ void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ \ -static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\ +static weight_fn_t x264_mc##func##_wtab_neon[6] =\ {\ x264_mc_weight_w4##func##_neon,\ x264_mc_weight_w4##func##_neon,\ @@ -72,7 +88,7 @@ void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); -void x264_mc_chroma_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int ); +void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int ); void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int ); void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int ); @@ -224,11 +240,20 @@ pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; + pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; + pf->plane_copy_interleave = x264_plane_copy_interleave_neon; + + pf->store_interleave_chroma = x264_store_interleave_chroma_neon; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon; + pf->load_deinterleave_chroma_fenc = 
x264_load_deinterleave_chroma_fenc_neon; + pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon; pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon; pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon; pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon; + pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_neon; pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon;
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/pixel-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/pixel-a.S
Changed
@@ -25,14 +25,15 @@ #include "asm.S" -.fpu neon .section .rodata .align 4 -.rept 16 .byte 0xff +.rept 16 + .byte 0xff .endr mask_ff: -.rept 16 .byte 0 +.rept 16 + .byte 0 .endr mask_ac4: @@ -60,7 +61,7 @@ .endr usada8 r0, r6, lr, ip pop {r4-r6,pc} -.endfunc +endfunc .endm SAD4_ARMV6 4 @@ -137,7 +138,7 @@ vpaddl.u16 d0, d0 vmov.u32 r0, d0[0] bx lr -.endfunc +endfunc .endm SAD_FUNC 4, 4 @@ -222,7 +223,7 @@ vpaddl.u16 d0, d0 vmov.u32 r0, d0[0] bx lr -.endfunc +endfunc .endm SAD_FUNC_DUAL 8, 4 @@ -368,7 +369,7 @@ vst1.32 {d0-d1}, [r7] .endif pop {r6-r7,pc} -.endfunc +endfunc .endm SAD_X_FUNC 3, 4, 4 @@ -477,7 +478,7 @@ vpadd.s32 d0, d0, d0 vmov.32 r0, d0[0] bx lr -.endfunc +endfunc .endm SSD_FUNC 4, 4 @@ -517,7 +518,7 @@ vld1.64 {d26}, [r0,:64], r1 VAR_SQR_SUM q2, q10, q15, d26 b x264_var_end -.endfunc +endfunc function x264_pixel_var_8x16_neon vld1.64 {d16}, [r0,:64], r1 @@ -549,7 +550,7 @@ 2: VAR_SQR_SUM q2, q13, q15, d22 b x264_var_end -.endfunc +endfunc function x264_pixel_var_16x16_neon vld1.64 {d16-d17}, [r0,:128], r1 @@ -573,9 +574,9 @@ VAR_SQR_SUM q1, q12, q14, d18 VAR_SQR_SUM q2, q13, q15, d19 bgt var16_loop -.endfunc +endfunc -function x264_var_end +function x264_var_end, export=0 vpaddl.u16 q8, q14 vpaddl.u16 q9, q15 vadd.u32 q1, q1, q8 @@ -588,7 +589,7 @@ vmov r0, r1, d0 bx lr -.endfunc +endfunc .macro DIFF_SUM diff da db lastdiff vld1.64 {\da}, [r0,:64], r1 @@ -633,7 +634,7 @@ mul r0, r0, r0 sub r0, r1, r0, lsr #6 bx lr -.endfunc +endfunc function x264_pixel_var2_8x16_neon vld1.64 {d16}, [r0,:64], r1 @@ -677,7 +678,7 @@ mul r0, r0, r0 sub r0, r1, r0, lsr #7 bx lr -.endfunc +endfunc .macro LOAD_DIFF_8x4 q0 q1 q2 q3 vld1.32 {d1}, [r2], r3 @@ -714,7 +715,7 @@ HORIZ_ADD d0, d0, d1 vmov.32 r0, d0[0] bx lr -.endfunc +endfunc function x264_pixel_satd_4x8_neon vld1.32 {d1[]}, [r2], r3 @@ -741,7 +742,7 @@ vsubl.u8 q3, d6, d7 SUMSUB_AB q10, q11, q2, q3 b x264_satd_4x8_8x4_end_neon -.endfunc +endfunc function x264_pixel_satd_8x4_neon vld1.64 {d1}, [r2], r3 @@ -758,9 +759,9 @@ vld1.64 {d6}, [r0,:64], r1 vsubl.u8 q3, d6, d7 SUMSUB_AB q10, q11, q2, q3 -.endfunc +endfunc -function x264_satd_4x8_8x4_end_neon +function x264_satd_4x8_8x4_end_neon, export=0 vadd.s16 q0, q8, q10 vadd.s16 q1, q9, q11 vsub.s16 q2, q8, q10 @@ -785,7 +786,7 @@ HORIZ_ADD d0, d0, d1 vmov.32 r0, d0[0] bx lr -.endfunc +endfunc function x264_pixel_satd_8x8_neon mov ip, lr @@ -799,7 +800,7 @@ mov lr, ip vmov.32 r0, d0[0] bx lr -.endfunc +endfunc function x264_pixel_satd_8x16_neon vpush {d8-d11} @@ -821,9 +822,9 @@ mov lr, ip vmov.32 r0, d0[0] bx lr -.endfunc +endfunc -function x264_satd_8x8_neon +function x264_satd_8x8_neon, export=0 LOAD_DIFF_8x4 q8, q9, q10, q11 vld1.64 {d7}, [r2], r3 SUMSUB_AB q0, q1, q8, q9 @@ -841,10 +842,10 @@ SUMSUB_AB q9, q11, q1, q3 vld1.64 {d0}, [r0,:64], r1 vsubl.u8 q15, d0, d1 -.endfunc +endfunc // one vertical hadamard pass and two horizontal -function x264_satd_8x4v_8x8h_neon +function x264_satd_8x4v_8x8h_neon, export=0 SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15 vtrn.16 q8, q9 SUMSUB_AB q12, q14, q0, q2 @@ -870,7 +871,7 @@ vmax.s16 q14, q8, q10 vmax.s16 q15, q9, q11 bx lr -.endfunc +endfunc function x264_pixel_satd_16x8_neon vpush {d8-d11} @@ -892,7 +893,7 @@ mov lr, ip vmov.32 r0, d0[0] bx lr -.endfunc +endfunc function x264_pixel_satd_16x16_neon vpush {d8-d11} @@ -926,9 +927,9 @@ mov lr, ip vmov.32 r0, d0[0] bx lr -.endfunc +endfunc -function x264_satd_16x4_neon +function x264_satd_16x4_neon, export=0 vld1.64 {d2-d3}, [r2], r3 vld1.64 {d0-d1}, [r0,:128], r1 vsubl.u8 q8, d0, d2 @@ -950,7 
+951,7 @@ SUMSUB_AB q2, q3, q10, q11 SUMSUB_ABCD q8, q10, q9, q11, q0, q2, q1, q3 b x264_satd_8x4v_8x8h_neon -.endfunc +endfunc function x264_pixel_sa8d_8x8_neon @@ -963,7 +964,7 @@ add r0, r0, #1 lsr r0, r0, #1 bx lr -.endfunc +endfunc function x264_pixel_sa8d_16x16_neon vpush {d8-d11} @@ -995,14 +996,14 @@ add r0, r0, #1 lsr r0, r0, #1 bx lr -.endfunc +endfunc .macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4 SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4 SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4 .endm -function x264_sa8d_8x8_neon +function x264_sa8d_8x8_neon, export=0 LOAD_DIFF_8x4 q8, q9, q10, q11 vld1.64 {d7}, [r2], r3 SUMSUB_AB q0, q1, q8, q9 @@ -1058,7 +1059,7 @@ vadd.i16 q8, q8, q9 vadd.i16 q9, q10, q11 bx lr -.endfunc +endfunc .macro HADAMARD_AC w h @@ -1094,7 +1095,7 @@ lsr r0, r0, #1 lsr r1, r1, #2 bx lr -.endfunc +endfunc .endm HADAMARD_AC 8, 8 @@ -1103,7 +1104,7 @@ HADAMARD_AC 16, 16 // q4: satd q5: sa8d q6: mask_ac4 q7: mask_ac8 -function x264_hadamard_ac_8x8_neon +function x264_hadamard_ac_8x8_neon, export=0 vld1.64 {d2}, [r0,:64], r1 vld1.64 {d3}, [r0,:64], r1 vaddl.u8 q0, d2, d3 @@ -1189,7 +1190,7 @@ vadd.s16 q2, q2, q14 vpadal.u16 q5, q2 bx lr -.endfunc +endfunc .macro SSIM_ITER n ssa s12 ssb lastssa lasts12 lastssb da db dnext @@ -1243,7 +1244,7 @@ vst4.32 {d0-d3}, [ip] bx lr -.endfunc +endfunc // FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2 function x264_pixel_ssim_end4_neon @@ -1314,4 +1315,4 @@ vpadd.f32 d0, d0, d0 vmov.32 r0, d0[0] bx lr -.endfunc +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/predict-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/predict-a.S
Changed
@@ -26,8 +26,6 @@ #include "asm.S" -.fpu neon - .section .rodata .align 4 @@ -77,7 +75,16 @@ add ip, ip, ip, lsl #16 str ip, [r0, #3*FDEC_STRIDE] bx lr -.endfunc +endfunc + +function x264_predict_4x4_v_armv6 + ldr r1, [r0, #0 - 1 * FDEC_STRIDE] + str r1, [r0, #0 + 0 * FDEC_STRIDE] + str r1, [r0, #0 + 1 * FDEC_STRIDE] + str r1, [r0, #0 + 2 * FDEC_STRIDE] + str r1, [r0, #0 + 3 * FDEC_STRIDE] + bx lr +endfunc function x264_predict_4x4_dc_armv6 mov ip, #0 @@ -100,7 +107,7 @@ str r1, [r0, #2*FDEC_STRIDE] str r1, [r0, #3*FDEC_STRIDE] bx lr -.endfunc +endfunc function x264_predict_4x4_dc_top_neon mov r12, #FDEC_STRIDE @@ -115,7 +122,7 @@ vst1.32 d1[0], [r0,:32], r12 vst1.32 d1[0], [r0,:32], r12 bx lr -.endfunc +endfunc // return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1 @@ -158,7 +165,7 @@ add r5, r5, r4, lsr #8 str r5, [r0, #3*FDEC_STRIDE] pop {r4-r6,pc} -.endfunc +endfunc function x264_predict_4x4_ddl_neon sub r0, #FDEC_STRIDE @@ -177,7 +184,7 @@ vst1.32 {d2[0]}, [r0,:32], ip vst1.32 {d3[0]}, [r0,:32], ip bx lr -.endfunc +endfunc function x264_predict_8x8_dc_neon mov ip, #0 @@ -201,7 +208,7 @@ vst1.64 {d0}, [r0,:64], ip .endr pop {r4-r5,pc} -.endfunc +endfunc function x264_predict_8x8_h_neon add r1, r1, #7 @@ -224,7 +231,7 @@ vst1.64 {d6}, [r0,:64], ip vst1.64 {d7}, [r0,:64], ip bx lr -.endfunc +endfunc function x264_predict_8x8_v_neon add r1, r1, #16 @@ -234,7 +241,7 @@ vst1.8 {d0}, [r0,:64], r12 .endr bx lr -.endfunc +endfunc function x264_predict_8x8_ddl_neon add r1, #16 @@ -262,7 +269,7 @@ vst1.8 d2, [r0,:64], r12 vst1.8 d1, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_ddr_neon vld1.8 {d0-d3}, [r1,:128] @@ -292,7 +299,7 @@ vst1.8 {d4}, [r0,:64], r12 vst1.8 {d5}, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_vl_neon add r1, #16 @@ -323,7 +330,7 @@ vst1.8 {d3}, [r0,:64], r12 vst1.8 {d2}, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_vr_neon add r1, #8 @@ -355,7 +362,7 @@ vst1.8 {d6}, [r0,:64], r12 vst1.8 {d3}, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_hd_neon mov r12, #FDEC_STRIDE @@ -388,7 +395,7 @@ vst1.8 {d16}, [r0,:64], r12 bx lr -.endfunc +endfunc function x264_predict_8x8_hu_neon mov r12, #FDEC_STRIDE @@ -421,7 +428,7 @@ vst1.8 {d7}, [r0,:64], r12 vst1.8 {d17}, [r0,:64] bx lr -.endfunc +endfunc function x264_predict_8x8c_dc_top_neon sub r2, r0, #FDEC_STRIDE @@ -434,7 +441,7 @@ vdup.8 d0, d0[0] vtrn.32 d0, d1 b pred8x8_dc_end -.endfunc +endfunc function x264_predict_8x8c_dc_left_neon mov r1, #FDEC_STRIDE @@ -446,7 +453,7 @@ vdup.8 d1, d0[1] vdup.8 d0, d0[0] b pred8x8_dc_end -.endfunc +endfunc function x264_predict_8x8c_dc_neon sub r2, r0, #FDEC_STRIDE @@ -472,7 +479,7 @@ vst1.8 {d1}, [r2,:64], r1 .endr bx lr -.endfunc +endfunc function x264_predict_8x8c_h_neon sub r1, r0, #1 @@ -484,7 +491,7 @@ vst1.64 {d2}, [r0,:64], ip .endr bx lr -.endfunc +endfunc function x264_predict_8x8c_v_neon sub r0, r0, #FDEC_STRIDE @@ -494,7 +501,7 @@ vst1.64 {d0}, [r0,:64], ip .endr bx lr -.endfunc +endfunc function x264_predict_8x8c_p_neon sub r3, r0, #FDEC_STRIDE @@ -547,7 +554,7 @@ subs r3, r3, #1 bne 1b bx lr -.endfunc +endfunc function x264_predict_16x16_dc_top_neon @@ -558,7 +565,7 @@ vrshrn.u16 d0, q0, #4 vdup.8 q0, d0[0] b pred16x16_dc_end -.endfunc +endfunc function x264_predict_16x16_dc_left_neon mov r1, #FDEC_STRIDE @@ -569,7 +576,7 @@ vrshrn.u16 d0, q0, #4 vdup.8 q0, d0[0] b pred16x16_dc_end -.endfunc +endfunc function x264_predict_16x16_dc_neon sub r3, r0, 
#FDEC_STRIDE @@ -607,7 +614,7 @@ vst1.64 {d0-d1}, [r0,:128], r1 .endr bx lr -.endfunc +endfunc function x264_predict_16x16_h_neon sub r1, r0, #1 @@ -621,7 +628,7 @@ vst1.64 {d2-d3}, [r0,:128], ip .endr bx lr -.endfunc +endfunc function x264_predict_16x16_v_neon sub r0, r0, #FDEC_STRIDE @@ -631,7 +638,7 @@ vst1.64 {d0-d1}, [r0,:128], ip .endr bx lr -.endfunc +endfunc function x264_predict_16x16_p_neon sub r3, r0, #FDEC_STRIDE @@ -688,4 +695,4 @@ subs r3, r3, #1 bne 1b bx lr -.endfunc +endfunc
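The only new predictor in this file, x264_predict_4x4_v_armv6, replicates the row of four pixels above the block into all four rows. A scalar sketch (FDEC_STRIDE is 32, as defined in asm.S; the function name here is illustrative):

#include <stdint.h>
#include <string.h>

#define FDEC_STRIDE 32

/* Illustrative scalar form of 4x4 vertical intra prediction: copy the four
 * pixels directly above the block into each of its four rows. */
static void predict_4x4_v_scalar( uint8_t *src )
{
    uint8_t top[4];
    memcpy( top, &src[-FDEC_STRIDE], 4 );
    for( int y = 0; y < 4; y++ )
        memcpy( &src[y * FDEC_STRIDE], top, 4 );
}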
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/predict-c.c -> x264-snapshot-20141104-2245.tar.bz2/common/arm/predict-c.c
Changed
@@ -27,36 +27,6 @@ #include "predict.h" #include "pixel.h" -void x264_predict_4x4_dc_armv6( uint8_t *src ); -void x264_predict_4x4_dc_top_neon( uint8_t *src ); -void x264_predict_4x4_h_armv6( uint8_t *src ); -void x264_predict_4x4_ddr_armv6( uint8_t *src ); -void x264_predict_4x4_ddl_neon( uint8_t *src ); - -void x264_predict_8x8c_dc_neon( uint8_t *src ); -void x264_predict_8x8c_dc_top_neon( uint8_t *src ); -void x264_predict_8x8c_dc_left_neon( uint8_t *src ); -void x264_predict_8x8c_h_neon( uint8_t *src ); -void x264_predict_8x8c_v_neon( uint8_t *src ); -void x264_predict_8x8c_p_neon( uint8_t *src ); - -void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] ); - -void x264_predict_16x16_dc_neon( uint8_t *src ); -void x264_predict_16x16_dc_top_neon( uint8_t *src ); -void x264_predict_16x16_dc_left_neon( uint8_t *src ); -void x264_predict_16x16_h_neon( uint8_t *src ); -void x264_predict_16x16_v_neon( uint8_t *src ); -void x264_predict_16x16_p_neon( uint8_t *src ); - void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] ) { if (!(cpu&X264_CPU_ARMV6)) @@ -64,6 +34,7 @@ #if !HIGH_BIT_DEPTH pf[I_PRED_4x4_H] = x264_predict_4x4_h_armv6; + pf[I_PRED_4x4_V] = x264_predict_4x4_v_armv6; pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_armv6; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/predict.h -> x264-snapshot-20141104-2245.tar.bz2/common/arm/predict.h
Changed
@@ -26,15 +26,36 @@ #ifndef X264_ARM_PREDICT_H #define X264_ARM_PREDICT_H -void x264_predict_8x8_v_neon( pixel *src, pixel edge[36] ); -void x264_predict_8x8_h_neon( pixel *src, pixel edge[36] ); -void x264_predict_8x8_dc_neon( pixel *src, pixel edge[36] ); -void x264_predict_8x8c_dc_neon( pixel *src ); -void x264_predict_8x8c_h_neon( pixel *src ); -void x264_predict_8x8c_v_neon( pixel *src ); -void x264_predict_16x16_v_neon( pixel *src ); -void x264_predict_16x16_h_neon( pixel *src ); -void x264_predict_16x16_dc_neon( pixel *src ); +void x264_predict_4x4_dc_armv6( uint8_t *src ); +void x264_predict_4x4_dc_top_neon( uint8_t *src ); +void x264_predict_4x4_v_armv6( uint8_t *src ); +void x264_predict_4x4_h_armv6( uint8_t *src ); +void x264_predict_4x4_ddr_armv6( uint8_t *src ); +void x264_predict_4x4_ddl_neon( uint8_t *src ); + +void x264_predict_8x8c_dc_neon( uint8_t *src ); +void x264_predict_8x8c_dc_top_neon( uint8_t *src ); +void x264_predict_8x8c_dc_left_neon( uint8_t *src ); +void x264_predict_8x8c_h_neon( uint8_t *src ); +void x264_predict_8x8c_v_neon( uint8_t *src ); +void x264_predict_8x8c_p_neon( uint8_t *src ); + +void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] ); + +void x264_predict_16x16_dc_neon( uint8_t *src ); +void x264_predict_16x16_dc_top_neon( uint8_t *src ); +void x264_predict_16x16_dc_left_neon( uint8_t *src ); +void x264_predict_16x16_h_neon( uint8_t *src ); +void x264_predict_16x16_v_neon( uint8_t *src ); +void x264_predict_16x16_p_neon( uint8_t *src ); void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] ); void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/quant-a.S -> x264-snapshot-20141104-2245.tar.bz2/common/arm/quant-a.S
Changed
@@ -25,8 +25,6 @@ #include "asm.S" -.fpu neon - .section .rodata .align 4 pmovmskb_byte: @@ -80,7 +78,7 @@ vsub.s16 d3, d3, d0 vst1.64 {d3}, [r0,:64] QUANT_END d3 -.endfunc +endfunc // quant_4x4_dc( int16_t dct[16], int mf, int bias ) function x264_quant_4x4_dc_neon @@ -92,7 +90,7 @@ QUANT_TWO q0, q0, d4, d5, d4, d5, q0 vorr d0, d0, d1 QUANT_END d0 -.endfunc +endfunc // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) function x264_quant_4x4_neon @@ -104,7 +102,7 @@ QUANT_TWO q0, q1, d4, d5, d6, d7, q0 vorr d0, d0, d1 QUANT_END d0 -.endfunc +endfunc // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ) function x264_quant_4x4x4_neon @@ -145,7 +143,7 @@ orrne r0, #8 vpop {d8-d15} bx lr -.endfunc +endfunc // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) function x264_quant_8x8_neon @@ -165,7 +163,7 @@ .endr vorr d0, d0, d1 QUANT_END d0 -.endfunc +endfunc .macro DEQUANT_START mf_size offset dc=no mov r3, #0x2b @@ -257,7 +255,7 @@ bgt dequant_\size\()_rshift_loop .endif bx lr -.endfunc +endfunc .endm DEQUANT 4x4, 4 @@ -307,7 +305,7 @@ vmovn.s32 d3, q13 vst1.16 {d0-d3}, [r0,:128] bx lr -.endfunc +endfunc // int coeff_last( int16_t *l ) @@ -319,7 +317,21 @@ lsrs r2, r2, #16 addne r0, r0, #1 bx lr -.endfunc +endfunc + +function x264_coeff_last8_arm + ldrd r2, r3, [r0, #8] + orrs ip, r2, r3 + movne r0, #4 + ldrdeq r2, r3, [r0] + moveq r0, #0 + tst r3, r3 + addne r0, #2 + movne r2, r3 + lsrs r2, r2, #16 + addne r0, r0, #1 + bx lr +endfunc .macro COEFF_LAST_1x size function x264_coeff_last\size\()_neon @@ -344,7 +356,7 @@ subslt r0, r3, r0, lsr #2 movlt r0, #0 bx lr -.endfunc +endfunc .endm COEFF_LAST_1x 15 @@ -393,4 +405,4 @@ subslt r0, ip, r0 movlt r0, #0 bx lr -.endfunc +endfunc
View file
x264-snapshot-20140321-2245.tar.bz2/common/arm/quant.h -> x264-snapshot-20141104-2245.tar.bz2/common/arm/quant.h
Changed
@@ -39,6 +39,7 @@ void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp ); int x264_coeff_last4_arm( int16_t * ); +int x264_coeff_last8_arm( int16_t * ); int x264_coeff_last15_neon( int16_t * ); int x264_coeff_last16_neon( int16_t * ); int x264_coeff_last64_neon( int16_t * );
View file
x264-snapshot-20140321-2245.tar.bz2/common/bitstream.c -> x264-snapshot-20141104-2245.tar.bz2/common/bitstream.c
Changed
@@ -4,7 +4,7 @@ * Copyright (C) 2003-2014 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/bitstream.h -> x264-snapshot-20141104-2245.tar.bz2/common/bitstream.h
Changed
@@ -4,7 +4,7 @@ * Copyright (C) 2003-2014 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * Laurent Aimar <fenrir@via.ecp.fr> * * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/cabac.c -> x264-snapshot-20141104-2245.tar.bz2/common/cabac.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/common.h -> x264-snapshot-20141104-2245.tar.bz2/common/common.h
Changed
@@ -316,8 +316,8 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop ) { - int amvd0 = abs(mvdleft[0]) + abs(mvdtop[0]); - int amvd1 = abs(mvdleft[1]) + abs(mvdtop[1]); + int amvd0 = mvdleft[0] + mvdtop[0]; + int amvd1 = mvdleft[1] + mvdtop[1]; amvd0 = (amvd0 > 2) + (amvd0 > 32); amvd1 = (amvd1 > 2) + (amvd1 > 32); return amvd0 + (amvd1<<8);
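Since mvdleft and mvdtop point at uint8_t, the dropped abs() calls were no-ops; the bucketing that remains maps each summed MVD magnitude to one of three CABAC context choices. A small illustrative helper:

/* Illustrative form of the bucketing kept above: 0 for sums <= 2,
 * 1 for 3..32, 2 for anything larger. */
static int mvd_ctx_bucket( int amvd )
{
    return ( amvd > 2 ) + ( amvd > 32 );
}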
View file
x264-snapshot-20140321-2245.tar.bz2/common/cpu.c -> x264-snapshot-20141104-2245.tar.bz2/common/cpu.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -89,6 +89,9 @@ {"ARMv6", X264_CPU_ARMV6}, {"NEON", X264_CPU_NEON}, {"FastNeonMRC", X264_CPU_FAST_NEON_MRC}, +#elif ARCH_AARCH64 + {"ARMv8", X264_CPU_ARMV8}, + {"NEON", X264_CPU_NEON}, #endif {"", 0}, }; @@ -338,6 +341,9 @@ uint32_t x264_cpu_detect( void ) { +#ifdef __NO_FPRS__ + return 0; +#else static void (*oldsig)( int ); oldsig = signal( SIGILL, sigill_handler ); @@ -357,6 +363,7 @@ signal( SIGILL, oldsig ); return X264_CPU_ALTIVEC; +#endif } #endif @@ -405,6 +412,13 @@ return flags; } +#elif ARCH_AARCH64 + +uint32_t x264_cpu_detect( void ) +{ + return X264_CPU_ARMV8 | X264_CPU_NEON; +} + #else uint32_t x264_cpu_detect( void )
View file
x264-snapshot-20140321-2245.tar.bz2/common/dct.c -> x264-snapshot-20141104-2245.tar.bz2/common/dct.c
Changed
@@ -35,6 +35,9 @@ #if ARCH_ARM # include "arm/dct.h" #endif +#if ARCH_AARCH64 +# include "aarch64/dct.h" +#endif /* the inverse of the scaling factors introduced by 8x8 fdct */ /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */ @@ -723,7 +726,7 @@ } #endif -#if HAVE_ARMV6 +#if HAVE_ARMV6 || ARCH_AARCH64 if( cpu&X264_CPU_NEON ) { dctf->sub4x4_dct = x264_sub4x4_dct_neon; @@ -999,10 +1002,10 @@ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec; } #endif -#if HAVE_ARMV6 +#if HAVE_ARMV6 || ARCH_AARCH64 if( cpu&X264_CPU_NEON ) pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; -#endif +#endif // HAVE_ARMV6 || ARCH_AARCH64 #endif // HIGH_BIT_DEPTH pf_interlaced->interleave_8x8_cavlc =
View file
x264-snapshot-20140321-2245.tar.bz2/common/deblock.c -> x264-snapshot-20141104-2245.tar.bz2/common/deblock.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * Henrik Gramner <henrik@gramner.com> * * This program is free software; you can redistribute it and/or modify @@ -729,11 +729,14 @@ void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #endif // ARCH_PPC -#if HAVE_ARMV6 +#if HAVE_ARMV6 || ARCH_AARCH64 void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); #endif void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) @@ -835,13 +838,14 @@ } #endif // HAVE_ALTIVEC -#if HAVE_ARMV6 +#if HAVE_ARMV6 || ARCH_AARCH64 if( cpu&X264_CPU_NEON ) { pf->deblock_luma[1] = x264_deblock_v_luma_neon; pf->deblock_luma[0] = x264_deblock_h_luma_neon; pf->deblock_chroma[1] = x264_deblock_v_chroma_neon; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon; + pf->deblock_strength = x264_deblock_strength_neon; } #endif #endif // !HIGH_BIT_DEPTH
View file
x264-snapshot-20140321-2245.tar.bz2/common/frame.c -> x264-snapshot-20141104-2245.tar.bz2/common/frame.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/frame.h -> x264-snapshot-20141104-2245.tar.bz2/common/frame.h
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/macroblock.c -> x264-snapshot-20141104-2245.tar.bz2/common/macroblock.c
Changed
@@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2003-2014 x264 project * - * Authors: Jason Garrett-Glaser <darkshikari@gmail.com> + * Authors: Fiona Glaser <fiona@x264.com> * Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> * Henrik Gramner <henrik@gramner.com>
View file
x264-snapshot-20140321-2245.tar.bz2/common/macroblock.h -> x264-snapshot-20141104-2245.tar.bz2/common/macroblock.h
Changed
@@ -5,7 +5,7 @@ * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/mc.c -> x264-snapshot-20141104-2245.tar.bz2/common/mc.c
Changed
@@ -35,6 +35,9 @@ #if ARCH_ARM #include "arm/mc.h" #endif +#if ARCH_AARCH64 +#include "aarch64/mc.h" +#endif static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride, @@ -641,6 +644,9 @@ #if HAVE_ARMV6 x264_mc_init_arm( cpu, pf ); #endif +#if ARCH_AARCH64 + x264_mc_init_aarch64( cpu, pf ); +#endif if( cpu_independent ) {
View file
x264-snapshot-20140321-2245.tar.bz2/common/mvpred.c -> x264-snapshot-20141104-2245.tar.bz2/common/mvpred.c
Changed
@@ -4,7 +4,7 @@ * Copyright (C) 2003-2014 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * Laurent Aimar <fenrir@via.ecp.fr> * * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/opencl.c -> x264-snapshot-20141104-2245.tar.bz2/common/opencl.c
Changed
@@ -135,7 +135,8 @@ rewind( fp ); CHECKED_MALLOC( binary, size ); - fread( binary, 1, size, fp ); + if ( fread( binary, 1, size, fp ) != size ) + goto fail; const uint8_t *ptr = (const uint8_t*)binary; #define CHECK_STRING( STR )\
View file
x264-snapshot-20140321-2245.tar.bz2/common/osdep.h -> x264-snapshot-20141104-2245.tar.bz2/common/osdep.h
Changed
@@ -48,7 +48,7 @@ #define log2(x) (log(x)/0.693147180559945) #endif -#ifdef __ICL +#ifdef _MSC_VER #define inline __inline #define strcasecmp _stricmp #define strncasecmp _strnicmp @@ -57,10 +57,6 @@ #define S_ISREG(x) (((x) & S_IFMT) == S_IFREG) #endif -#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && (ARCH_X86 || ARCH_X86_64) -#define HAVE_X86_INLINE_ASM 1 -#endif - #if !defined(isfinite) && (SYS_OPENBSD || SYS_SunOS) #define isfinite finite #endif @@ -89,7 +85,7 @@ #define x264_is_pipe(x) 0 #endif -#ifdef __ICL +#ifdef _MSC_VER #define DECLARE_ALIGNED( var, n ) __declspec(align(n)) var #else #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n))) @@ -156,7 +152,7 @@ #define x264_constant_p(x) __builtin_constant_p(x) #define x264_nonconstant_p(x) (!__builtin_constant_p(x)) #else -#ifdef __ICL +#ifdef _MSC_VER #define ALWAYS_INLINE __forceinline #define NOINLINE __declspec(noinline) #else
View file
x264-snapshot-20140321-2245.tar.bz2/common/pixel.c -> x264-snapshot-20141104-2245.tar.bz2/common/pixel.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,8 +38,9 @@ # include "arm/pixel.h" # include "arm/predict.h" #endif -#if ARCH_UltraSPARC -# include "sparc/pixel.h" +#if ARCH_AARCH64 +# include "aarch64/pixel.h" +# include "aarch64/predict.h" #endif @@ -200,7 +201,7 @@ #define PIXEL_VAR2_C( name, w, h, shift ) \ static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \ { \ - uint32_t var = 0, sum = 0, sqr = 0; \ + int var = 0, sum = 0, sqr = 0; \ for( int y = 0; y < h; y++ ) \ { \ for( int x = 0; x < w; x++ ) \ @@ -212,8 +213,7 @@ pix1 += i_stride1; \ pix2 += i_stride2; \ } \ - sum = abs(sum); \ - var = sqr - ((uint64_t)sum * sum >> shift); \ + var = sqr - ((int64_t)sum * sum >> shift); \ *ssd = sqr; \ return var; \ } @@ -454,15 +454,6 @@ SAD_X( 4x8 ) SAD_X( 4x4 ) -#if !HIGH_BIT_DEPTH -#if ARCH_UltraSPARC -SAD_X( 16x16_vis ) -SAD_X( 16x8_vis ) -SAD_X( 8x16_vis ) -SAD_X( 8x8_vis ) -#endif -#endif // !HIGH_BIT_DEPTH - /**************************************************************************** * pixel_satd_x4 * no faster than single satd, but needed for satd to be a drop-in replacement for sad @@ -509,7 +500,7 @@ #endif #if !HIGH_BIT_DEPTH -#if HAVE_ARMV6 +#if HAVE_ARMV6 || ARCH_AARCH64 SATD_X_DECL7( _neon ) #endif #endif // !HIGH_BIT_DEPTH @@ -533,7 +524,7 @@ INTRA_MBCMP_8x8( sad, _mmx2, _c ) INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 ) #endif -#if !HIGH_BIT_DEPTH && HAVE_ARMV6 +#if !HIGH_BIT_DEPTH && (HAVE_ARMV6 || ARCH_AARCH64) INTRA_MBCMP_8x8( sad, _neon, _neon ) INTRA_MBCMP_8x8(sa8d, _neon, _neon ) #endif @@ -593,8 +584,18 @@ #endif #endif #if !HIGH_BIT_DEPTH && HAVE_ARMV6 -INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _c ) -INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _c ) +INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _armv6 ) +INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _armv6 ) +INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon ) +INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon ) +INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c ) +INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c ) +INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon ) +INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon ) +#endif +#if !HIGH_BIT_DEPTH && ARCH_AARCH64 +INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _neon ) +INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _neon ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon ) INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon ) INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c ) @@ -1021,8 +1022,16 @@ } if( cpu&X264_CPU_XOP ) { + INIT5( sad_x3, _xop ); + INIT5( sad_x4, _xop ); + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop; pixf->vsad = x264_pixel_vsad_xop; pixf->asd8 = x264_pixel_asd8_xop; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop; +#endif } if( cpu&X264_CPU_AVX2 ) { @@ -1308,6 +1317,7 @@ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop; pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop; pixf->var[PIXEL_8x8] = 
x264_pixel_var_8x8_xop; @@ -1394,6 +1404,46 @@ } } #endif + +#if ARCH_AARCH64 + if( cpu&X264_CPU_NEON ) + { + INIT7( sad, _neon ); + // AArch64 has no distinct instructions for aligned load/store + INIT7_NAME( sad_aligned, sad, _neon ); + INIT7( sad_x3, _neon ); + INIT7( sad_x4, _neon ); + INIT7( ssd, _neon ); + INIT7( satd, _neon ); + INIT7( satd_x3, _neon ); + INIT7( satd_x4, _neon ); + INIT4( hadamard_ac, _neon ); + + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; + pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; + + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; + + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon; + pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_neon; + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_neon; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_neon; + pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_neon; + pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_neon; + pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon; + pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon; + + pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; + pixf->ssim_end4 = x264_pixel_ssim_end4_neon; + } +#endif // ARCH_AARCH64 + #endif // HIGH_BIT_DEPTH #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) @@ -1401,13 +1451,6 @@ x264_pixel_altivec_init( pixf ); } #endif -#if !HIGH_BIT_DEPTH -#if ARCH_UltraSPARC - INIT4( sad, _vis ); - INIT4( sad_x3, _vis ); - INIT4( sad_x4, _vis ); -#endif -#endif // !HIGH_BIT_DEPTH pixf->ads[PIXEL_8x16] = pixf->ads[PIXEL_8x4] =
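The var2 hunk in pixel.c above drops the abs() on the accumulated sum and keeps the accumulators as plain int, squaring through int64_t instead; since (-s)*(-s) equals s*s the result is unchanged while one step disappears. A minimal scalar sketch of the 8x8 kernel after the patch, assuming the elided inner loop accumulates per-pixel differences as in the C reference:

#include <stdint.h>

typedef uint8_t pixel;

/* Scalar sketch of the patched 8x8 var2 kernel: signed accumulators and a
 * 64-bit intermediate for sum*sum, so the abs(sum) step is no longer needed. */
static int var2_8x8_sketch( const pixel *pix1, intptr_t stride1,
                            const pixel *pix2, intptr_t stride2, int *ssd )
{
    int sum = 0, sqr = 0;
    for( int y = 0; y < 8; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            int d = pix1[x] - pix2[x];   /* assumed per-pixel difference */
            sum += d;
            sqr += d * d;
        }
        pix1 += stride1;
        pix2 += stride2;
    }
    *ssd = sqr;
    return sqr - (int)(((int64_t)sum * sum) >> 6);   /* shift = 6 for the 8x8 case */
}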
View file
x264-snapshot-20140321-2245.tar.bz2/common/pixel.h -> x264-snapshot-20141104-2245.tar.bz2/common/pixel.h
Changed
@@ -4,7 +4,7 @@ * Copyright (C) 2004-2014 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * Henrik Gramner <henrik@gramner.com> * * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/predict.c -> x264-snapshot-20141104-2245.tar.bz2/common/predict.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * Henrik Gramner <henrik@gramner.com> * * This program is free software; you can redistribute it and/or modify @@ -40,6 +40,9 @@ #if ARCH_ARM # include "arm/predict.h" #endif +#if ARCH_AARCH64 +# include "aarch64/predict.h" +#endif /**************************************************************************** * 16x16 prediction for intra luma block @@ -899,6 +902,10 @@ #if HAVE_ARMV6 x264_predict_16x16_init_arm( cpu, pf ); #endif + +#if ARCH_AARCH64 + x264_predict_16x16_init_aarch64( cpu, pf ); +#endif } void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] ) @@ -923,6 +930,10 @@ #if HAVE_ARMV6 x264_predict_8x8c_init_arm( cpu, pf ); #endif + +#if ARCH_AARCH64 + x264_predict_8x8c_init_aarch64( cpu, pf ); +#endif } void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] ) @@ -963,6 +974,10 @@ #if HAVE_ARMV6 x264_predict_8x8_init_arm( cpu, pf, predict_filter ); #endif + +#if ARCH_AARCH64 + x264_predict_8x8_init_aarch64( cpu, pf, predict_filter ); +#endif } void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] ) @@ -987,5 +1002,9 @@ #if HAVE_ARMV6 x264_predict_4x4_init_arm( cpu, pf ); #endif + +#if ARCH_AARCH64 + x264_predict_4x4_init_aarch64( cpu, pf ); +#endif }
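The predict.c hunks extend x264's usual initialization pattern to AArch64: the generic init fills the function-pointer table with portable C predictors, and each architecture's init then overwrites the entries it has assembly for, gated on runtime CPU flags. A hypothetical, self-contained sketch of that pattern (the stub predictors and the CPU_NEON constant stand in for the real symbols):

#include <stdint.h>

typedef void (*predict_fn)( uint8_t *src );

/* Stubs standing in for the real C and NEON implementations. */
static void predict_dc_c( uint8_t *src )    { (void)src; }
static void predict_dc_neon( uint8_t *src ) { (void)src; }

#define CPU_NEON 0x0001   /* stand-in for X264_CPU_NEON */

static void predict_init_sketch( int cpu, predict_fn pf[7] )
{
    for( int i = 0; i < 7; i++ )
        pf[i] = predict_dc_c;              /* portable fallback first */
#if defined(__aarch64__)
    if( cpu & CPU_NEON )
        pf[0] = predict_dc_neon;           /* arch init overrides what it can */
#endif
}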
View file
x264-snapshot-20140321-2245.tar.bz2/common/quant.c -> x264-snapshot-20141104-2245.tar.bz2/common/quant.c
Changed
@@ -4,7 +4,7 @@ * Copyright (C) 2005-2014 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * Christian Heine <sennindemokrit@gmx.net> * Henrik Gramner <henrik@gramner.com> * @@ -37,6 +37,9 @@ #if ARCH_ARM # include "arm/quant.h" #endif +#if ARCH_AARCH64 +# include "aarch64/quant.h" +#endif #define QUANT_ONE( coef, mf, f ) \ { \ @@ -556,7 +559,6 @@ { #if ARCH_X86 pf->quant_4x4 = x264_quant_4x4_mmx; - pf->quant_4x4x4 = x264_quant_4x4x4_mmx; pf->quant_8x8 = x264_quant_8x8_mmx; pf->dequant_4x4 = x264_dequant_4x4_mmx; pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2; @@ -725,8 +727,12 @@ #if HAVE_ARMV6 if( cpu&X264_CPU_ARMV6 ) + { pf->coeff_last4 = x264_coeff_last4_arm; - + pf->coeff_last8 = x264_coeff_last8_arm; + } +#endif +#if HAVE_ARMV6 || ARCH_AARCH64 if( cpu&X264_CPU_NEON ) { pf->quant_2x2_dc = x264_quant_2x2_dc_neon; @@ -742,6 +748,13 @@ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon; } #endif +#if ARCH_AARCH64 + if( cpu&X264_CPU_ARMV8 ) + { + pf->coeff_last4 = x264_coeff_last4_aarch64; + pf->coeff_last8 = x264_coeff_last8_aarch64; + } +#endif #endif // HIGH_BIT_DEPTH pf->coeff_last[DCT_LUMA_DC] = pf->coeff_last[DCT_CHROMAU_DC] = pf->coeff_last[DCT_CHROMAV_DC] = pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4];
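The new AArch64 branch in quant.c routes coeff_last4/coeff_last8 to dedicated assembly. These helpers report the position of the last nonzero coefficient in a scanned block, which the entropy coders use to know where to stop. A simplified scalar sketch of that semantic (not the exact x264 reference; dctcoef is assumed to be int16_t here):

#include <stdint.h>

typedef int16_t dctcoef;

/* Scan backwards and return the index of the last nonzero coefficient;
 * -1 means the block is entirely zero. */
static int coeff_last_sketch( const dctcoef *coeffs, int count )
{
    int i = count - 1;
    while( i >= 0 && !coeffs[i] )
        i--;
    return i;
}

In this sketch, coeff_last4 and coeff_last8 would simply be the count = 4 and count = 8 cases.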
View file
x264-snapshot-20140321-2245.tar.bz2/common/quant.h -> x264-snapshot-20141104-2245.tar.bz2/common/quant.h
Changed
@@ -4,7 +4,7 @@ * Copyright (C) 2005-2014 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/rectangle.c -> x264-snapshot-20141104-2245.tar.bz2/common/rectangle.c
Changed
@@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2010-2014 x264 project * - * Authors: Jason Garrett-Glaser <darkshikari@gmail.com> + * Authors: Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/rectangle.h -> x264-snapshot-20141104-2245.tar.bz2/common/rectangle.h
Changed
@@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2003-2014 x264 project * - * Authors: Jason Garrett-Glaser <darkshikari@gmail.com> + * Authors: Fiona Glaser <fiona@x264.com> * Loren Merritt <lorenm@u.washington.edu> * * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/vlc.c -> x264-snapshot-20141104-2245.tar.bz2/common/vlc.c
Changed
@@ -4,7 +4,7 @@ * Copyright (C) 2003-2014 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * Henrik Gramner <henrik@gramner.com> * * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/bitstream-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/bitstream-a.asm
Changed
@@ -3,7 +3,7 @@ ;***************************************************************************** ;* Copyright (C) 2010-2014 x264 project ;* -;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com> +;* Authors: Fiona Glaser <fiona@x264.com> ;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/cabac-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/cabac-a.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2008-2014 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Fiona Glaser <fiona@x264.com> ;* Holger Lubitz <holger@lubitz.org> ;* ;* This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/const-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/const-a.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2010-2014 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Fiona Glaser <fiona@x264.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/cpu-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/cpu-a.asm
Changed
@@ -5,7 +5,7 @@ ;* ;* Authors: Laurent Aimar <fenrir@via.ecp.fr> ;* Loren Merritt <lorenm@u.washington.edu> -;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Fiona Glaser <fiona@x264.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/dct-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/dct-a.asm
Changed
@@ -7,7 +7,7 @@ ;* Loren Merritt <lorenm@u.washington.edu> ;* Laurent Aimar <fenrir@via.ecp.fr> ;* Min Chen <chenm001.163.com> -;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Fiona Glaser <fiona@x264.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/dct.h -> x264-snapshot-20141104-2245.tar.bz2/common/x86/dct.h
Changed
@@ -5,7 +5,7 @@ * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/deblock-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/deblock-a.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2005-2014 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Fiona Glaser <fiona@x264.com> ;* Oskar Arvidsson <oskar@irock.se> ;* ;* This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/mc-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/mc-a.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2003-2014 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Fiona Glaser <fiona@x264.com> ;* Laurent Aimar <fenrir@via.ecp.fr> ;* Dylan Yudaken <dyudaken@gmail.com> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/mc-a2.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/mc-a2.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2005-2014 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Fiona Glaser <fiona@x264.com> ;* Holger Lubitz <holger@lubitz.org> ;* Mathieu Monnier <manao@melix.net> ;* Oskar Arvidsson <oskar@irock.se>
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/mc-c.c -> x264-snapshot-20141104-2245.tar.bz2/common/x86/mc-c.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/pixel-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/pixel-a.asm
Changed
@@ -7,7 +7,7 @@ ;* Holger Lubitz <holger@lubitz.org> ;* Laurent Aimar <fenrir@via.ecp.fr> ;* Alex Izvorski <aizvorksi@gmail.com> -;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Fiona Glaser <fiona@x264.com> ;* Oskar Arvidsson <oskar@irock.se> ;* ;* This program is free software; you can redistribute it and/or modify @@ -561,10 +561,15 @@ pshufhw m0, m0, q3120 pshufhw m1, m1, q3120 %endif +%if cpuflag(xop) + pmadcswd m2, m0, m0, m2 + pmadcswd m3, m1, m1, m3 +%else pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 +%endif add r6, 2*mmsize jl .loopx %if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled @@ -657,10 +662,15 @@ por m0, m1 psrlw m2, m0, 8 pand m0, m5 +%if cpuflag(xop) + pmadcswd m4, m2, m2, m4 + pmadcswd m3, m0, m0, m3 +%else pmaddwd m2, m2 pmaddwd m0, m0 - paddd m3, m0 paddd m4, m2 + paddd m3, m0 +%endif add r6, mmsize jl .loopx %if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled @@ -695,6 +705,8 @@ SSD_NV12 INIT_XMM avx SSD_NV12 +INIT_XMM xop +SSD_NV12 INIT_YMM avx2 SSD_NV12 @@ -4677,12 +4689,13 @@ ;----------------------------------------------------------------------------- ; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width ) ;----------------------------------------------------------------------------- -cglobal pixel_ssim_end4, 3,3,7 - movdqa m0, [r0+ 0] - movdqa m1, [r0+16] - movdqa m2, [r0+32] - movdqa m3, [r0+48] - movdqa m4, [r0+64] +cglobal pixel_ssim_end4, 2,3 + mov r2d, r2m + mova m0, [r0+ 0] + mova m1, [r0+16] + mova m2, [r0+32] + mova m3, [r0+48] + mova m4, [r0+64] paddd m0, [r1+ 0] paddd m1, [r1+16] paddd m2, [r1+32] @@ -4692,8 +4705,6 @@ paddd m1, m2 paddd m2, m3 paddd m3, m4 - movdqa m5, [ssim_c1] - movdqa m6, [ssim_c2] TRANSPOSE4x4D 0, 1, 2, 3, 4 ; s1=m0, s2=m1, ss=m2, s12=m3 @@ -4702,20 +4713,21 @@ cvtdq2ps m1, m1 cvtdq2ps m2, m2 cvtdq2ps m3, m3 + mulps m4, m0, m1 ; s1*s2 + mulps m0, m0 ; s1*s1 + mulps m1, m1 ; s2*s2 mulps m2, [pf_64] ; ss*64 mulps m3, [pf_128] ; s12*128 - movdqa m4, m1 - mulps m4, m0 ; s1*s2 - mulps m1, m1 ; s2*s2 - mulps m0, m0 ; s1*s1 addps m4, m4 ; s1*s2*2 addps m0, m1 ; s1*s1 + s2*s2 subps m2, m0 ; vars subps m3, m4 ; covar*2 - addps m4, m5 ; s1*s2*2 + ssim_c1 - addps m0, m5 ; s1*s1 + s2*s2 + ssim_c1 - addps m2, m6 ; vars + ssim_c2 - addps m3, m6 ; covar*2 + ssim_c2 + movaps m1, [ssim_c1] + addps m4, m1 ; s1*s2*2 + ssim_c1 + addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1 + movaps m1, [ssim_c2] + addps m2, m1 ; vars + ssim_c2 + addps m3, m1 ; covar*2 + ssim_c2 %else pmaddwd m4, m1, m0 ; s1*s2 pslld m1, 16 @@ -4726,10 +4738,12 @@ pslld m2, 6 psubd m3, m4 ; covar*2 psubd m2, m0 ; vars - paddd m0, m5 - paddd m4, m5 - paddd m3, m6 - paddd m2, m6 + mova m1, [ssim_c1] + paddd m0, m1 + paddd m4, m1 + mova m1, [ssim_c2] + paddd m3, m1 + paddd m2, m1 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1) cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1) cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2) @@ -4742,20 +4756,31 @@ cmp r2d, 4 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level neg r2 + %ifdef PIC lea r3, [mask_ff + 16] - movdqu m1, [r3 + r2*4] + %xdefine %%mask r3 %else - movdqu m1, [mask_ff + r2*4 + 16] + %xdefine %%mask mask_ff + 16 %endif - pand m4, m1 +%if cpuflag(avx) + andps m4, [%%mask + r2*4] +%else + movups m0, [%%mask + r2*4] + andps m4, m0 +%endif + .skip: movhlps m0, m4 addps m0, m4 +%if cpuflag(ssse3) + movshdup m4, m0 +%else pshuflw m4, m0, q0032 +%endif addss m0, m4 %if ARCH_X86_64 == 0 - movd r0m, m0 + movss r0m, m0 fld dword r0m 
%endif RET
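In the ssd_nv12 hunks of pixel-a.asm above (the xop SSD_NV12 instantiation is added right below them), the XOP path replaces each pmaddwd + paddd pair with a single pmadcswd, fusing the horizontal multiply-add with the accumulation. A scalar sketch of one 32-bit lane of the two forms, ignoring any saturation semantics of the real XOP instruction:

#include <stdint.h>

/* One dword lane of pmaddwd: two signed 16-bit products summed into 32 bits. */
static int32_t pmaddwd_lane( int16_t a0, int16_t a1, int16_t b0, int16_t b1 )
{
    return (int32_t)a0 * b0 + (int32_t)a1 * b1;
}

/* The fused XOP form also folds in the accumulator, replacing the separate
 * pmaddwd + paddd pair used on CPUs without XOP. */
static int32_t pmadcswd_lane( int16_t a0, int16_t a1,
                              int16_t b0, int16_t b1, int32_t acc )
{
    return acc + pmaddwd_lane( a0, a1, b0, b1 );
}

In the SSD kernels both operands are the same vector, so each lane accumulates a sum of squares.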
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/pixel.h -> x264-snapshot-20141104-2245.tar.bz2/common/x86/pixel.h
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -56,6 +56,7 @@ DECL_X4( sad, sse2 ) DECL_X4( sad, sse3 ) DECL_X4( sad, ssse3 ) +DECL_X4( sad, xop ) DECL_X4( sad, avx ) DECL_X4( sad, avx2 ) DECL_X1( ssd, mmx ) @@ -153,6 +154,9 @@ void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); +void x264_pixel_ssd_nv12_core_xop ( pixel *pixuv1, intptr_t stride1, + pixel *pixuv2, intptr_t stride2, int width, + int height, uint64_t *ssd_u, uint64_t *ssd_v ); void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v );
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/predict-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/predict-a.asm
Changed
@@ -5,7 +5,7 @@ ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org> -;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Fiona Glaser <fiona@x264.com> ;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/predict-c.c -> x264-snapshot-20141104-2245.tar.bz2/common/x86/predict-c.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/quant-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/quant-a.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2005-2014 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Fiona Glaser <fiona@x264.com> ;* Christian Heine <sennindemokrit@gmx.net> ;* Oskar Arvidsson <oskar@irock.se> ;* Henrik Gramner <henrik@gramner.com> @@ -292,14 +292,11 @@ QUANT_4x4 0, 6 QUANT_4x4 64, 7 packssdw m6, m7 - packssdw m5, m6 - packssdw m5, m5 ; AA BB CC DD - packsswb m5, m5 ; A B C D + packssdw m5, m6 ; AAAA BBBB CCCC DDDD pxor m4, m4 - pcmpeqb m5, m4 - pmovmskb eax, m5 - not eax - and eax, 0xf + pcmpeqd m5, m4 + movmskps eax, m5 + xor eax, 0xf RET %endmacro @@ -444,16 +441,11 @@ QUANT_4x4 64, 5 QUANT_4x4 96, 6 packssdw m5, m6 - packssdw m4, m5 -%if mmsize == 16 - packssdw m4, m4 ; AA BB CC DD -%endif - packsswb m4, m4 ; A B C D + packssdw m4, m5 ; AAAA BBBB CCCC DDDD pxor m3, m3 - pcmpeqb m4, m3 - pmovmskb eax, m4 - not eax - and eax, 0xf + pcmpeqd m4, m3 + movmskps eax, m4 + xor eax, 0xf RET %endmacro @@ -464,7 +456,6 @@ INIT_MMX mmx QUANT_AC quant_4x4, 4 QUANT_AC quant_8x8, 16 -QUANT_4x4x4 %endif INIT_XMM sse2
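The reworked tail of quant_4x4x4 in quant-a.asm packs the per-block nonzero accumulators only down to dwords and derives the return value with pcmpeqd/movmskps/xor 0xf, instead of packing all the way to bytes and using pmovmskb. Either way the function returns a 4-bit mask of which 4x4 sub-blocks still contain nonzero coefficients after quantization; a scalar sketch of that return value (the quantization step itself omitted, dctcoef assumed int16_t):

#include <stdint.h>

typedef int16_t dctcoef;

/* Bit n of the result is set when sub-block n has any nonzero coefficient,
 * i.e. the complement of the "all lanes equal zero" compare mask in the asm. */
static int nonzero_mask_4x4x4( const dctcoef dct[4][16] )
{
    int mask = 0;
    for( int b = 0; b < 4; b++ )
    {
        int nz = 0;
        for( int i = 0; i < 16; i++ )
            nz |= dct[b][i];
        mask |= (nz != 0) << b;
    }
    return mask;
}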
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/quant.h -> x264-snapshot-20141104-2245.tar.bz2/common/x86/quant.h
Changed
@@ -4,7 +4,7 @@ * Copyright (C) 2005-2014 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * Christian Heine <sennindemokrit@gmx.net> * * This program is free software; you can redistribute it and/or modify @@ -31,7 +31,6 @@ int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias ); int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); -int x264_quant_4x4x4_mmx( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/sad-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/sad-a.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2003-2014 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Fiona Glaser <fiona@x264.com> ;* Laurent Aimar <fenrir@via.ecp.fr> ;* Alex Izvorski <aizvorksi@gmail.com> ;*
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/sad16-a.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/sad16-a.asm
Changed
@@ -519,6 +519,19 @@ SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 +INIT_XMM xop +%define XMM_REGS 7 +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +SAD_X 3, 8, 16 +SAD_X 3, 8, 8 +SAD_X 3, 8, 4 +%define XMM_REGS 9 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 +SAD_X 4, 8, 16 +SAD_X 4, 8, 8 +SAD_X 4, 8, 4 INIT_YMM avx2 %define XMM_REGS 7 SAD_X 3, 16, 16 @@ -533,7 +546,12 @@ %macro INTRA_SAD_X3_4x4 0 cglobal intra_sad_x3_4x4, 3,3,7 +%if cpuflag(ssse3) movddup m0, [r1-1*FDEC_STRIDEB] +%else + movq m0, [r1-1*FDEC_STRIDEB] + punpcklqdq m0, m0 +%endif movq m1, [r0+0*FENC_STRIDEB] movq m2, [r0+2*FENC_STRIDEB] pshuflw m6, m0, q1032
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/util.h -> x264-snapshot-20141104-2245.tar.bz2/common/x86/util.h
Changed
@@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2008-2014 x264 project * - * Authors: Jason Garrett-Glaser <darkshikari@gmail.com> + * Authors: Fiona Glaser <fiona@x264.com> * Loren Merritt <lorenm@u.washington.edu> * * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/x86inc.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/x86inc.asm
Changed
@@ -5,7 +5,7 @@ ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Anton Mitrofanov <BugMaster@narod.ru> -;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Fiona Glaser <fiona@x264.com> ;* Henrik Gramner <henrik@gramner.com> ;* ;* Permission to use, copy, modify, and/or distribute this software for any @@ -90,9 +90,6 @@ default rel %endif -; Always use long nops (reduces 0x90 spam in disassembly on x86_32) -CPU amdnop - ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that @@ -756,19 +753,26 @@ %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) -; Takes up to 2 cpuflags from the above list. +; Takes an arbitrary number of cpuflags from the above list. ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. -%macro INIT_CPUFLAGS 0-2 - CPU amdnop +%macro INIT_CPUFLAGS 0-* + %xdefine SUFFIX + %undef cpuname + %assign cpuflags 0 + %if %0 >= 1 - %xdefine cpuname %1 - %assign cpuflags cpuflags_%1 - %if %0 >= 2 - %xdefine cpuname %1_%2 - %assign cpuflags cpuflags | cpuflags_%2 - %endif + %rep %0 + %ifdef cpuname + %xdefine cpuname cpuname %+ _%1 + %else + %xdefine cpuname %1 + %endif + %assign cpuflags cpuflags | cpuflags_%1 + %rotate 1 + %endrep %xdefine SUFFIX _ %+ cpuname + %if cpuflag(avx) %assign avx_enabled 1 %endif @@ -779,16 +783,15 @@ %endif %if cpuflag(aligned) %define movu mova - %elifidn %1, sse3 + %elif cpuflag(sse3) && notcpuflag(ssse3) %define movu lddqu %endif - %if ARCH_X86_64 == 0 && notcpuflag(sse2) - CPU basicnop - %endif + %endif + + %if ARCH_X86_64 || cpuflag(sse2) + CPU amdnop %else - %xdefine SUFFIX - %undef cpuname - %undef cpuflags + CPU basicnop %endif %endmacro
View file
x264-snapshot-20140321-2245.tar.bz2/common/x86/x86util.asm -> x264-snapshot-20141104-2245.tar.bz2/common/x86/x86util.asm
Changed
@@ -298,11 +298,16 @@ paddd %1, %2 %endif %if mmsize >= 16 +%if cpuflag(xop) && sizeof%1 == 16 + vphadddq %1, %1 +%endif movhlps %2, %1 paddd %1, %2 %endif +%if notcpuflag(xop) || sizeof%1 != 16 PSHUFLW %2, %1, q0032 paddd %1, %2 +%endif %undef %1 %undef %2 %endmacro
View file
x264-snapshot-20140321-2245.tar.bz2/configure -> x264-snapshot-20141104-2245.tar.bz2/configure
Changed
@@ -73,32 +73,36 @@ echo "$1" >> config.log } -intel_cflags() { - # Intel Compiler issues an incredibly large number of warnings on any warning level, +cc_cflags() { + # several non gcc compilers issue an incredibly large number of warnings on any warning level, # suppress them by disabling all warnings rather than having to use #pragmas to disable most of them for arg in $*; do [ $arg = -ffast-math ] && arg= [[ "$arg" = -falign-loops* ]] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Wshadow ] && arg= + [ "$arg" = -Wno-maybe-uninitialized ] && arg= [[ "$arg" = -mpreferred-stack-boundary* ]] && arg= [[ "$arg" = -l* ]] && arg= [[ "$arg" = -L* ]] && arg= - if [ $compiler = ICL ]; then + if [ $compiler_style = MS ]; then [ "$arg" = -Wall ] && arg=-W0 + [ "$arg" = -Werror ] && arg="-W3 -WX" [ "$arg" = -g ] && arg=-Z7 [ "$arg" = -fomit-frame-pointer ] && arg= [ "$arg" = -s ] && arg= [ "$arg" = -fPIC ] && arg= else [ "$arg" = -Wall ] && arg=-w0 + [ "$arg" = -Werror ] && arg="-w3 -Werror" fi + [ $compiler = CL -a "$arg" = -O3 ] && arg=-O2 [ -n "$arg" ] && echo -n "$arg " done } -icl_ldflags() { +cl_ldflags() { for arg in $*; do arg=${arg/LIBPATH/libpath} [ ${arg#-libpath:} == $arg -a ${arg#-l} != $arg ] && arg=${arg#-l}.lib @@ -106,6 +110,11 @@ [ $arg = -Wl,--large-address-aware ] && arg=-largeaddressaware [ $arg = -s ] && arg= [ "$arg" = -Wl,-Bsymbolic ] && arg= + [ "$arg" = -fno-tree-vectorize ] && arg= + [ "$arg" = -Werror ] && arg= + [ "$arg" = -Wshadow ] && arg= + [ "$arg" = -Wmaybe-uninitialized ] && arg= + [[ "$arg" = -Qdiag-error* ]] && arg= arg=${arg/pthreadGC/pthreadVC} [ "$arg" = avifil32.lib ] && arg=vfw32.lib @@ -135,11 +144,11 @@ fi rm -f conftest.c [ -n "$1" ] && echo "#include <$1>" > conftest.c - echo "int main () { $3 return 0; }" >> conftest.c - if [ $compiler = ICL ]; then - cc_cmd="$CC conftest.c $(intel_cflags $CFLAGS $2) -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" + echo "int main (void) { $3 return 0; }" >> conftest.c + if [ $compiler_style = MS ]; then + cc_cmd="$CC conftest.c $(cc_cflags $CFLAGS $CHECK_CFLAGS $2) -link $(cl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" else - cc_cmd="$CC conftest.c $CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest" + cc_cmd="$CC conftest.c $CFLAGS $CHECK_CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest" fi if $cc_cmd >conftest.log 2>&1; then res=$? @@ -165,8 +174,12 @@ rm -f conftest.c [ -n "$1" ] && echo "#include <$1>" > conftest.c echo -e "#if !($3) \n#error $4 \n#endif " >> conftest.c - - if $CC conftest.c $CFLAGS $2 -E -o conftest >conftest.log 2>&1; then + if [ $compiler_style = MS ]; then + cpp_cmd="$CC conftest.c $(cc_cflags $CFLAGS $2) -P" + else + cpp_cmd="$CC conftest.c $CFLAGS $2 -E -o conftest" + fi + if $cpp_cmd >conftest.log 2>&1; then res=$? log_ok else @@ -185,8 +198,9 @@ as_check() { log_check "whether $AS supports $1" - echo "$1" > conftest.asm - if $AS conftest.asm $ASFLAGS $2 -o conftest.o >conftest.log 2>&1; then + echo "$1" > conftest$AS_EXT + as_cmd="$AS conftest$AS_EXT $ASFLAGS $2 -o conftest.o" + if $as_cmd >conftest.log 2>&1; then res=$? 
log_ok else @@ -194,12 +208,12 @@ log_fail log_msg "Failed commandline was:" log_msg "--------------------------------------------------" - log_msg "$AS conftest.asm $ASFLAGS $2 -o conftest.o" + log_msg "$as_cmd" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" - cat conftest.asm >> config.log + cat conftest$AS_EXT >> config.log log_msg "--------------------------------------------------" fi return $res @@ -208,10 +222,10 @@ rc_check() { log_check "whether $RC works" echo "$1" > conftest.rc - if [ $compiler = ICL ]; then - rc_cmd="$RC $RCFLAGS -foconftest.o conftest.rc" - else + if [ $compiler = GNU ]; then rc_cmd="$RC $RCFLAGS -o conftest.o conftest.rc" + else + rc_cmd="$RC $RCFLAGS -foconftest.o conftest.rc" fi if $rc_cmd >conftest.log 2>&1; then res=$? @@ -278,21 +292,26 @@ bit_depth="8" chroma_format="all" compiler="GNU" +compiler_style="GNU" opencl="yes" CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)" LDFLAGS="$LDFLAGS" LDFLAGSCLI="$LDFLAGSCLI" -ASFLAGS="$ASFLAGS" +ASFLAGS="$ASFLAGS -I. -I\$(SRCPATH)" RCFLAGS="$RCFLAGS" +CHECK_CFLAGS="" HAVE_GETOPT_LONG=1 cross_prefix="" EXE="" +AS_EXT=".S" +NL=" +" # list of all preprocessor HAVE values we can define CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \ - LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH" + LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC" # parse options @@ -439,23 +458,44 @@ host_vendor="${host%%-*}" host_os="${host#*-}" -# test for use of Intel Compiler +# test for use of compilers that require specific handling +cc_base=`basename "$CC"` +QPRE="-" if [[ $host_os = mingw* || $host_os = cygwin* ]]; then - if [[ `basename "$CC"` = icl* ]]; then + if [[ "$cc_base" = icl || "$cc_base" = icl[\ .]* ]]; then # Windows Intel Compiler creates dependency generation with absolute Windows paths, Cygwin's make does not support Windows paths. [[ $host_os = cygwin* ]] && die "Windows Intel Compiler support requires MSYS" compiler=ICL + compiler_style=MS CFLAGS="$CFLAGS -Qstd=c99 -nologo -Qms0 -DHAVE_STRING_H -I\$(SRCPATH)/extras" QPRE="-Q" `$CC 2>&1 | grep -q IA-32` && host_cpu=i486 `$CC 2>&1 | grep -q "Intel(R) 64"` && host_cpu=x86_64 cpp_check "" "" "_MSC_VER >= 1400" || die "Windows Intel Compiler support requires Visual Studio 2005 or newer" + if cc_check '' -Qdiag-error:10006,10157 ; then + CHECK_CFLAGS="$CHECK_CFLAGS -Qdiag-error:10006,10157" + fi + elif [[ "$cc_base" = cl || "$cc_base" = cl[\ .]* ]]; then + # Standard Microsoft Visual Studio + # Dependency creation includes absolute windows paths, Cygwin's make does not support Windows paths. 
+ [[ $host_os = cygwin* ]] && die "Microsoft Visual Studio support requires MSYS" + compiler=CL + compiler_style=MS + CFLAGS="$CFLAGS -nologo -DHAVE_STRING_H -I\$(SRCPATH)/extras" + `$CC 2>&1 | grep -q 'for x86'` && host_cpu=i486 + `$CC 2>&1 | grep -q 'for x64'` && host_cpu=x86_64 + cpp_check '' '' '_MSC_VER > 1800 || (_MSC_VER == 1800 && _MSC_FULL_VER >= 180030324)' || die "Microsoft Visual Studio support requires Visual Studio 2013 Update 2 or newer" fi else - if [[ `basename "$CC"` = icc* ]]; then + if [[ "$cc_base" = icc || "$cc_base" = icc[\ .]* ]]; then AR="xiar" compiler=ICC - QPRE="-" + fi +fi + +if [[ "$cc_base" = clang || "$cc_base" = clang[\ .]* ]]; then + if cc_check '' -Werror=unknown-warning-option ; then + CHECK_CFLAGS="$CHECK_CFLAGS -Werror=unknown-warning-option" fi fi @@ -467,7 +507,6 @@ ;; darwin*) SYS="MACOSX" - CFLAGS="$CFLAGS -falign-loops=16" libm="-lm" if [ "$pic" = "no" ]; then cc_check "" -mdynamic-no-pic && CFLAGS="$CFLAGS -mdynamic-no-pic" @@ -521,7 +560,7 @@ EXE=".exe" DEVNULL="NUL" LDFLAGSCLI="$LDFLAGSCLI -lshell32" - [ $compiler = ICL ] && RC="${RC-rc}" || RC="${RC-${cross_prefix}windres}" + [ $compiler = GNU ] && RC="${RC-${cross_prefix}windres}" || RC="${RC-rc}" ;; sunos*|solaris*) SYS="SunOS" @@ -562,7 +601,8 @@ i*86) ARCH="X86" AS="yasm" - ASFLAGS="$ASFLAGS -O2" + AS_EXT=".asm" + ASFLAGS="$ASFLAGS -O2 -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/" if [ $compiler = GNU ]; then if [[ "$asm" == auto && "$CFLAGS" != *-march* ]]; then CFLAGS="$CFLAGS -march=i686" @@ -572,7 +612,7 @@ fi CFLAGS="-m32 $CFLAGS" LDFLAGS="-m32 $LDFLAGS" - else + elif [ $compiler = ICC ]; then # icc on linux has various degrees of mod16 stack support if [ $SYS = LINUX ]; then # < 11 is completely incapable of keeping a mod16 stack @@ -584,8 +624,9 @@ fi # >= 12 defaults to a mod16 stack fi - # icl on windows has no mod16 stack support - [ $SYS = WINDOWS ] && stack_alignment=4 + else # ICL/CL + # always a mod4 stack + stack_alignment=4 fi if [ "$SYS" = MACOSX ]; then ASFLAGS="$ASFLAGS -f macho -DPREFIX" @@ -601,6 +642,8 @@ x86_64) ARCH="X86_64" AS="yasm" + AS_EXT=".asm" + ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/" [ $compiler = GNU ] && CFLAGS="-m64 $CFLAGS" && LDFLAGS="-m64 $LDFLAGS" if [ "$SYS" = MACOSX ]; then ASFLAGS="$ASFLAGS -f macho64 -m amd64 -DPIC -DPREFIX" @@ -622,7 +665,8 @@ ARCH="PPC" if [ $asm = auto ] ; then define HAVE_ALTIVEC - AS="${AS-${cross_prefix}gcc}" + AS="${AS-${CC}}" + AS_EXT=".c" if [ $SYS = MACOSX ] ; then CFLAGS="$CFLAGS -faltivec -fastf -mcpu=G4" else @@ -633,19 +677,6 @@ ;; sparc) ARCH="SPARC" - case $(uname -m) in - sun4u|sun4v) - if [ $asm = auto ]; then - ARCH="UltraSPARC" - if ! echo $CFLAGS | grep -Eq '\-mcpu' ; then - CFLAGS="$CFLAGS -mcpu=ultrasparc" - LDFLAGS="$LDFLAGS -mcpu=ultrasparc" - fi - AS="${AS-${cross_prefix}as}" - ASFLAGS="$ASFLAGS -xarch=v8plusa" - fi - ;; - esac ;; mips|mipsel|mips64|mips64el) ARCH="MIPS" @@ -661,7 +692,16 @@ LDFLAGS="$LDFLAGS -arch armv7" fi else - AS="${AS-${cross_prefix}gcc}" + AS="${AS-${CC}}" + fi + ;; + aarch64) + ARCH="AARCH64" + if [ "$SYS" = MACOSX ] ; then + AS="${AS-extras/gas-preprocessor.pl $CC}" + ASFLAGS="$ASFLAGS -DPREFIX" + else + AS="${AS-${CC}}" fi ;; s390|s390x) @@ -701,7 +741,7 @@ cc_check || die "No working C compiler found." 
-if [ $compiler != ICL ]; then +if [ $compiler_style = GNU ]; then if cc_check '' -std=gnu99 'for( int i = 0; i < 9; i++ );' ; then CFLAGS="$CFLAGS -std=gnu99" elif cc_check '' -std=c99 'for( int i = 0; i < 9; i++ );' ; then @@ -711,7 +751,7 @@ fi fi -if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" \) ] ; then +if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" -o $ARCH = "AARCH64" \) ] ; then pic="yes" fi @@ -723,14 +763,7 @@ echo "If you really want to compile without asm, configure with --disable-asm." exit 1 fi - if ! cc_check '' '' '__asm__("pabsw %xmm0, %xmm0");' ; then - VER=`(${cross_prefix}as --version || echo no gnu as) 2>/dev/null | head -n 1` - echo "Found $VER" - echo "Minimum version is binutils-2.17" - echo "Your compiler can't handle inline SSSE3 asm." - echo "If you really want to compile without asm, configure with --disable-asm." - exit 1 - fi + cc_check '' '' '__asm__("pabsw %xmm0, %xmm0");' && define HAVE_X86_INLINE_ASM ASFLAGS="$ASFLAGS -Worphan-labels" define HAVE_MMX if [ $compiler = GNU ] && cc_check '' -mpreferred-stack-boundary=5 ; then @@ -746,7 +779,7 @@ if cc_check '' '' '__asm__("rev ip, ip");' ; then define HAVE_ARMV6 cc_check '' '' '__asm__("movt r0, #0");' && define HAVE_ARMV6T2 cc_check '' '' '__asm__("vadd.i16 q0, q0, q0");' && define HAVE_NEON - ASFLAGS="$ASFLAGS $CFLAGS -c" + ASFLAGS="$ASFLAGS -c" else echo "You specified a pre-ARMv6 or Thumb-1 CPU in your CFLAGS." echo "If you really want to run on such a CPU, configure with --disable-asm." @@ -754,6 +787,24 @@ fi fi +if [ $asm = auto -a $ARCH = AARCH64 ] ; then + # set flags so neon is built by default + echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu|-arch)' || CFLAGS="$CFLAGS -arch arm64 -mfpu=neon" + + if cc_check '' '' '__asm__("cmeq v0.8h, v0.8h, #0");' ; then define HAVE_NEON + ASFLAGS="$ASFLAGS -c" + else + echo "no NEON support, try adding -mfpu=neon to CFLAGS" + echo "If you really want to run on such a CPU, configure with --disable-asm." + exit 1 + fi +fi + +if [ $asm = auto -a \( $ARCH = ARM -o $ARCH = AARCH64 \) ] ; then + # check if the assembler supports '.func' (clang 3.5 does not) + as_check ".func test${NL}.endfunc" && define HAVE_AS_FUNC 1 +fi + [ $asm = no ] && AS="" [ "x$AS" = x ] && asm="no" || asm="yes" @@ -763,7 +814,7 @@ define STACK_ALIGNMENT $stack_alignment ASFLAGS="$ASFLAGS -DSTACK_ALIGNMENT=$stack_alignment" -# skip endianness check for Intel Compiler, as all supported platforms are little. the -ipo flag will also cause the check to fail +# skip endianness check for Intel Compiler and MSVS, as all supported platforms are little. each have flags that will cause the check to fail as well if [ $compiler = GNU ]; then echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c $CC $CFLAGS conftest.c -c -o conftest.o 2>/dev/null || die "endian test failed" @@ -870,12 +921,12 @@ fi if [ -z "$LAVF_LIBS" -a -z "$LAVF_CFLAGS" ]; then LAVF_LIBS="-lavformat" - for lib in -lpostproc -lavcodec -lavcore -lswscale -lavutil -lm -lz -lbz2 $libpthread -lavifil32 -lws2_32; do + for lib in -lpostproc -lavcodec -lswscale -lavutil -lm -lz -lbz2 $libpthread -lavifil32 -lws2_32; do cc_check "" $lib && LAVF_LIBS="$LAVF_LIBS $lib" done fi LAVF_LIBS="-L. 
$LAVF_LIBS" - if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" "avformat_close_input(0);" ; then + if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" "av_frame_free(0);" ; then if [ "$swscale" = "yes" ]; then lavf="yes" else @@ -937,10 +988,10 @@ [ -z "$LSMASH_LIBS" ] && LSMASH_LIBS="-llsmash" if cc_check lsmash.h "$LSMASH_CFLAGS $LSMASH_LIBS" ; then - if cpp_check lsmash.h "$LSMASH_CFLAGS" "LSMASH_VERSION_MAJOR > 0 || (LSMASH_VERSION_MAJOR == 0 && LSMASH_VERSION_MINOR >= 1)" ; then + if cpp_check lsmash.h "$LSMASH_CFLAGS" "LSMASH_VERSION_MAJOR > 1 || (LSMASH_VERSION_MAJOR == 1 && LSMASH_VERSION_MINOR >= 5)" ; then lsmash="yes" else - echo "Warning: lsmash is too old, update to rev.751 or later" + echo "Warning: lsmash is too old, update to rev.895 or later" fi fi fi @@ -1005,16 +1056,11 @@ fi if [ "$strip" = "yes" ]; then - CFLAGS="$CFLAGS -s" LDFLAGS="$LDFLAGS -s" fi if [ "$debug" = "yes" ]; then CFLAGS="-O1 -g $CFLAGS" -elif [ $ARCH = ARM ]; then - # arm-gcc-4.2 produces incorrect output with -ffast-math - # and it doesn't save any speed anyway on 4.4, so disable it - CFLAGS="-O3 -fno-fast-math $CFLAGS" else CFLAGS="-O3 -ffast-math $CFLAGS" fi @@ -1043,6 +1089,10 @@ CFLAGS="-Wshadow $CFLAGS" fi +if cc_check '' -Wmaybe-uninitialized ; then + CFLAGS="-Wno-maybe-uninitialized $CFLAGS" +fi + if [ "$bit_depth" -gt "8" ]; then define HIGH_BIT_DEPTH ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1" @@ -1064,15 +1114,6 @@ libdl="" if [ "$opencl" = "yes" ]; then opencl="no" - log_check "for perl" - output=$(perl -v) - if [ "$output" = "" ]; then - log_fail - echo 'OpenCL support requires perl to compile.' - echo 'use --disable-opencl to compile without OpenCL.' - exit 1 - fi - log_ok # cygwin can use opencl if it can use LoadLibrary if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibraryW(0);") ; then opencl="yes" @@ -1090,17 +1131,18 @@ grep -q "HAVE_$var 1" config.h || define HAVE_$var 0 done -if [ $compiler = ICL ]; then - AR="xilib -nologo -out:" - DEPMM=-QMM - DEPMT=-QMT +DEPMM="${QPRE}MM" +DEPMT="${QPRE}MT" +if [ $compiler_style = MS ]; then + AR="lib -nologo -out:" + LD="link -out:" + [ $compiler = ICL ] && AR="xi$AR" && LD="xi$LD" HAVE_GETOPT_LONG=0 - LD="xilink -out:" - LDFLAGS="-nologo -incremental:no $(icl_ldflags $LDFLAGS)" - LDFLAGSCLI="$(icl_ldflags $LDFLAGSCLI)" + LDFLAGS="-nologo -incremental:no $(cl_ldflags $LDFLAGS)" + LDFLAGSCLI="$(cl_ldflags $LDFLAGSCLI)" LIBX264=libx264.lib RANLIB= - [ -n "$RC" ] && RCFLAGS="$RCFLAGS -I. -I\$(SRCPATH)/extras -fo" + [ -n "$RC" ] && RCFLAGS="$RCFLAGS -nologo -I. -I\$(SRCPATH)/extras -fo" STRIP= if [ $debug = yes ]; then LDFLAGS="-debug $LDFLAGS" @@ -1108,27 +1150,37 @@ else CFLAGS="-DNDEBUG $CFLAGS" fi -else +else # gcc/icc + DEPMM="$DEPMM -g0" AR="$AR rc " - DEPMM="-MM -g0" - DEPMT="-MT" LD="$CC -o " LIBX264=libx264.a [ -n "$RC" ] && RCFLAGS="$RCFLAGS -I. -o " fi -if [ $compiler = GNU ]; then - PROF_GEN_CC="-fprofile-generate" - PROF_GEN_LD="-fprofile-generate" - PROF_USE_CC="-fprofile-use" - PROF_USE_LD="-fprofile-use" -else - CFLAGS="$(intel_cflags $CFLAGS)" +[ $compiler != GNU ] && CFLAGS="$(cc_cflags $CFLAGS)" +if [ $compiler = ICC -o $compiler = ICL ]; then # icc does not define __SSE__ until SSE2 optimization and icl never defines it or _M_IX86_FP [ \( $ARCH = X86_64 -o $ARCH = X86 \) -a $asm = yes ] && ! cpp_check "" "" "defined(__SSE__)" && define __SSE__ PROF_GEN_CC="${QPRE}prof-gen ${QPRE}prof-dir." PROF_GEN_LD= PROF_USE_CC="${QPRE}prof-use ${QPRE}prof-dir." 
PROF_USE_LD= +elif [ $compiler = CL ]; then + # Visual Studio + # _M_IX86_FP is only defined on x86 + [ $ARCH = X86 ] && cpp_check '' '' '_M_IX86_FP >= 1' && define __SSE__ + [ $ARCH = X86_64 ] && define __SSE__ + # As long as the cli application can't link against the dll, the dll can not be pgo'd. + # pgds are link flag specific and the -dll flag for creating the dll makes it unshareable with the cli + PROF_GEN_CC="-GL" + PROF_GEN_LD="-LTCG:PGINSTRUMENT" + PROF_USE_CC="-GL" + PROF_USE_LD="-LTCG:PGOPTIMIZE" +else + PROF_GEN_CC="-fprofile-generate" + PROF_GEN_LD="-fprofile-generate" + PROF_USE_CC="-fprofile-use" + PROF_USE_LD="-fprofile-use" fi rm -f conftest* @@ -1157,6 +1209,8 @@ SYS=$SYS CC=$CC CFLAGS=$CFLAGS +COMPILER=$compiler +COMPILER_STYLE=$compiler_style DEPMM=$DEPMM DEPMT=$DEPMT LD=$LD @@ -1180,7 +1234,7 @@ HAVE_OPENCL=$opencl EOF -if [ $compiler = ICL ]; then +if [ $compiler_style = MS ]; then echo '%.o: %.c' >> config.mak echo ' $(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak fi @@ -1194,7 +1248,7 @@ API=$(grep '#define X264_BUILD' < ${SRCPATH}/x264.h | cut -f 3 -d ' ') if [ "$SYS" = "WINDOWS" -o "$SYS" = "CYGWIN" ]; then echo "SONAME=libx264-$API.dll" >> config.mak - if [ $compiler = ICL ]; then + if [ $compiler_style = MS ]; then echo 'IMPLIBNAME=libx264.dll.lib' >> config.mak # GNU ld on windows defaults to exporting all global functions if there are no explicit __declspec(dllexport) declarations # MSVC link does not act similarly, so it is required to make an export definition out of x264.h and use it at link time @@ -1297,7 +1351,7 @@ rm conftest.log [ "$SRCPATH" != "." ] && ln -sf ${SRCPATH}/Makefile ./Makefile -mkdir -p common/{arm,ppc,sparc,x86} encoder extras filters/video input output tools +mkdir -p common/{aarch64,arm,ppc,x86} encoder extras filters/video input output tools echo echo "You can run 'make' or 'make fprofiled' now."
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/analyse.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/analyse.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/cabac.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/cabac.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/cavlc.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/cavlc.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/encoder.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/encoder.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -97,11 +97,14 @@ int cw = h->param.i_width>>1; int ch = h->param.i_height>>CHROMA_V_SHIFT; pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) ); - pixel *planev = planeu + cw*ch + 16; - h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch ); - fwrite( planeu, 1, cw*ch*sizeof(pixel), f ); - fwrite( planev, 1, cw*ch*sizeof(pixel), f ); - x264_free( planeu ); + if( planeu ) + { + pixel *planev = planeu + cw*ch + 16; + h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch ); + fwrite( planeu, 1, cw*ch*sizeof(pixel), f ); + fwrite( planev, 1, cw*ch*sizeof(pixel), f ); + x264_free( planeu ); + } } fclose( f ); } @@ -412,6 +415,12 @@ static int x264_validate_parameters( x264_t *h, int b_open ) { + if( !h->param.pf_log ) + { + x264_log( NULL, X264_LOG_ERROR, "pf_log not set! did you forget to call x264_param_default?\n" ); + return -1; + } + #if HAVE_MMX if( b_open ) { @@ -818,6 +827,8 @@ /* 8x8dct is not useful without RD in CAVLC lossless */ if( !h->param.b_cabac && h->param.analyse.i_subpel_refine < 6 ) h->param.analyse.b_transform_8x8 = 0; + h->param.analyse.inter &= ~X264_ANALYSE_I8x8; + h->param.analyse.intra &= ~X264_ANALYSE_I8x8; } if( h->param.rc.i_rc_method == X264_RC_CQP ) { @@ -1403,7 +1414,11 @@ /* Init x264_t */ h->i_frame = -1; h->i_frame_num = 0; - h->i_idr_pic_id = 0; + + if( h->param.i_avcintra_class ) + h->i_idr_pic_id = 5; + else + h->i_idr_pic_id = 0; if( (uint64_t)h->param.i_timebase_den * 2 > UINT32_MAX ) { @@ -2154,6 +2169,31 @@ h->fref[1][h->i_ref[1]++] = h->frames.reference[i]; } + if( h->sh.i_mmco_remove_from_end ) + { + /* Order ref0 for MMCO remove */ + do + { + b_ok = 1; + for( int i = 0; i < h->i_ref[0] - 1; i++ ) + { + if( h->fref[0][i]->i_frame < h->fref[0][i+1]->i_frame ) + { + XCHG( x264_frame_t*, h->fref[0][i], h->fref[0][i+1] ); + b_ok = 0; + break; + } + } + } while( !b_ok ); + + for( int i = h->i_ref[0]-1; i >= h->i_ref[0] - h->sh.i_mmco_remove_from_end; i-- ) + { + int diff = h->i_frame_num - h->fref[0][i]->i_frame_num; + h->sh.mmco[h->sh.i_mmco_command_count].i_poc = h->fref[0][i]->i_poc; + h->sh.mmco[h->sh.i_mmco_command_count++].i_difference_of_pic_nums = diff; + } + } + /* Order reference lists by distance from the current frame. 
*/ for( int list = 0; list < 2; list++ ) { @@ -2176,14 +2216,6 @@ } while( !b_ok ); } - if( h->sh.i_mmco_remove_from_end ) - for( int i = h->i_ref[0]-1; i >= h->i_ref[0] - h->sh.i_mmco_remove_from_end; i-- ) - { - int diff = h->i_frame_num - h->fref[0][i]->i_frame_num; - h->sh.mmco[h->sh.i_mmco_command_count].i_poc = h->fref[0][i]->i_poc; - h->sh.mmco[h->sh.i_mmco_command_count++].i_difference_of_pic_nums = diff; - } - x264_reference_check_reorder( h ); h->i_ref[1] = X264_MIN( h->i_ref[1], h->frames.i_max_ref1 ); @@ -2438,7 +2470,24 @@ x264_slice_header_init( h, &h->sh, h->sps, h->pps, h->i_idr_pic_id, h->i_frame_num, i_global_qp ); /* alternate id */ - h->i_idr_pic_id ^= 1; + if( h->param.i_avcintra_class ) + { + switch( h->i_idr_pic_id ) + { + case 5: + h->i_idr_pic_id = 3; + break; + case 3: + h->i_idr_pic_id = 4; + break; + case 4: + default: + h->i_idr_pic_id = 5; + break; + } + } + else + h->i_idr_pic_id ^= 1; } else { @@ -3539,15 +3588,15 @@ return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; } + } - if( h->param.i_frame_packing >= 0 ) - { - x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); - x264_sei_frame_packing_write( h, &h->out.bs ); - if( x264_nal_end( h ) ) - return -1; - overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; - } + if( h->param.i_frame_packing >= 0 && (h->fenc->b_keyframe || h->param.i_frame_packing == 5) ) + { + x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); + x264_sei_frame_packing_write( h, &h->out.bs ); + if( x264_nal_end( h ) ) + return -1; + overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; } /* generate sei pic timing */
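Most of the encoder.c changes are hardening (the dump-yuv plane buffer is now checked for allocation failure, pf_log is validated before use), but the idr_pic_id change is behavioural: for AVC-Intra the identifier starts at 5 and then cycles 5 -> 3 -> 4 -> 5 on successive IDR frames rather than toggling between 0 and 1. A small sketch of that sequence as a pure function, mirroring the switch added above:

/* Next idr_pic_id given the current one. */
static int next_idr_pic_id( int avcintra_class, int cur )
{
    if( !avcintra_class )
        return cur ^ 1;        /* default behaviour: alternate 0/1 */
    switch( cur )
    {
        case 5:  return 3;
        case 3:  return 4;
        case 4:
        default: return 5;     /* wrap back to 5 */
    }
}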
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/macroblock.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/macroblock.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * Henrik Gramner <henrik@gramner.com> * * This program is free software; you can redistribute it and/or modify
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/me.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/me.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/ratecontrol.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/ratecontrol.c
Changed
@@ -6,7 +6,7 @@ * Authors: Loren Merritt <lorenm@u.washington.edu> * Michael Niedermayer <michaelni@gmx.at> * Gabriel Bouvigne <gabriel.bouvigne@joost.com> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * Måns Rullgård <mru@mru.ath.cx> * * This program is free software; you can redistribute it and/or modify @@ -158,7 +158,7 @@ double frame_size_maximum; /* Maximum frame size due to MinCR */ double frame_size_planned; double slice_size_planned; - predictor_t (*row_pred)[2]; + predictor_t *row_pred; predictor_t row_preds[3][2]; predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */ int bframes; /* # consecutive B-frames before this P-frame */ @@ -1418,7 +1418,7 @@ memset( h->fdec->i_row_bits, 0, h->mb.i_mb_height * sizeof(int) ); memset( h->fdec->f_row_qp, 0, h->mb.i_mb_height * sizeof(float) ); memset( h->fdec->f_row_qscale, 0, h->mb.i_mb_height * sizeof(float) ); - rc->row_pred = &rc->row_preds[h->sh.i_type]; + rc->row_pred = rc->row_preds[h->sh.i_type]; rc->buffer_rate = h->fenc->i_cpb_duration * rc->vbv_max_rate * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; update_vbv_plan( h, overhead ); @@ -1504,7 +1504,7 @@ /* average between two predictors: * absolute SATD, and scaled bit cost of the colocated row in the previous frame */ x264_ratecontrol_t *rc = h->rc; - float pred_s = predict_size( rc->row_pred[0], qscale, h->fdec->i_row_satd[y] ); + float pred_s = predict_size( &rc->row_pred[0], qscale, h->fdec->i_row_satd[y] ); if( h->sh.i_type == SLICE_TYPE_I || qscale >= h->fref[0][0]->f_row_qscale[y] ) { if( h->sh.i_type == SLICE_TYPE_P @@ -1522,7 +1522,7 @@ /* Our QP is lower than the reference! */ else { - float pred_intra = predict_size( rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y] ); + float pred_intra = predict_size( &rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y] ); /* Sum: better to overestimate than underestimate by using only one of the two predictors. */ return pred_intra + pred_s; } @@ -1570,9 +1570,9 @@ h->fdec->f_row_qp[y] = rc->qpm; h->fdec->f_row_qscale[y] = qscale; - update_predictor( rc->row_pred[0], qscale, h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] ); + update_predictor( &rc->row_pred[0], qscale, h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] ); if( h->sh.i_type == SLICE_TYPE_P && rc->qpm < h->fref[0][0]->f_row_qp[y] ) - update_predictor( rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] ); + update_predictor( &rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] ); /* update ratecontrol per-mbpair in MBAFF */ if( SLICE_MBAFF && !(y&1) ) @@ -2612,7 +2612,7 @@ x264_t *t = h->thread[i]; if( t != h ) memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) ); - t->rc->row_pred = &t->rc->row_preds[h->sh.i_type]; + t->rc->row_pred = t->rc->row_preds[h->sh.i_type]; /* Calculate the planned slice size. */ if( rc->b_vbv && rc->frame_size_planned ) {
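The ratecontrol change is a pointer-type cleanup: row_pred used to be declared as a pointer to an array of two predictors, so row_pred[0] named the whole pair and the call sites passed it directly; as a plain predictor_t pointer, the two predictors are now addressed as &row_pred[0] and &row_pred[1]. Both forms point at the same row_preds[slice_type] pair. A small illustration with a dummy predictor_t (the real struct has different fields):

typedef struct { float coeff, count, decay, offset; } predictor_t;   /* dummy stand-in */

static predictor_t row_preds[3][2];   /* one predictor pair per slice type */

static void pick_row_pred( int slice_type, predictor_t **size_pred, predictor_t **intra_pred )
{
    /* old form: predictor_t (*row_pred)[2] = &row_preds[slice_type];
     * there, row_pred[0] is an entire predictor_t[2], not a single element. */
    predictor_t *row_pred = row_preds[slice_type];   /* new form: decays to an element pointer */
    *size_pred  = &row_pred[0];   /* predictor fed with per-row SATD */
    *intra_pred = &row_pred[1];   /* predictor fed with per-row intra cost */
}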
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/rdo.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/rdo.c
Changed
@@ -4,7 +4,7 @@ * Copyright (C) 2005-2014 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/set.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/set.c
Changed
@@ -675,7 +675,9 @@ bs_write( &q, 4, 0 ); // frame1_grid_position_y } bs_write( &q, 8, 0 ); // frame_packing_arrangement_reserved_byte - bs_write_ue( &q, 1 ); // frame_packing_arrangement_repetition_period + // "frame_packing_arrangement_repetition_period equal to 1 specifies that the frame packing arrangement SEI message persists in output" + // for (i_frame_packing == 5) this will undermine current_frame_is_frame0_flag which must alternate every view sequence + bs_write_ue( &q, h->param.i_frame_packing != 5 ); // frame_packing_arrangement_repetition_period bs_write1( &q, 0 ); // frame_packing_arrangement_extension_flag bs_align_10( &q ); @@ -740,11 +742,15 @@ data[20] = 0x13; /* These bytes appear to be some sort of frame/seconds counter in certain applications, * but others jump around, so leave them as zero for now */ - data[21] = data[22] = 0; - + data[22] = data[23] = data[25] = data[26] = 0; data[28] = 0x14; + data[30] = data[31] = data[33] = data[34] = 0; data[36] = 0x60; data[41] = 0x22; /* Believed to be some sort of end of basic UMID identifier */ + data[60] = 0x62; + data[62] = data[63] = data[65] = data[66] = 0; + data[68] = 0x63; + data[70] = data[71] = data[73] = data[74] = 0; x264_sei_write( &h->out.bs, data, len, SEI_USER_DATA_UNREGISTERED );
View file
x264-snapshot-20140321-2245.tar.bz2/encoder/slicetype.c -> x264-snapshot-20141104-2245.tar.bz2/encoder/slicetype.c
Changed
@@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2005-2014 x264 project * - * Authors: Jason Garrett-Glaser <darkshikari@gmail.com> + * Authors: Fiona Glaser <fiona@x264.com> * Loren Merritt <lorenm@u.washington.edu> * Dylan Yudaken <dyudaken@gmail.com> *
View file
x264-snapshot-20140321-2245.tar.bz2/filters/filters.c -> x264-snapshot-20141104-2245.tar.bz2/filters/filters.c
Changed
@@ -38,13 +38,13 @@ if( sep_count == 0 ) { if( string[0] == '\0' ) - return calloc( 1, sizeof( char** ) ); - char **ret = calloc( 2, sizeof( char** ) ); + return calloc( 1, sizeof( char* ) ); + char **ret = calloc( 2, sizeof( char* ) ); ret[0] = strdup( string ); return ret; } - char **split = calloc( ( limit > 0 ? limit : sep_count ) + 2, sizeof(char**) ); + char **split = calloc( ( limit > 0 ? limit : sep_count ) + 2, sizeof(char*) ); int i = 0; char *str = strdup( string ); assert( str ); @@ -104,7 +104,7 @@ while( options[options_count] != NULL ) ++options_count; - char **opts = calloc( split_count * 2 + 2, sizeof( char ** ) ); + char **opts = calloc( split_count * 2 + 2, sizeof( char * ) ); char **arg = NULL; int opt = 0, found_named = 0, invalid = 0; for( int i = 0; split[i] != NULL; i++, invalid = 0 )
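The filters.c fix corrects the element size passed to calloc: the buffers hold char * pointers, so the size must be sizeof(char *), not sizeof(char **) (harmless on common ABIs where the two happen to be equal, but still wrong). The sizeof *ptr idiom sidesteps the mismatch entirely; a minimal sketch:

#include <stdlib.h>
#include <string.h>

/* Allocate a NULL-terminated list holding a single copied string. */
static char **single_string_list( const char *s )
{
    char **ret = calloc( 2, sizeof *ret );   /* element size derived from ret itself */
    if( ret )
        ret[0] = strdup( s );                /* ret[1] stays NULL as the terminator */
    return ret;
}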
View file
x264-snapshot-20140321-2245.tar.bz2/filters/video/select_every.c -> x264-snapshot-20141104-2245.tar.bz2/filters/video/select_every.c
Changed
@@ -51,7 +51,7 @@ printf( " apply a selection pattern to input frames\n" " step: the number of frames in the pattern\n" " offsets: the offset into the step to select a frame\n" - " see: http://avisynth.org/mediawiki/Select#SelectEvery\n" ); + " see: http://avisynth.nl/index.php/Select#SelectEvery\n" ); } static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string )
View file
x264-snapshot-20140321-2245.tar.bz2/input/avs.c -> x264-snapshot-20141104-2245.tar.bz2/input/avs.c
Changed
@@ -298,7 +298,10 @@ opt->input_range = opt->output_range; } const char *arg_name[] = { NULL, "interlaced", "matrix" }; - AVS_Value arg_arr[] = { res, avs_new_value_bool( info->interlaced ), avs_new_value_string( matrix ) }; + AVS_Value arg_arr[3]; + arg_arr[0] = res; + arg_arr[1] = avs_new_value_bool( info->interlaced ); + arg_arr[2] = avs_new_value_string( matrix ); AVS_Value res2 = h->func.avs_invoke( h->env, conv_func, avs_new_value_array( arg_arr, arg_count ), arg_name ); FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert input clip to %s\n", csp ) res = update_clip( h, &vi, res2, res ); @@ -308,7 +311,9 @@ { const char *levels = opt->output_range ? "TV->PC" : "PC->TV"; x264_cli_log( "avs", X264_LOG_WARNING, "performing %s conversion\n", levels ); - AVS_Value arg_arr[] = { res, avs_new_value_string( levels ) }; + AVS_Value arg_arr[2]; + arg_arr[0] = res; + arg_arr[1] = avs_new_value_string( levels ); const char *arg_name[] = { NULL, "levels" }; AVS_Value res2 = h->func.avs_invoke( h->env, "ColorYUV", avs_new_value_array( arg_arr, 2 ), arg_name ); FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert range: %s\n", avs_as_error( res2 ) )
View file
x264-snapshot-20140321-2245.tar.bz2/input/ffms.c -> x264-snapshot-20141104-2245.tar.bz2/input/ffms.c
Changed
@@ -177,8 +177,9 @@ static int picture_alloc( cli_pic_t *pic, int csp, int width, int height ) { - if( x264_cli_pic_alloc( pic, csp, width, height ) ) + if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) ) return -1; + pic->img.csp = csp; pic->img.planes = 4; return 0; }
View file
x264-snapshot-20140321-2245.tar.bz2/input/lavf.c -> x264-snapshot-20141104-2245.tar.bz2/input/lavf.c
Changed
@@ -42,12 +42,6 @@ cli_pic_t *first_pic; } lavf_hnd_t; -#define x264_free_packet( pkt )\ -{\ - av_free_packet( pkt );\ - av_init_packet( pkt );\ -} - /* handle the deprecated jpeg pixel formats */ static int handle_jpeg( int csp, int *fullrange ) { @@ -70,9 +64,7 @@ { XCHG( cli_image_t, p_pic->img, h->first_pic->img ); p_pic->pts = h->first_pic->pts; - XCHG( void*, p_pic->opaque, h->first_pic->opaque ); } - lavf_input.release_frame( h->first_pic, NULL ); lavf_input.picture_clean( h->first_pic ); free( h->first_pic ); h->first_pic = NULL; @@ -81,9 +73,11 @@ } AVCodecContext *c = h->lavf->streams[h->stream_id]->codec; - AVPacket *pkt = p_pic->opaque; - avcodec_get_frame_defaults( h->frame ); + AVPacket pkt; + av_init_packet( &pkt ); + pkt.data = NULL; + pkt.size = 0; while( i_frame >= h->next_frame ) { @@ -91,20 +85,23 @@ int ret = 0; do { - ret = av_read_frame( h->lavf, pkt ); + ret = av_read_frame( h->lavf, &pkt ); - if( pkt->stream_index == h->stream_id ) + if( ret < 0 ) { - if( ret < 0 ) - pkt->size = 0; + av_init_packet( &pkt ); + pkt.data = NULL; + pkt.size = 0; + } - c->reordered_opaque = pkt->pts; - if( avcodec_decode_video2( c, h->frame, &finished, pkt ) < 0 ) + if( ret < 0 || pkt.stream_index == h->stream_id ) + { + if( avcodec_decode_video2( c, h->frame, &finished, &pkt ) < 0 ) x264_cli_log( "lavf", X264_LOG_WARNING, "video decoding failed on frame %d\n", h->next_frame ); } - /* if the packet successfully decoded but the data from it is not desired, free it */ - else if( ret >= 0 ) - x264_free_packet( pkt ); + + if( ret >= 0 ) + av_free_packet( &pkt ); } while( !finished && ret >= 0 ); if( !finished ) @@ -130,10 +127,10 @@ if( h->vfr_input ) { p_pic->pts = p_pic->duration = 0; - if( c->has_b_frames && h->frame->reordered_opaque != AV_NOPTS_VALUE ) - p_pic->pts = h->frame->reordered_opaque; - else if( pkt->dts != AV_NOPTS_VALUE ) - p_pic->pts = pkt->dts; // for AVI files + if( h->frame->pkt_pts != AV_NOPTS_VALUE ) + p_pic->pts = h->frame->pkt_pts; + else if( h->frame->pkt_dts != AV_NOPTS_VALUE ) + p_pic->pts = h->frame->pkt_dts; // for AVI files else if( info ) { h->vfr_input = info->vfr = 0; @@ -153,7 +150,7 @@ if( !strcmp( psz_filename, "-" ) ) psz_filename = "pipe:"; - h->frame = avcodec_alloc_frame(); + h->frame = av_frame_alloc(); if( !h->frame ) return -1; @@ -220,13 +217,10 @@ static int picture_alloc( cli_pic_t *pic, int csp, int width, int height ) { - if( x264_cli_pic_alloc( pic, csp, width, height ) ) + if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) ) return -1; + pic->img.csp = csp; pic->img.planes = 4; - pic->opaque = malloc( sizeof(AVPacket) ); - if( !pic->opaque ) - return -1; - av_init_packet( pic->opaque ); return 0; } @@ -235,15 +229,8 @@ return read_frame_internal( pic, handle, i_frame, NULL ); } -static int release_frame( cli_pic_t *pic, hnd_t handle ) -{ - x264_free_packet( pic->opaque ); - return 0; -} - static void picture_clean( cli_pic_t *pic ) { - free( pic->opaque ); memset( pic, 0, sizeof(cli_pic_t) ); } @@ -252,13 +239,9 @@ lavf_hnd_t *h = handle; avcodec_close( h->lavf->streams[h->stream_id]->codec ); avformat_close_input( &h->lavf ); -#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(54, 28, 0) - avcodec_free_frame( &h->frame ); -#else - av_freep( &h->frame ); -#endif + av_frame_free( &h->frame ); free( h ); return 0; } -const cli_input_t lavf_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file }; +const cli_input_t lavf_input = { open_file, picture_alloc, read_frame, NULL, picture_clean, close_file };
View file
x264-snapshot-20140321-2245.tar.bz2/input/thread.c -> x264-snapshot-20141104-2245.tar.bz2/input/thread.c
Changed
@@ -88,7 +88,11 @@ if( h->next_frame == i_frame ) XCHG( cli_pic_t, *p_pic, h->pic ); else + { + if( h->next_frame >= 0 ) + thread_input.release_frame( &h->pic, handle ); ret |= h->input.read_frame( p_pic, h->p_handle, i_frame ); + } if( !h->frame_total || i_frame+1 < h->frame_total ) {
View file
x264-snapshot-20140321-2245.tar.bz2/output/mp4_lsmash.c -> x264-snapshot-20141104-2245.tar.bz2/output/mp4_lsmash.c
Changed
@@ -79,6 +79,7 @@ int i_dts_compress_multiplier; int b_use_recovery; int b_fragments; + lsmash_file_parameters_t file_param; } mp4_hnd_t; /*******************/ @@ -88,16 +89,10 @@ mp4_hnd_t *p_mp4 = handle; if( !p_mp4 ) return; - if( p_mp4->p_sei_buffer ) - { - free( p_mp4->p_sei_buffer ); - p_mp4->p_sei_buffer = NULL; - } - if( p_mp4->p_root ) - { - lsmash_destroy_root( p_mp4->p_root ); - p_mp4->p_root = NULL; - } + lsmash_cleanup_summary( (lsmash_summary_t *)p_mp4->summary ); + lsmash_close_file( &p_mp4->file_param ); + lsmash_destroy_root( p_mp4->p_root ); + free( p_mp4->p_sei_buffer ); free( p_mp4 ); } @@ -181,9 +176,13 @@ p_mp4->b_fragments = !b_regular; p_mp4->b_stdout = !strcmp( psz_filename, "-" ); - p_mp4->p_root = lsmash_open_movie( psz_filename, p_mp4->b_fragments ? LSMASH_FILE_MODE_WRITE_FRAGMENTED : LSMASH_FILE_MODE_WRITE ); + p_mp4->p_root = lsmash_create_root(); MP4_FAIL_IF_ERR_EX( !p_mp4->p_root, "failed to create root.\n" ); + MP4_FAIL_IF_ERR_EX( lsmash_open_file( psz_filename, 0, &p_mp4->file_param ) < 0, "failed to open an output file.\n" ); + if( p_mp4->b_fragments ) + p_mp4->file_param.mode |= LSMASH_FILE_MODE_FRAGMENTED; + p_mp4->summary = (lsmash_video_summary_t *)lsmash_create_summary( LSMASH_SUMMARY_TYPE_VIDEO ); MP4_FAIL_IF_ERR_EX( !p_mp4->summary, "failed to allocate memory for summary information of video.\n" ); @@ -219,12 +218,17 @@ brands[brand_count++] = ISOM_BRAND_TYPE_ISO6; /* cslg and visual random access grouping */ } + /* Set file */ + lsmash_file_parameters_t *file_param = &p_mp4->file_param; + file_param->major_brand = brands[0]; + file_param->brands = brands; + file_param->brand_count = brand_count; + file_param->minor_version = 0; + MP4_FAIL_IF_ERR( !lsmash_set_file( p_mp4->p_root, file_param ), "failed to add an output file into a ROOT.\n" ); + /* Set movie parameters. */ lsmash_movie_parameters_t movie_param; lsmash_initialize_movie_parameters( &movie_param ); - movie_param.major_brand = ISOM_BRAND_TYPE_MP42; - movie_param.brands = brands; - movie_param.number_of_brands = brand_count; MP4_FAIL_IF_ERR( lsmash_set_movie_parameters( p_mp4->p_root, &movie_param ), "failed to set movie parameters.\n" ); p_mp4->i_movie_timescale = lsmash_get_movie_timescale( p_mp4->p_root );
View file
x264-snapshot-20140321-2245.tar.bz2/tools/checkasm.c -> x264-snapshot-20141104-2245.tar.bz2/tools/checkasm.c
Changed
@@ -5,7 +5,7 @@ * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -90,11 +90,11 @@ { uint32_t a = 0; #if HAVE_X86_INLINE_ASM - asm volatile( "rdtsc" :"=a"(a) ::"edx" ); + asm volatile( "rdtsc" : "=a"(a) :: "edx", "memory" ); #elif ARCH_PPC - asm volatile( "mftb %0" : "=r" (a) ); + asm volatile( "mftb %0" : "=r"(a) :: "memory" ); #elif ARCH_ARM // ARMv7 only - asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) ); + asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" ); #endif return a; } @@ -184,6 +184,9 @@ #elif ARCH_ARM b->cpu&X264_CPU_NEON ? "neon" : b->cpu&X264_CPU_ARMV6 ? "armv6" : +#elif ARCH_AARCH64 + b->cpu&X264_CPU_NEON ? "neon" : + b->cpu&X264_CPU_ARMV8 ? "armv8" : #endif "c", #if HAVE_MMX @@ -728,11 +731,14 @@ fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a ); } set_func_name( "ssim_core" ); - call_c2( pixel_c.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums ); - call_a2( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums ); + call_c( pixel_c.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums ); + call_a( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums ); set_func_name( "ssim_end" ); call_c2( pixel_c.ssim_end4, sums, sums, 4 ); call_a2( pixel_asm.ssim_end4, sums, sums, 4 ); + /* check incorrect assumptions that 32-bit ints are zero-extended to 64-bit */ + call_c1( pixel_c.ssim_end4, sums, sums, 3 ); + call_a1( pixel_asm.ssim_end4, sums, sums, 3 ); report( "ssim :" ); } @@ -1097,6 +1103,7 @@ TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, dct8[0], 8 ); TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 4 ); TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 ); + TEST_ZIGZAG_SUB( sub_8x8, level1, level2, 64 ); TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 ); report( interlace ? 
"zigzag_field :" : "zigzag_frame :" ); } @@ -2624,8 +2631,9 @@ { int ret = 0; int cpu0 = 0, cpu1 = 0; + uint32_t cpu_detect = x264_cpu_detect(); #if HAVE_MMX - if( x264_cpu_detect() & X264_CPU_MMX2 ) + if( cpu_detect & X264_CPU_MMX2 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMX2, "MMX" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" ); @@ -2634,7 +2642,7 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" ); cpu1 &= ~X264_CPU_CACHELINE_32; #endif - if( x264_cpu_detect() & X264_CPU_LZCNT ) + if( cpu_detect & X264_CPU_LZCNT ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; @@ -2642,9 +2650,9 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; } - if( x264_cpu_detect() & X264_CPU_SSE ) + if( cpu_detect & X264_CPU_SSE ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" ); - if( x264_cpu_detect() & X264_CPU_SSE2 ) + if( cpu_detect & X264_CPU_SSE2 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" ); @@ -2655,17 +2663,17 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; } - if( x264_cpu_detect() & X264_CPU_LZCNT ) + if( cpu_detect & X264_CPU_LZCNT ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; } - if( x264_cpu_detect() & X264_CPU_SSE3 ) + if( cpu_detect & X264_CPU_SSE3 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" ); cpu1 &= ~X264_CPU_CACHELINE_64; } - if( x264_cpu_detect() & X264_CPU_SSSE3 ) + if( cpu_detect & X264_CPU_SSSE3 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" ); @@ -2679,54 +2687,59 @@ cpu1 &= ~X264_CPU_CACHELINE_64; cpu1 &= ~X264_CPU_SLOW_ATOM; } - if( x264_cpu_detect() & X264_CPU_SSE4 ) + if( cpu_detect & X264_CPU_SSE4 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" ); - if( x264_cpu_detect() & X264_CPU_AVX ) + if( cpu_detect & X264_CPU_AVX ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" ); - if( x264_cpu_detect() & X264_CPU_XOP ) + if( cpu_detect & X264_CPU_XOP ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" ); - if( x264_cpu_detect() & X264_CPU_FMA4 ) + if( cpu_detect & X264_CPU_FMA4 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" ); cpu1 &= ~X264_CPU_FMA4; } - if( x264_cpu_detect() & X264_CPU_BMI1 ) + if( cpu_detect & X264_CPU_BMI1 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); cpu1 &= ~X264_CPU_BMI1; } - if( x264_cpu_detect() & X264_CPU_AVX2 ) + if( cpu_detect & X264_CPU_AVX2 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" ); - if( x264_cpu_detect() & X264_CPU_LZCNT ) + if( cpu_detect & X264_CPU_LZCNT ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; } } - if( x264_cpu_detect() & X264_CPU_BMI2 ) + if( cpu_detect & X264_CPU_BMI2 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" ); cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2); } - if( x264_cpu_detect() & X264_CPU_FMA3 ) + if( cpu_detect & X264_CPU_FMA3 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); cpu1 &= ~X264_CPU_FMA3; } #elif ARCH_PPC - if( x264_cpu_detect() & X264_CPU_ALTIVEC ) + if( cpu_detect & X264_CPU_ALTIVEC ) { fprintf( stderr, "x264: ALTIVEC against C\n" ); ret = check_all_funcs( 0, X264_CPU_ALTIVEC ); } #elif ARCH_ARM - if( 
x264_cpu_detect() & X264_CPU_ARMV6 ) + if( cpu_detect & X264_CPU_ARMV6 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" ); - if( x264_cpu_detect() & X264_CPU_NEON ) + if( cpu_detect & X264_CPU_NEON ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" ); - if( x264_cpu_detect() & X264_CPU_FAST_NEON_MRC ) + if( cpu_detect & X264_CPU_FAST_NEON_MRC ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" ); +#elif ARCH_AARCH64 + if( cpu_detect & X264_CPU_ARMV8 ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV8, "ARMv8" ); + if( cpu_detect & X264_CPU_NEON ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" ); #endif return ret; }
View file
x264-snapshot-20141104-2245.tar.bz2/tools/cltostr.sh
Added
@@ -0,0 +1,32 @@ +#!/bin/sh +# Convert standard input to a C char array, write to a file, then create an +# MD5 sum of that file and append said MD5 sum as char array to the file. + +FILE=$1 + +# Filter out whitespace, empty lines, and comments. +sanitize() { + sed 's/^[[:space:]]*//; /^$/d; /^\/\//d' +} + +# Convert stdin to a \0-terminated char array. +dump() { + printf 'static const char %s[] = {\n' $1 + od -v -A n -t x1 | sed 's/[[:space:]]*\([[:alnum:]]\{2\}\)/0x\1, /g' + printf '0x00 };\n' +} + +# Print MD5 hash w/o newline character to not embed the character in the array. +hash() { + # md5sum is not standard, so try different platform-specific alternatives. + { md5sum $1 2> /dev/null || md5 -q $1 || digest -a md5 $1; } | + cut -b -32 | tr -d '\n\r' +} + +trap "rm -f $FILE.temp" EXIT + +sanitize | tee $FILE.temp | + dump x264_opencl_source > $FILE + +hash $FILE.temp | + dump x264_opencl_source_hash >> $FILE
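As the header comments describe, the script reads the kernel source on standard input and takes the output header path as its only argument. A minimal usage sketch follows; the file paths are illustrative assumptions, not taken from this diff:

    # hypothetical invocation of tools/cltostr.sh
    cat common/opencl/*.cl | sh tools/cltostr.sh common/oclobj.h
    # common/oclobj.h now holds two NUL-terminated char arrays:
    #   x264_opencl_source[]      - the sanitized kernel text
    #   x264_opencl_source_hash[] - the MD5 digest of that text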
View file
x264-snapshot-20141104-2245.tar.bz2/tools/msvsdepend.sh
Added
@@ -0,0 +1,21 @@ +#!/bin/sh +# There's a lot of things going on here +# expected arguments are $(CC) $(CFLAGS) $(SRC) $(OBJ) +# 1) start the dependency line with the object argument +# 2) need to add -Zs -showIncludes to the flags to have the compiler output list of include files without compilation +# 3) look for notes in the output that start with "Note: including file:" +# 4) retain only the filepath from the notes +# 5) convert \ foldername separators to / +# 6) escape spaces in the filepath +# 7) remove system includes (hack: check for "/Program Files" string in filepath) +# 8) sort and remove duplicate filepath entries +# 9) convert newlines to spaces to collapse the dependencies into the one dependency line +# 10) print a newline character, to properly separate dependency lines +echo -n "$4: " +$1 $2 $3 -Zs -showIncludes 2>&1 | + grep '^Note: including file:' | + sed 's/^Note: including file:[[:space:]]*\(.*\)$/\1/; s/\\/\//g; s/ /\\ /g' | + sed '/\/[Pp]rogram\\ [Ff]iles/d' | + sort | uniq | + tr -s '\n\r' ' ' +echo ''
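Per the comments above, the expected arguments are the compiler, its flags, the source file, and the object file, and the script prints a single Makefile dependency line. A hedged usage sketch, with concrete arguments chosen purely for illustration:

    # hypothetical invocation of tools/msvsdepend.sh
    sh tools/msvsdepend.sh cl "-nologo -I." encoder/encoder.c encoder/encoder.obj > encoder/encoder.dep
    # encoder/encoder.dep now contains one line of the form
    #   encoder/encoder.obj: <escaped header paths...>
    # with system headers under "Program Files" filtered out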
View file
x264-snapshot-20140321-2245.tar.bz2/x264.c -> x264-snapshot-20141104-2245.tar.bz2/x264.c
Changed
@@ -6,7 +6,7 @@ * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> * Steven Walters <kemuri9@gmail.com> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * Kieran Kunhya <kieran@kunhya.com> * Henrik Gramner <henrik@gramner.com> * @@ -320,6 +320,8 @@ printf( "intel: %.2f (%d)\n", __INTEL_COMPILER / 100.f, __INTEL_COMPILER_BUILD_DATE ); #elif defined(__GNUC__) printf( "gcc: " __VERSION__ "\n" ); +#elif defined(_MSC_FULL_VER) + printf( "msvc: %.2f (%u)\n", _MSC_VER / 100.f, _MSC_FULL_VER ); #else printf( "using an unknown compiler\n" ); #endif
View file
x264-snapshot-20140321-2245.tar.bz2/x264.h -> x264-snapshot-20141104-2245.tar.bz2/x264.h
Changed
@@ -5,7 +5,7 @@ * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> - * Jason Garrett-Glaser <darkshikari@gmail.com> + * Fiona Glaser <fiona@x264.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -28,8 +28,8 @@ #ifndef X264_X264_H #define X264_X264_H -#if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) &&\ - !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) +#if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) && !defined(_STDINT) &&\ + !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) && !defined(_INTTYPES) # ifdef _MSC_VER # pragma message("You must include stdint.h or inttypes.h before x264.h") # else @@ -152,10 +152,11 @@ /* PowerPC */ #define X264_CPU_ALTIVEC 0x0000001 -/* ARM */ +/* ARM and AArch64 */ #define X264_CPU_ARMV6 0x0000001 #define X264_CPU_NEON 0x0000002 /* ARM NEON */ #define X264_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ +#define X264_CPU_ARMV8 0x0000008 /* Analyse flags */ #define X264_ANALYSE_I4x4 0x0001 /* Analyse i4x4 */