libx264: Changes of Revision 4
libx264.changes (changed)

@@ -1,4 +1,9 @@
 -------------------------------------------------------------------
+Wed Jul 24 14:11:22 UTC 2013 - i@margueirte.su
+
+- update version 20130723.
+
+-------------------------------------------------------------------
 Thu Mar 7 08:36:00 UTC+0800 2013 - marguerite@opensuse.org
 
 - fallback to 8-bit depth again.
libx264.spec (changed)
@@ -1,5 +1,6 @@ # vim: set ts=4 sw=4 et: # Copyright (c) 2012 Pascal Bleser <pascal.bleser@opensuse.org> +# COpyright (c) 2013 Marguerite Su <marguerite@opensuse.org> # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -10,20 +11,19 @@ # license that conforms to the Open Source Definition (Version 1.9) # published by the Open Source Initiative. -# Please submit bugfixes or comments via http://bugs.opensuse.org/ +# Please submit bugfixes or comments via http://bugs.links2linux.org/ Name: libx264 -%define libname %{name} -%define soname 129 -%define svn 20130224 +%define soname 135 +%define svn 20130723 Version: 0.%{soname}svn%{svn} Release: 1 License: GPL-2.0+ Summary: A free h264/avc encoder - encoder binary Url: http://developers.videolan.org/x264.html Group: Productivity/Multimedia/Video/Editors and Convertors -Source0: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2 -Patch0: x264-use-shared-library.patch +Source: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2 +Patch: x264-use-shared-library.patch BuildRequires: nasm BuildRequires: pkg-config BuildRequires: yasm >= 1.2.0 @@ -59,11 +59,11 @@ moment so please use mencoder or another tool that supports x264 library for all other file types. -%package -n %{libname}-%{soname} +%package %{soname} Summary: A free h264/avc encoder - encoder binary Group: Productivity/Multimedia/Video/Editors and Convertors -%description -n %{libname}-%{soname} +%description %{soname} x264 is a free library for encoding next-generation H264/AVC video streams. The code is written from scratch by Laurent Aimar, Loren Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans @@ -73,15 +73,14 @@ development with libx264. This library is needed to build mplayer/mencoder with H264 encoding support. -%package -n %{libname}-devel +%package devel Summary: Libraries and include file for the %{name} encoder Group: Development/Libraries/C and C++ -Requires: %{buildrequires} -Requires: %{libname}-%{soname} = %{version} -Provides: %{name}-devel = %{version} -Obsoletes: %{name}-devel < %{version} +Requires: %{name}-%{soname} = %{version} +Provides: x264-devel = %{version} +Obsoletes: x264-devel < %{version} -%description -n %{libname}-devel +%description devel x264 is a free library for encoding next-generation H264/AVC video streams. The code is written from scratch by Laurent Aimar, Loren Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans @@ -92,8 +91,8 @@ mplayer/mencoder with H264 encoding support. 
%prep -%setup -q -n "x264-snapshot-%{svn}-2245" -%patch0 -p0 +%setup -q -n x264-snapshot-%{svn}-2245 +%patch -p1 FAKE_BUILDDATE=$(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y') sed -i "s/__DATE__/\"$FAKE_BUILDDATE\"/" x264.c @@ -104,29 +103,26 @@ %install %makeinstall -rm -f "%{buildroot}%{_libdir}/%{libname}.so" -rm -f "%{buildroot}%{_libdir}/%{libname}.a" -ln -s %{libname}.so.%{soname} "%{buildroot}%{_libdir}/%{libname}.so" +rm -f %{buildroot}%{_libdir}/%{name}.so +rm -f %{buildroot}%{_libdir}/%{name}.a +ln -s %{name}.so.%{soname} %{buildroot}%{_libdir}/%{name}.so -rm "%{buildroot}%{_bindir}"/* +rm %{buildroot}%{_bindir}/* -echo "%{libname}-%{soname}" > %{_sourcedir}/baselibs.conf +echo "%{name}-%{soname}" > %{_sourcedir}/baselibs.conf -%clean -%{?buildroot:%__rm -rf "%{buildroot}"} +%post -n %{name}-%{soname} -p /sbin/ldconfig +%postun -n %{name}-%{soname} -p /sbin/ldconfig -%post -n %{libname}-%{soname} -p /sbin/ldconfig -%postun -n %{libname}-%{soname} -p /sbin/ldconfig - -%files -n %{libname}-%{soname} +%files %{soname} %defattr(0644,root,root) -%{_libdir}/%{libname}.so.%{soname} +%{_libdir}/%{name}.so.%{soname} -%files -n %{libname}-devel +%files devel %defattr(0644,root,root) %{_includedir}/x264.h %{_includedir}/x264_config.h %{_libdir}/pkgconfig/x264.pc -%{_libdir}/%{libname}.so +%{_libdir}/%{name}.so %changelog
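For reference, a minimal consumer of the packaged library (illustrative, not part of the spec): it uses only the public x264.h API and links through the x264.pc file shipped in the -devel subpackage listed above, e.g. gcc test.c $(pkg-config --cflags --libs x264).

/* Hypothetical test.c: sanity-check the installed headers and shared library. */
#include <stdio.h>
#include <x264.h>

int main( void )
{
    x264_param_t param;
    x264_param_default( &param );             /* public API from x264.h */
    printf( "x264 build %d\n", X264_BUILD );  /* 135 for this 20130723 snapshot */
    return 0;
}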
x264-use-shared-library.patch (changed)

@@ -1,21 +1,23 @@
---- Makefile.orig	2011-12-26 22:45:03.000000000 +0100
-+++ Makefile	2011-12-27 20:03:46.070404383 +0100
-@@ -152,6 +152,7 @@
+Index: x264-snapshot-20130723-2245/Makefile
+===================================================================
+--- x264-snapshot-20130723-2245.orig/Makefile
++++ x264-snapshot-20130723-2245/Makefile
+@@ -171,6 +171,7 @@ $(LIBX264): $(GENERATED) .depend $(OBJS)
  
-- $(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
-+ $(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
  	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 +	ln -s $(SONAME) libx264.so
  
  ifneq ($(EXE),)
  .PHONY: x264 checkasm
-@@ -159,8 +160,8 @@
+@@ -178,8 +179,8 @@ x264: x264$(EXE)
  checkasm: checkasm$(EXE)
  endif
  
--x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
+-x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
 -	$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
-+x264$(EXE): .depend $(OBJCLI) $(SONAME)
++x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(SONAME)
 +	$(LD)$@ $(OBJCLI) -L. -lx264 $(LDFLAGSCLI) $(LDFLAGS)
  
- checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+ checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
  	$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
x264-snapshot-20130224-2245.tar.bz2/.gitignore -> x264-snapshot-20130723-2245.tar.bz2/.gitignore (changed)

@@ -43,3 +43,5 @@
 .digress_x264
 dataDec.txt
 log.dec
+common/oclobj.h
+x264_lookahead.clbin
x264-snapshot-20130224-2245.tar.bz2/Makefile -> x264-snapshot-20130723-2245.tar.bz2/Makefile (changed)

@@ -8,6 +8,8 @@
 vpath %.asm $(SRCPATH)
 vpath %.rc $(SRCPATH)
 
+GENERATED =
+
 all: default
 default:
 
@@ -145,6 +147,13 @@
 endif
 endif
 
+ifeq ($(HAVE_OPENCL),yes)
+common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
+	cat $^ | perl $(SRCPATH)/tools/cltostr.pl x264_opencl_source > $@
+GENERATED += common/oclobj.h
+SRCS += common/opencl.c encoder/slicetype-cl.c
+endif
+
 OBJS += $(SRCS:%.c=%.o)
 OBJCLI += $(SRCCLI:%.c=%.o)
 OBJSO += $(SRCSO:%.c=%.o)
@@ -155,12 +164,12 @@
 lib-static: $(LIBX264)
 lib-shared: $(SONAME)
 
-$(LIBX264): .depend $(OBJS) $(OBJASM)
+$(LIBX264): $(GENERATED) .depend $(OBJS) $(OBJASM)
 	rm -f $(LIBX264)
 	$(AR)$@ $(OBJS) $(OBJASM)
 	$(if $(RANLIB), $(RANLIB) $@)
 
-$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
+$(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
 	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 
 ifneq ($(EXE),)
@@ -169,10 +178,10 @@
 checkasm: checkasm$(EXE)
 endif
 
-x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
+x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
 	$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
 
-checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
 	$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
 
 $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
@@ -231,7 +240,7 @@
 
 clean:
 	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
-	rm -f checkasm checkasm.exe $(OBJCHK)
+	rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
 	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
 
 distclean: clean
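The new common/oclobj.h target above is generated at build time: tools/cltostr.pl concatenates the OpenCL kernel sources into a byte array named x264_opencl_source (the symbol name passed on the rule's command line), which common/opencl.c later feeds to clCreateProgramWithSource(). A rough sketch of the generated header's shape follows, with placeholder contents only, assuming nothing beyond the symbol names used elsewhere in this revision.

/* Illustrative shape of the generated common/oclobj.h (placeholder bytes, not
 * real build output): x264_opencl_source carries the concatenated .cl text,
 * x264_opencl_source_hash identifies it for the clbin cache check in
 * common/opencl.c. */
static const char x264_opencl_source_hash[] = "0000000000000000";
static const char x264_opencl_source[] = {
    '/', '*', ' ', 'k', 'e', 'r', 'n', 'e', 'l', 's', ' ', '*', '/', '\n',
    /* ...remaining bytes of the concatenated OpenCL kernels... */
    0x00
};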
x264-snapshot-20130224-2245.tar.bz2/common/arm/mc-a.S -> x264-snapshot-20130723-2245.tar.bz2/common/arm/mc-a.S (changed)
@@ -5,6 +5,7 @@ * * Authors: David Conrad <lessen42@gmail.com> * Mans Rullgard <mans@mansr.com> + * Stefan Groenroos <stefan.gronroos@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -813,54 +814,57 @@ // void x264_mc_chroma_neon( uint8_t *dst, intptr_t i_dst_stride, // uint8_t *src, intptr_t i_src_stride, // int dx, int dy, int i_width, int i_height ); + function x264_mc_chroma_neon - push {r4-r6, lr} - ldrd r4, [sp, #16] - ldr r6, [sp, #24] + push {r4-r8, lr} + vpush {d8-d11} + ldrd r4, [sp, #56] + ldrd r6, [sp, #64] - asr lr, r5, #3 - mul lr, r3, lr - add r2, r2, r4, asr #3 - cmp r6, #4 - add r2, r2, lr + asr lr, r6, #3 + mul lr, r4, lr + add r3, r3, r5, asr #2 + cmp r7, #4 - and r4, r4, #7 and r5, r5, #7 - pld [r2] - pld [r2, r3] + and r6, r6, #7 + + add r3, r3, lr + bic r3, r3, #0x1 + + pld [r3] + pld [r3, r4] bgt mc_chroma_w8 beq mc_chroma_w4 -// calculate cA cB cC cD -.macro CHROMA_MC_START r0 r1 - muls lr, r4, r5 - rsb r6, lr, r5, lsl #3 - rsb ip, lr, r4, lsl #3 - sub r4, lr, r4, lsl #3 - sub r4, r4, r5, lsl #3 - add r4, r4, #64 +.macro CHROMA_MC_START r00, r01, r10, r11 + muls lr, r5, r6 + rsb r7, lr, r6, lsl #3 + rsb ip, lr, r5, lsl #3 + sub r5, lr, r5, lsl #3 + sub r5, r5, r6, lsl #3 + add r5, r5, #64 beq 2f + vld2.8 {\r00-\r01}, [r3], r4 - add r5, r2, r3 + vdup.8 d0, r5 + vdup.8 d1, ip - vdup.8 d0, r4 - lsl r3, r3, #1 - vdup.8 d1, ip - vld1.64 {\r0}, [r2], r3 - vdup.8 d2, r6 - vld1.64 {\r1}, [r5], r3 - vdup.8 d3, lr - ldr r4, [sp, #28] - - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 + vdup.8 d2, r7 + vld2.8 {\r10-\r11}, [r3], r4 + vdup.8 d3, lr + ldr r5, [sp, #72] .endm .macro CHROMA_MC width, align mc_chroma_w\width: - CHROMA_MC_START d4, d6 + CHROMA_MC_START d4, d5, d8, d9 + vext.8 d6, d4, d6, #1 + vext.8 d7, d5, d7, #1 + vext.8 d10, d8, d10, #1 + vext.8 d11, d9, d11, #1 // since the element size varies, there's a different index for the 2nd store .if \width == 4 .set st2, 1 @@ -868,187 +872,292 @@ .set st2, 2 .endif - vtrn.32 d4, d5 - vtrn.32 d6, d7 + vtrn.32 d4, d6 + vtrn.32 d5, d7 + vtrn.32 d8, d10 + vtrn.32 d9, d11 - vtrn.32 d0, d1 - vtrn.32 d2, d3 + vtrn.32 d0, d1 + vtrn.32 d2, d3 1: // height loop, interpolate xy - pld [r5] + vmull.u8 q8, d4, d0 - vmlal.u8 q8, d6, d2 - vld1.64 {d4}, [r2], r3 - vext.8 d5, d4, d5, #1 - vtrn.32 d4, d5 - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d4, d2 - vld1.64 {d6}, [r5], r3 + vmlal.u8 q8, d8, d2 + vmull.u8 q9, d5, d0 + vmlal.u8 q9, d9, d2 + + vld2.8 {d4-d5}, [r3], r4 + + vext.8 d6, d4, d6, #1 + vext.8 d7, d5, d7, #1 + vadd.i16 d16, d16, d17 vadd.i16 d17, d18, d19 + + vtrn.32 d4, d6 + vtrn.32 d5, d7 + + vmull.u8 q10, d8, d0 + vmlal.u8 q10, d4, d2 + vmull.u8 q11, d9, d0 + vmlal.u8 q11, d5, d2 + + vld2.8 {d8-d9}, [r3], r4 + vrshrn.u16 d16, q8, #6 - subs r4, r4, #2 - pld [r2] - vext.8 d7, d6, d7, #1 - vtrn.32 d6, d7 - vst1.\align {d16[0]}, [r0,:\align], r1 - vst1.\align {d16[st2]}, [r0,:\align], r1 + + vext.8 d10, d8, d10, #1 + vext.8 d11, d9, d11, #1 + + vadd.i16 d18, d20, d21 + vadd.i16 d19, d22, d23 + + vtrn.32 d8, d10 + vtrn.32 d9, d11 + + vrshrn.u16 d18, q9, #6 + + subs r5, r5, #2 + + pld [r3] + pld [r3, r4] + + vst1.\align {d16[0]}, [r0,:\align], r2 + vst1.\align {d16[st2]}, [r1,:\align], r2 + vst1.\align {d18[0]}, [r0,:\align], r2 + vst1.\align {d18[st2]}, [r1,:\align], r2 bgt 1b - pop {r4-r6, pc} + vpop {d8-d11} + pop {r4-r8, pc} 2: // dx or dy are 0 - tst r6, r6 - add ip, ip, r6 - vdup.8 d0, r4 + tst r7, r7 + add ip, ip, r7 + 
vdup.8 d0, r5 + ldr r5, [sp, #72] vdup.8 d1, ip - vtrn.32 d0, d1 - ldr r4, [sp, #28] beq 4f - vext.32 d1, d0, d1, #1 - add r5, r2, r3 - lsl r3, r3, #1 - vld1.32 {d4[0]}, [r2], r3 - vld1.32 {d4[1]}, [r5], r3 + vld1.64 {d4}, [r3], r4 + vld1.64 {d6}, [r3], r4 3: // vertical interpolation loop - pld [r5] + vmull.u8 q8, d4, d0 - vld1.32 {d4[0]}, [r2], r3 - vmull.u8 q9, d4, d1 - vld1.32 {d4[1]}, [r5], r3 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - vrshrn.u16 d16, q8, #6 - subs r4, r4, #2 - pld [r2] - vst1.\align {d16[0]}, [r0,:\align], r1 - vst1.\align {d16[st2]}, [r0,:\align], r1 + vmlal.u8 q8, d6, d1 + vmull.u8 q9, d6, d0 + vld1.64 {d4}, [r3], r4 + vmlal.u8 q9, d4, d1 + vld1.64 {d6}, [r3], r4 + + vrshrn.u16 d16, q8, #6 // uvuvuvuv + vrshrn.u16 d17, q9, #6 // uvuvuvuv + subs r5, r5, #2 + vuzp.8 d16, d17 // d16=uuuu|uuuu, d17=vvvv|vvvv + + pld [r3] + pld [r3, r4] + + vst1.\align {d16[0]}, [r0,:\align], r2 + vst1.\align {d16[st2]}, [r0,:\align], r2 + vst1.\align {d17[0]}, [r1,:\align], r2 + vst1.\align {d17[st2]}, [r1,:\align], r2 bgt 3b - pop {r4-r6, pc} + vpop {d8-d11} + pop {r4-r8, pc} 4: // dy is 0 - vld1.64 {d4}, [r2], r3 - vld1.64 {d6}, [r2], r3 - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vtrn.32 d4, d5 - vtrn.32 d6, d7 + + vld1.64 {d4-d5}, [r3], r4 + vld1.64 {d6-d7}, [r3], r4 + + vext.8 d5, d4, d5, #2 + vext.8 d7, d6, d7, #2 5: // horizontal interpolation loop + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d5, d1 vmull.u8 q9, d6, d0 - subs r4, r4, #2 - vld1.64 {d4}, [r2], r3 - vext.8 d5, d4, d5, #1 - vtrn.32 d4, d5 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - pld [r2] + vmlal.u8 q9, d7, d1 + + subs r5, r5, #2 + vld1.64 {d4-d5}, [r3], r4 + vld1.64 {d6-d7}, [r3], r4 + vext.8 d5, d4, d5, #2 vrshrn.u16 d16, q8, #6 - vld1.64 {d6}, [r2], r3 - vext.8 d7, d6, d7, #1 - vtrn.32 d6, d7 - pld [r2] - vst1.\align {d16[0]}, [r0,:\align], r1 - vst1.\align {d16[st2]}, [r0,:\align], r1 + vrshrn.u16 d17, q9, #6 + vext.8 d7, d6, d7, #2 + vuzp.8 d16, d17 + + pld [r3] + pld [r3, r4] + + vst1.\align {d16[0]}, [r0,:\align], r2 + vst1.\align {d16[st2]}, [r0,:\align], r2 + vst1.\align {d17[0]}, [r1,:\align], r2 + vst1.\align {d17[st2]}, [r1,:\align], r2 bgt 5b - pop {r4-r6, pc} + vpop {d8-d11} + pop {r4-r8, pc} .endm - CHROMA_MC 2, 16 - CHROMA_MC 4, 32 + CHROMA_MC 2, 16 + CHROMA_MC 4, 32 -// the optimial timing for width 8 is different enough that it's not -// readable to put it in the same macro as width 2/4 mc_chroma_w8: - CHROMA_MC_START d4-d5, d6-d7 + CHROMA_MC_START d4, d7, d8, d11 + vext.8 d5, d4, d5, #1 + vext.8 d9, d8, d9, #1 + vext.8 d7, d6, d7, #1 + vext.8 d11, d10, d11, #1 1: // height loop, interpolate xy - pld [r5] vmull.u8 q8, d4, d0 vmlal.u8 q8, d5, d1 - vld1.64 {d4, d5}, [r2], r3 - vmlal.u8 q8, d6, d2 - vext.8 d5, d4, d5, #1 - vmlal.u8 q8, d7, d3 + vmlal.u8 q8, d8, d2 + vmlal.u8 q8, d9, d3 + vmull.u8 q9, d6, d0 - subs r4, r4, #2 vmlal.u8 q9, d7, d1 - vmlal.u8 q9, d4, d2 - vmlal.u8 q9, d5, d3 + vmlal.u8 q9, d10, d2 + vmlal.u8 q9, d11, d3 + + vld2.8 {d4-d7}, [r3], r4 + + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + + vmull.u8 q10, d8, d0 + vmlal.u8 q10, d9, d1 + vmlal.u8 q10, d4, d2 + vmlal.u8 q10, d5, d3 + + vmull.u8 q11, d10, d0 + vmlal.u8 q11, d11, d1 + vmlal.u8 q11, d6, d2 + vmlal.u8 q11, d7, d3 + + subs r5, r5, #2 + vld2.8 {d8-d11}, [r3], r4 + vrshrn.u16 d16, q8, #6 - vld1.64 {d6, d7}, [r5], r3 - pld [r2] vrshrn.u16 d17, q9, #6 - vext.8 d7, d6, d7, #1 - vst1.64 {d16}, [r0,:64], r1 - vst1.64 {d17}, [r0,:64], r1 + vrshrn.u16 d18, q10, #6 + vext.8 d9, d8, d9, #1 + vrshrn.u16 d19, q11, 
#6 + vext.8 d11, d10, d11, #1 + + pld [r3] + pld [r3, r4] + + vst1.64 {d16}, [r0,:64], r2 + vst1.64 {d17}, [r1,:64], r2 + vst1.64 {d18}, [r0,:64], r2 + vst1.64 {d19}, [r1,:64], r2 + bgt 1b - pop {r4-r6, pc} + vpop {d8-d11} + pop {r4-r8, pc} 2: // dx or dy are 0 - tst r6, r6 - add ip, ip, r6 - vdup.8 d0, r4 + tst r7, r7 + add ip, ip, r7 + vdup.8 d0, r5 + ldr r5, [sp, #72] vdup.8 d1, ip - ldr r4, [sp, #28] beq 4f - add r5, r2, r3 - lsl r3, r3, #1 - vld1.64 {d4}, [r2], r3 - vld1.64 {d6}, [r5], r3 + vld2.8 {d4-d5}, [r3], r4 + vld2.8 {d6-d7}, [r3], r4 3: // vertical interpolation loop - pld [r5] - vmull.u8 q8, d4, d0 + vmull.u8 q8, d4, d0 //U vmlal.u8 q8, d6, d1 - vld1.64 {d4}, [r2], r3 - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d4, d1 - vld1.64 {d6}, [r5], r3 + vmull.u8 q9, d5, d0 //V + vmlal.u8 q9, d7, d1 + + vld2.8 {d4-d5}, [r3], r4 + + vmull.u8 q10, d6, d0 + vmlal.u8 q10, d4, d1 + vmull.u8 q11, d7, d0 + vmlal.u8 q11, d5, d1 + + vld2.8 {d6-d7}, [r3], r4 + vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, #6 - subs r4, r4, #2 - pld [r2] - vst1.64 {d16}, [r0,:64], r1 - vst1.64 {d17}, [r0,:64], r1 + vrshrn.u16 d18, q10, #6 + vrshrn.u16 d19, q11, #6 + subs r5, r5, #2 + + pld [r3] + pld [r3, r4] + + vst1.64 {d16}, [r0,:64], r2 + vst1.64 {d17}, [r1,:64], r2 + vst1.64 {d18}, [r0,:64], r2 + vst1.64 {d19}, [r1,:64], r2 + bgt 3b - pop {r4-r6, pc} + vpop {d8-d11} + pop {r4-r8, pc} 4: // dy is 0 - vld1.64 {d4, d5}, [r2], r3 - vld1.64 {d6, d7}, [r2], r3 + + vld2.8 {d4-d7}, [r3], r4 + vld2.8 {d8-d11}, [r3], r4 vext.8 d5, d4, d5, #1 vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + vext.8 d11, d10, d11, #1 5: // horizontal interpolation loop - pld [r2] - subs r4, r4, #2 - vmull.u8 q8, d4, d0 + subs r5, r5, #2 + vmull.u8 q8, d4, d0 //U vmlal.u8 q8, d5, d1 - vld1.64 {d4, d5}, [r2], r3 - vmull.u8 q9, d6, d0 + vmull.u8 q9, d6, d0 //V vmlal.u8 q9, d7, d1 - pld [r2] + + vld2.8 {d4-d7}, [r3], r4 + + vmull.u8 q10, d8, d0 + vmlal.u8 q10, d9, d1 + vmull.u8 q11, d10, d0 + vmlal.u8 q11, d11, d1 + + vld2.8 {d8-d11}, [r3], r4 + vext.8 d5, d4, d5, #1 vrshrn.u16 d16, q8, #6 - vrshrn.u16 d17, q9, #6 - vld1.64 {d6, d7}, [r2], r3 vext.8 d7, d6, d7, #1 - vst1.64 {d16}, [r0,:64], r1 - vst1.64 {d17}, [r0,:64], r1 + vrshrn.u16 d17, q9, #6 + vext.8 d9, d8, d9, #1 + vrshrn.u16 d18, q10, #6 + vext.8 d11, d10, d11, #1 + vrshrn.u16 d19, q11, #6 + + pld [r3] + pld [r3, r4] + + vst1.64 {d16}, [r0,:64], r2 + vst1.64 {d17}, [r1,:64], r2 + vst1.64 {d18}, [r0,:64], r2 + vst1.64 {d19}, [r1,:64], r2 bgt 5b - pop {r4-r6, pc} + vpop {d8-d11} + pop {r4-r8, pc} + .endfunc
x264-snapshot-20130224-2245.tar.bz2/common/arm/mc-c.c -> x264-snapshot-20130723-2245.tar.bz2/common/arm/mc-c.c (changed)

@@ -238,7 +238,7 @@
         pf->offsetsub = x264_mc_offsetsub_wtab_neon;
         pf->weight_cache = x264_weight_cache_neon;
 
-//      pf->mc_chroma = x264_mc_chroma_neon;
+        pf->mc_chroma = x264_mc_chroma_neon;
         pf->mc_luma = mc_luma_neon;
         pf->get_ref = get_ref_neon;
         pf->hpel_filter = hpel_filter_neon;
x264-snapshot-20130224-2245.tar.bz2/common/arm/quant-a.S -> x264-snapshot-20130723-2245.tar.bz2/common/arm/quant-a.S (changed)
@@ -35,7 +35,7 @@ .text -.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no +.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no vadd.u16 q8, q8, \bias0 vadd.u16 q9, q9, \bias1 .ifc \load_mf, yes @@ -55,7 +55,7 @@ veor q9, q9, q15 vsub.s16 q8, q8, q14 vsub.s16 q9, q9, q15 - vorr \bias0, q8, q9 + vorr \mask, q8, q9 vst1.64 {d16-d19}, [r0,:128]! .endm @@ -89,7 +89,7 @@ vabs.s16 q9, q15 vdup.16 q0, r2 vdup.16 q2, r1 - QUANT_TWO q0, q0, d4, d5, d4, d5 + QUANT_TWO q0, q0, d4, d5, d4, d5, q0 vorr d0, d0, d1 QUANT_END d0 .endfunc @@ -101,11 +101,52 @@ vabs.s16 q9, q15 vld1.64 {d0-d3}, [r2,:128] vld1.64 {d4-d7}, [r1,:128] - QUANT_TWO q0, q1, d4, d5, d6, d7 + QUANT_TWO q0, q1, d4, d5, d6, d7, q0 vorr d0, d0, d1 QUANT_END d0 .endfunc +// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ) +function x264_quant_4x4x4_neon + vpush {d8-d15} + vld1.64 {d28-d31}, [r0,:128] + vabs.s16 q8, q14 + vabs.s16 q9, q15 + vld1.64 {d0-d3}, [r2,:128] + vld1.64 {d4-d7}, [r1,:128] + QUANT_TWO q0, q1, d4, d5, d6, d7, q4 + vld1.64 {d28-d31}, [r0,:128] + vabs.s16 q8, q14 + vabs.s16 q9, q15 + QUANT_TWO q0, q1, d4, d5, d6, d7, q5 + vld1.64 {d28-d31}, [r0,:128] + vabs.s16 q8, q14 + vabs.s16 q9, q15 + QUANT_TWO q0, q1, d4, d5, d6, d7, q6 + vld1.64 {d28-d31}, [r0,:128] + vabs.s16 q8, q14 + vabs.s16 q9, q15 + QUANT_TWO q0, q1, d4, d5, d6, d7, q7 + vorr d8, d8, d9 + vorr d10, d10, d11 + vorr d12, d12, d13 + vorr d14, d14, d15 + vmov r0, r1, d8 + vmov r2, r3, d10 + orrs r0, r1 + movne r0, #1 + orrs r2, r3 + orrne r0, #2 + vmov r1, r2, d12 + vmov r3, ip, d14 + orrs r1, r2 + orrne r0, #4 + orrs r3, ip + orrne r0, #8 + vpop {d8-d15} + bx lr +.endfunc + // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) function x264_quant_8x8_neon vld1.64 {d28-d31}, [r0,:128] @@ -113,13 +154,13 @@ vabs.s16 q9, q15 vld1.64 {d0-d3}, [r2,:128]! vld1.64 {d4-d7}, [r1,:128]! - QUANT_TWO q0, q1, d4, d5, d6, d7 + QUANT_TWO q0, q1, d4, d5, d6, d7, q0 .rept 3 vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 vld1.64 {d2-d5}, [r2,:128]! - QUANT_TWO q1, q2, d4, d5, d6, d7, yes + QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes vorr q0, q0, q1 .endr vorr d0, d0, d1
x264-snapshot-20130224-2245.tar.bz2/common/arm/quant.h -> x264-snapshot-20130723-2245.tar.bz2/common/arm/quant.h (changed)

@@ -31,6 +31,7 @@
 int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
 int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
 int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
 int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
 
 void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
x264-snapshot-20130224-2245.tar.bz2/common/bitstream.c -> x264-snapshot-20130723-2245.tar.bz2/common/bitstream.c (changed)
@@ -39,11 +39,20 @@ return dst; } -#if HAVE_MMX uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end ); uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end ); -uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end ); -#endif +uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end ); +void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); /**************************************************************************** * x264_nal_encode: @@ -88,13 +97,49 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ) { + memset( pf, 0, sizeof(*pf) ); + pf->nal_escape = x264_nal_escape_c; #if HAVE_MMX +#if ARCH_X86_64 + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2; + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2; +#endif + if( cpu&X264_CPU_MMX2 ) pf->nal_escape = x264_nal_escape_mmx2; - if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) ) - pf->nal_escape = x264_nal_escape_sse2; - if( cpu&X264_CPU_AVX ) - pf->nal_escape = x264_nal_escape_avx; + if( cpu&X264_CPU_SSE2 ) + { +#if ARCH_X86_64 + if( cpu&X264_CPU_LZCNT ) + { + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2_lzcnt; + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2_lzcnt; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt; + } +#endif + if( cpu&X264_CPU_SSE2_IS_FAST ) + pf->nal_escape = x264_nal_escape_sse2; + } +#if ARCH_X86_64 + if( cpu&X264_CPU_SSSE3 ) + { + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3; + if( cpu&X264_CPU_LZCNT ) + { + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3_lzcnt; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt; + } + } + + if( cpu&X264_CPU_AVX2 ) + { + pf->nal_escape = x264_nal_escape_avx2; + if( 
cpu&X264_CPU_BMI2 ) + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2; + } +#endif #endif }
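A minimal usage sketch of the dispatch table that x264_bitstream_init() fills above (the wrapper function and include path are illustrative, not part of the patch):

/* Escape one NAL payload through whichever nal_escape implementation the
 * given cpu flags select (C, MMX2, SSE2 or AVX2 in this revision). */
#include <stdint.h>
#include "common/common.h"   /* x264 tree include: x264_bitstream_function_t */

static uint8_t *escape_nal( uint8_t *dst, uint8_t *src, int i_size, int cpu )
{
    x264_bitstream_function_t bs;
    x264_bitstream_init( cpu, &bs );
    return bs.nal_escape( dst, src, src + i_size );
}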
x264-snapshot-20130224-2245.tar.bz2/common/bitstream.h -> x264-snapshot-20130723-2245.tar.bz2/common/bitstream.h (changed)

@@ -55,9 +55,9 @@
 
 typedef struct
 {
-    int last;
-    int mask;
-    dctcoef level[16];
+    int32_t last;
+    int32_t mask;
+    ALIGNED_16( dctcoef level[18] );
 } x264_run_level_t;
 
 extern const vlc_t x264_coeff0_token[6];
@@ -69,6 +69,12 @@
 typedef struct
 {
     uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
+    void (*cabac_block_residual_internal)( dctcoef *l, int b_interlaced,
+                                           intptr_t ctx_block_cat, x264_cabac_t *cb );
+    void (*cabac_block_residual_rd_internal)( dctcoef *l, int b_interlaced,
+                                              intptr_t ctx_block_cat, x264_cabac_t *cb );
+    void (*cabac_block_residual_8x8_rd_internal)( dctcoef *l, int b_interlaced,
+                                                  intptr_t ctx_block_cat, x264_cabac_t *cb );
 } x264_bitstream_function_t;
 
 void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
x264-snapshot-20130224-2245.tar.bz2/common/common.c -> x264-snapshot-20130723-2245.tar.bz2/common/common.c (changed)
@@ -171,6 +171,10 @@ param->b_pic_struct = 0; param->b_fake_interlaced = 0; param->i_frame_packing = -1; + param->b_opencl = 0; + param->i_opencl_device = 0; + param->opencl_device_id = NULL; + param->psz_clbin_file = NULL; } static int x264_param_apply_preset( x264_param_t *param, const char *preset ) @@ -563,6 +567,8 @@ } #define atobool(str) ( name_was_bool = 1, x264_atobool( str, &b_error ) ) +#undef atoi +#undef atof #define atoi(str) x264_atoi( str, &b_error ) #define atof(str) x264_atof( str, &b_error ) @@ -620,10 +626,8 @@ b_error = 1; } free( buf ); - if( p->cpu & X264_CPU_SSSE3 ) + if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) ) p->cpu |= X264_CPU_SSE2_IS_FAST; - if( p->cpu & X264_CPU_SSE4 ) - p->cpu |= X264_CPU_SHUFFLE_IS_FAST; } } OPT("threads") @@ -778,8 +782,12 @@ p->i_slice_max_size = atoi(value); OPT("slice-max-mbs") p->i_slice_max_mbs = atoi(value); + OPT("slice-min-mbs") + p->i_slice_min_mbs = atoi(value); OPT("slices") p->i_slice_count = atoi(value); + OPT("slices-max") + p->i_slice_count_max = atoi(value); OPT("cabac") p->b_cabac = atobool(value); OPT("cabac-idc") @@ -1029,6 +1037,14 @@ p->b_fake_interlaced = atobool(value); OPT("frame-packing") p->i_frame_packing = atoi(value); + OPT("stitchable") + p->b_stitchable = atobool(value); + OPT("opencl") + p->b_opencl = atobool( value ); + OPT("opencl-clbin") + p->psz_clbin_file = strdup( value ); + OPT("opencl-device") + p->i_opencl_device = atoi( value ); else return X264_PARAM_BAD_NAME; #undef OPT @@ -1166,17 +1182,14 @@ void *x264_malloc( int i_size ) { uint8_t *align_buf = NULL; -#if SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64) - /* Mac OS X and Win x64 always returns 16 byte aligned memory */ - align_buf = malloc( i_size ); -#elif HAVE_MALLOC_H - align_buf = memalign( 16, i_size ); +#if HAVE_MALLOC_H + align_buf = memalign( NATIVE_ALIGN, i_size ); #else - uint8_t *buf = malloc( i_size + 15 + sizeof(void **) ); + uint8_t *buf = malloc( i_size + (NATIVE_ALIGN-1) + sizeof(void **) ); if( buf ) { - align_buf = buf + 15 + sizeof(void **); - align_buf -= (intptr_t) align_buf & 15; + align_buf = buf + (NATIVE_ALIGN-1) + sizeof(void **); + align_buf -= (intptr_t) align_buf & (NATIVE_ALIGN-1); *( (void **) ( align_buf - sizeof(void **) ) ) = buf; } #endif @@ -1192,7 +1205,7 @@ { if( p ) { -#if HAVE_MALLOC_H || SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64) +#if HAVE_MALLOC_H free( p ); #else free( *( ( ( void **) p ) - 1 ) ); @@ -1281,6 +1294,8 @@ s += sprintf( s, "bitdepth=%d ", BIT_DEPTH ); } + if( p->b_opencl ) + s += sprintf( s, "opencl=%d ", p->b_opencl ); s += sprintf( s, "cabac=%d", p->b_cabac ); s += sprintf( s, " ref=%d", p->i_frame_reference ); s += sprintf( s, " deblock=%d:%d:%d", p->b_deblocking_filter, @@ -1305,14 +1320,20 @@ s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads ); if( p->i_slice_count ) s += sprintf( s, " slices=%d", p->i_slice_count ); + if( p->i_slice_count_max ) + s += sprintf( s, " slices_max=%d", p->i_slice_count_max ); if( p->i_slice_max_size ) s += sprintf( s, " slice_max_size=%d", p->i_slice_max_size ); if( p->i_slice_max_mbs ) s += sprintf( s, " slice_max_mbs=%d", p->i_slice_max_mbs ); + if( p->i_slice_min_mbs ) + s += sprintf( s, " slice_min_mbs=%d", p->i_slice_min_mbs ); s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction ); s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate ); s += sprintf( s, " interlaced=%s", p->b_interlaced ? p->b_tff ? "tff" : "bff" : p->b_fake_interlaced ? 
"fake" : "0" ); s += sprintf( s, " bluray_compat=%d", p->b_bluray_compat ); + if( p->b_stitchable ) + s += sprintf( s, " stitchable=%d", p->b_stitchable ); s += sprintf( s, " constrained_intra=%d", p->b_constrained_intra );
x264-snapshot-20130224-2245.tar.bz2/common/common.h -> x264-snapshot-20130723-2245.tar.bz2/common/common.h (changed)
@@ -40,6 +40,7 @@ #define IS_DISPOSABLE(type) ( type == X264_TYPE_B ) #define FIX8(f) ((int)(f*(1<<8)+.5)) #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1)) +#define ARRAY_ELEMS(a) ((sizeof(a))/(sizeof(a[0]))) #define CHECKED_MALLOC( var, size )\ do {\ @@ -53,6 +54,8 @@ memset( var, 0, size );\ } while( 0 ) +#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) + #define X264_BFRAME_MAX 16 #define X264_REF_MAX 16 #define X264_THREAD_MAX 128 @@ -202,6 +205,10 @@ }; #include "x264.h" +#if HAVE_OPENCL +#include "opencl.h" +#endif +#include "cabac.h" #include "bitstream.h" #include "set.h" #include "predict.h" @@ -209,7 +216,6 @@ #include "mc.h" #include "frame.h" #include "dct.h" -#include "cabac.h" #include "quant.h" #include "cpu.h" #include "threadpool.h" @@ -291,17 +297,6 @@ return amvd0 + (amvd1<<8); } -static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max ) -{ - for( int i = 0; i < i_mvc; i++ ) - { - int mx = (mvc[i][0] + 2) >> 2; - int my = (mvc[i][1] + 2) >> 2; - dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max ); - dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max ); - } -} - extern const uint8_t x264_exp2_lut[64]; extern const float x264_log2_lut[128]; extern const float x264_log2_lz_lut[32]; @@ -614,11 +609,11 @@ /* Current MB DCT coeffs */ struct { - ALIGNED_16( dctcoef luma16x16_dc[3][16] ); + ALIGNED_N( dctcoef luma16x16_dc[3][16] ); ALIGNED_16( dctcoef chroma_dc[2][8] ); // FIXME share memory? - ALIGNED_16( dctcoef luma8x8[12][64] ); - ALIGNED_16( dctcoef luma4x4[16*3][16] ); + ALIGNED_N( dctcoef luma8x8[12][64] ); + ALIGNED_N( dctcoef luma4x4[16*3][16] ); } dct; /* MB table and cache for current frame/mb */ @@ -671,8 +666,7 @@ int mv_miny_spel_row[3]; int mv_maxy_spel_row[3]; /* Fullpel MV range for motion search */ - int mv_min_fpel[2]; - int mv_max_fpel[2]; + ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */ int mv_miny_fpel_row[3]; int mv_maxy_fpel_row[3]; @@ -758,7 +752,7 @@ #define FENC_STRIDE 16 #define FDEC_STRIDE 32 ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] ); - ALIGNED_16( pixel fdec_buf[52*FDEC_STRIDE] ); + ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] ); /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */ ALIGNED_16( pixel i4x4_fdec_buf[16*16] ); @@ -775,8 +769,8 @@ ALIGNED_16( dctcoef fenc_dct4[16][16] ); /* Psy RD SATD/SA8D scores cache */ - ALIGNED_16( uint64_t fenc_hadamard_cache[9] ); - ALIGNED_16( uint32_t fenc_satd_cache[32] ); + ALIGNED_N( uint64_t fenc_hadamard_cache[9] ); + ALIGNED_N( uint32_t fenc_satd_cache[32] ); /* pointer over mb of the frame to be compressed */ pixel *p_fenc[3]; /* y,u,v */ @@ -910,8 +904,8 @@ uint32_t (*nr_residual_sum)[64]; uint32_t *nr_count; - ALIGNED_16( udctcoef nr_offset_denoise[4][64] ); - ALIGNED_16( uint32_t nr_residual_sum_buf[2][4][64] ); + ALIGNED_N( udctcoef nr_offset_denoise[4][64] ); + ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] ); uint32_t nr_count_buf[2][4]; uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */ @@ -947,11 +941,48 @@ struct visualize_t *visualize; #endif x264_lookahead_t *lookahead; + +#if HAVE_OPENCL + x264_opencl_t opencl; +#endif }; // included at the end because it needs x264_t #include "macroblock.h" +static int ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) +{ + int cnt = 0; + for( int i = 0; i < i_mvc; i++ ) + { + int mx = (mvc[i][0] + 2) >> 2; + int my = 
(mvc[i][1] + 2) >> 2; + uint32_t mv = pack16to32_mask(mx, my); + if( !mv || mv == pmv ) continue; + dst[cnt][0] = x264_clip3( mx, mv_limit[0][0], mv_limit[1][0] ); + dst[cnt][1] = x264_clip3( my, mv_limit[0][1], mv_limit[1][1] ); + cnt++; + } + return cnt; +} + +static int ALWAYS_INLINE x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) +{ + int cnt = 0; + int qpel_limit[4] = {mv_limit[0][0] << 2, mv_limit[0][1] << 2, mv_limit[1][0] << 2, mv_limit[1][1] << 2}; + for( int i = 0; i < i_mvc; i++ ) + { + uint32_t mv = M32( mvc[i] ); + int mx = mvc[i][0]; + int my = mvc[i][1]; + if( !mv || mv == pmv ) continue; + dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] ); + dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] ); + cnt++; + } + return cnt; +} + #if ARCH_X86 || ARCH_X86_64 #include "x86/util.h" #endif
x264-snapshot-20130224-2245.tar.bz2/common/cpu.c -> x264-snapshot-20130723-2245.tar.bz2/common/cpu.c (changed)
@@ -47,18 +47,19 @@ const x264_cpu_name_t x264_cpu_names[] = { - {"Altivec", X264_CPU_ALTIVEC}, -// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore - {"MMX2", X264_CPU_MMX|X264_CPU_MMX2}, - {"MMXEXT", X264_CPU_MMX|X264_CPU_MMX2}, -// {"SSE", X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE}, // there are no sse1 functions in x264 -#define SSE2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE|X264_CPU_SSE2 +#if HAVE_MMX +// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore +// {"CMOV", X264_CPU_CMOV}, // we require this unconditionally, so don't print it +#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV + {"MMX2", MMX2}, + {"MMXEXT", MMX2}, + {"SSE", MMX2|X264_CPU_SSE}, +#define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2 {"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW}, {"SSE2", SSE2}, {"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST}, {"SSE3", SSE2|X264_CPU_SSE3}, {"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, - {"FastShuffle", SSE2|X264_CPU_SHUFFLE_IS_FAST}, {"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, {"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, {"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42}, @@ -70,19 +71,26 @@ {"FMA3", AVX|X264_CPU_FMA3}, #undef AVX #undef SSE2 +#undef MMX2 {"Cache32", X264_CPU_CACHELINE_32}, {"Cache64", X264_CPU_CACHELINE_64}, {"SSEMisalign", X264_CPU_SSE_MISALIGN}, {"LZCNT", X264_CPU_LZCNT}, {"BMI1", X264_CPU_BMI1}, {"BMI2", X264_CPU_BMI1|X264_CPU_BMI2}, - {"TBM", X264_CPU_TBM}, - {"Slow_mod4_stack", X264_CPU_STACK_MOD4}, - {"ARMv6", X264_CPU_ARMV6}, - {"NEON", X264_CPU_NEON}, - {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC}, {"SlowCTZ", X264_CPU_SLOW_CTZ}, {"SlowAtom", X264_CPU_SLOW_ATOM}, + {"SlowPshufb", X264_CPU_SLOW_PSHUFB}, + {"SlowPalignr", X264_CPU_SLOW_PALIGNR}, + {"SlowShuffle", X264_CPU_SLOW_SHUFFLE}, + {"UnalignedStack", X264_CPU_STACK_MOD4}, +#elif ARCH_PPC + {"Altivec", X264_CPU_ALTIVEC}, +#elif ARCH_ARM + {"ARMv6", X264_CPU_ARMV6}, + {"NEON", X264_CPU_NEON}, + {"FastNeonMRC", X264_CPU_FAST_NEON_MRC}, +#endif {"", 0}, }; @@ -131,9 +139,13 @@ if( edx&0x00800000 ) cpu |= X264_CPU_MMX; else - return 0; + return cpu; if( edx&0x02000000 ) cpu |= X264_CPU_MMX2|X264_CPU_SSE; + if( edx&0x00008000 ) + cpu |= X264_CPU_CMOV; + else + return cpu; if( edx&0x04000000 ) cpu |= X264_CPU_SSE2; if( ecx&0x00000001 ) @@ -170,46 +182,56 @@ if( cpu & X264_CPU_SSSE3 ) cpu |= X264_CPU_SSE2_IS_FAST; - if( cpu & X264_CPU_SSE4 ) - cpu |= X264_CPU_SHUFFLE_IS_FAST; x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx ); max_extended_cap = eax; - if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 ) + if( max_extended_cap >= 0x80000001 ) { - cpu |= X264_CPU_SLOW_CTZ; x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ); - if( edx&0x00400000 ) - cpu |= X264_CPU_MMX2; - if( cpu & X264_CPU_SSE2 ) + + if( ecx&0x00000020 ) + cpu |= X264_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */ + if( ecx&0x00000040 ) /* SSE4a, AMD only */ { - if( ecx&0x00000040 ) /* SSE4a */ + int family = ((eax>>8)&0xf) + ((eax>>20)&0xff); + cpu |= X264_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */ + if( family == 0x14 ) { - cpu |= X264_CPU_SSE2_IS_FAST; - cpu |= X264_CPU_LZCNT; - cpu |= X264_CPU_SHUFFLE_IS_FAST; - cpu &= ~X264_CPU_SLOW_CTZ; + cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... 
*/ + cpu |= X264_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */ + cpu |= X264_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */ } - else - cpu |= X264_CPU_SSE2_IS_SLOW; - - if( ecx&0x00000080 ) /* Misalign SSE */ + if( family == 0x16 ) { - cpu |= X264_CPU_SSE_MISALIGN; - x264_cpu_mask_misalign_sse(); + cpu |= X264_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough + * compared to alternate instruction sequences that this + * is equal or faster on almost all such functions. */ } + } - if( cpu & X264_CPU_AVX ) - { - if( ecx&0x00000800 ) /* XOP */ - cpu |= X264_CPU_XOP; - if( ecx&0x00010000 ) /* FMA4 */ - cpu |= X264_CPU_FMA4; - } + if( ecx&0x00000080 ) /* Misalign SSE */ + { + cpu |= X264_CPU_SSE_MISALIGN; + x264_cpu_mask_misalign_sse(); + } - if( ecx&0x00200000 ) - cpu |= X264_CPU_TBM; + if( cpu & X264_CPU_AVX ) + { + if( ecx&0x00000800 ) /* XOP */ + cpu |= X264_CPU_XOP; + if( ecx&0x00010000 ) /* FMA4 */ + cpu |= X264_CPU_FMA4; + } + + if( !strcmp((char*)vendor, "AuthenticAMD") ) + { + if( edx&0x00400000 ) + cpu |= X264_CPU_MMX2; + if( !(cpu&X264_CPU_LZCNT) ) + cpu |= X264_CPU_SLOW_CTZ; + if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) ) + cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ } } @@ -233,11 +255,12 @@ { cpu |= X264_CPU_SLOW_ATOM; cpu |= X264_CPU_SLOW_CTZ; + cpu |= X264_CPU_SLOW_PSHUFB; } - /* Some Penryns and Nehalems are pointlessly crippled (SSE4 disabled), so - * detect them here. */ - else if( model >= 23 ) - cpu |= X264_CPU_SHUFFLE_IS_FAST; + /* Conroe has a slow shuffle unit. Check the model number to make sure not + * to include crippled low-end Penryns and Nehalems that don't have SSE4. */ + else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 ) + cpu |= X264_CPU_SLOW_SHUFFLE; } }
x264-snapshot-20130224-2245.tar.bz2/common/cpu.h -> x264-snapshot-20130723-2245.tar.bz2/common/cpu.h (changed)

@@ -48,15 +48,17 @@
 void x264_cpu_mask_misalign_sse( void );
 void x264_safe_intel_cpu_indicator_init( void );
 
-/* kluge:
+/* kludge:
  * gcc can't give variables any greater alignment than the stack frame has.
- * We need 16 byte alignment for SSE2, so here we make sure that the stack is
- * aligned to 16 bytes.
+ * We need 32 byte alignment for AVX2, so here we make sure that the stack is
+ * aligned to 32 bytes.
  * gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this
  * problem, but I don't want to require such a new version.
- * This applies only to x86_32, since other architectures that need alignment
- * either have ABIs that ensure aligned stack, or don't support it at all. */
-#if ARCH_X86 && HAVE_MMX
+ * aligning to 32 bytes only works if the compiler supports keeping that
+ * alignment between functions (osdep.h handles manual alignment of arrays
+ * if it doesn't).
+ */
+#if (ARCH_X86 || HAVE_32B_STACK_ALIGNMENT) && HAVE_MMX
 int x264_stack_align( void (*func)(), ... );
 #define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
 #else
x264-snapshot-20130224-2245.tar.bz2/common/dct.c -> x264-snapshot-20130723-2245.tar.bz2/common/dct.c (changed)
@@ -640,23 +640,32 @@ dctf->add8x8_idct8 = x264_add8x8_idct8_sse2; dctf->add16x16_idct8= x264_add16x16_idct8_sse2; - dctf->sub8x8_dct = x264_sub8x8_dct_sse2; - dctf->sub16x16_dct = x264_sub16x16_dct_sse2; - dctf->add8x8_idct = x264_add8x8_idct_sse2; - dctf->add16x16_idct = x264_add16x16_idct_sse2; - dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2; + if( !(cpu&X264_CPU_SSE2_IS_SLOW) ) + { + dctf->sub8x8_dct = x264_sub8x8_dct_sse2; + dctf->sub16x16_dct = x264_sub16x16_dct_sse2; + dctf->add8x8_idct = x264_add8x8_idct_sse2; + dctf->add16x16_idct = x264_add16x16_idct_sse2; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2; + } } - if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) ) + if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) { - dctf->sub4x4_dct = x264_sub4x4_dct_ssse3; - dctf->sub8x8_dct = x264_sub8x8_dct_ssse3; - dctf->sub16x16_dct = x264_sub16x16_dct_ssse3; - dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3; - dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3; dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3; - dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3; - dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3; + if( !(cpu&X264_CPU_SLOW_ATOM) ) + { + dctf->sub4x4_dct = x264_sub4x4_dct_ssse3; + dctf->sub8x8_dct = x264_sub8x8_dct_ssse3; + dctf->sub16x16_dct = x264_sub16x16_dct_ssse3; + dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3; + dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3; + if( !(cpu&X264_CPU_SLOW_PSHUFB) ) + { + dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3; + } + } } if( cpu&X264_CPU_SSE4 ) @@ -681,6 +690,18 @@ dctf->sub8x8_dct = x264_sub8x8_dct_xop; dctf->sub16x16_dct = x264_sub16x16_dct_xop; } + + if( cpu&X264_CPU_AVX2 ) + { + dctf->add8x8_idct = x264_add8x8_idct_avx2; + dctf->add16x16_idct = x264_add16x16_idct_avx2; + dctf->sub8x8_dct = x264_sub8x8_dct_avx2; + dctf->sub16x16_dct = x264_sub16x16_dct_avx2; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2; +#if ARCH_X86_64 + dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2; +#endif + } #endif //HAVE_MMX #if HAVE_ALTIVEC @@ -951,7 +972,7 @@ pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3; pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3; - if( cpu&X264_CPU_SHUFFLE_IS_FAST ) + if( !(cpu&X264_CPU_SLOW_SHUFFLE) ) pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3; } if( cpu&X264_CPU_AVX ) @@ -962,8 +983,7 @@ pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx; pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx; #endif - if( cpu&X264_CPU_SHUFFLE_IS_FAST ) - pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; } if( cpu&X264_CPU_XOP ) { @@ -1005,7 +1025,7 @@ pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx; } - if( cpu&X264_CPU_SHUFFLE_IS_FAST ) + if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) ) { pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; @@ -1016,6 +1036,12 @@ pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; } + + if( cpu&X264_CPU_AVX2 ) + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2; + } #endif // HIGH_BIT_DEPTH #endif }
x264-snapshot-20130224-2245.tar.bz2/common/deblock.c -> x264-snapshot-20130723-2245.tar.bz2/common/deblock.c (changed)

@@ -686,6 +686,9 @@
 void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
+void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                  int mvy_limit, int bframe );
 
 void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
@@ -816,6 +819,10 @@
 #endif
         }
     }
+    if( cpu&X264_CPU_AVX2 )
+    {
+        pf->deblock_strength = x264_deblock_strength_avx2;
+    }
 }
 
 #endif
x264-snapshot-20130224-2245.tar.bz2/common/display-x11.c -> x264-snapshot-20130723-2245.tar.bz2/common/display-x11.c (changed)

@@ -49,7 +49,7 @@
         abort();
 }
 
-static void disp_init_display()
+static void disp_init_display( void )
 {
     Visual *visual;
     int dpy_class;
@@ -130,7 +130,7 @@
         XFree( shint );
 }
 
-void disp_sync()
+void disp_sync( void )
 {
     XSync( disp_display, 1 );
 }
x264-snapshot-20130224-2245.tar.bz2/common/frame.c -> x264-snapshot-20130723-2245.tar.bz2/common/frame.c (changed)
@@ -72,8 +72,18 @@ int i_mb_count = h->mb.i_mb_count; int i_stride, i_width, i_lines, luma_plane_count; int i_padv = PADV << PARAM_INTERLACED; - int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16; - int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10; + int align = 16; +#if ARCH_X86 || ARCH_X86_64 + if( h->param.cpu&X264_CPU_CACHELINE_64 ) + align = 64; + else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX2 ) + align = 32; +#endif +#if ARCH_PPC + int disalign = 1<<9; +#else + int disalign = 1<<10; +#endif CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) ); @@ -251,6 +261,10 @@ if( x264_pthread_cond_init( &frame->cv, NULL ) ) goto fail; +#if HAVE_OPENCL + frame->opencl.ocl = h->opencl.ocl; +#endif + return frame; fail: @@ -312,6 +326,9 @@ } x264_pthread_mutex_destroy( &frame->mutex ); x264_pthread_cond_destroy( &frame->cv ); +#if HAVE_OPENCL + x264_opencl_frame_delete( frame ); +#endif } x264_free( frame ); } @@ -655,6 +672,21 @@ x264_pthread_mutex_unlock( &h->mutex ); } +int x264_frame_new_slice( x264_t *h, x264_frame_t *frame ) +{ + if( h->param.i_slice_count_max ) + { + int slice_count; + if( h->param.b_sliced_threads ) + slice_count = x264_pthread_fetch_and_add( &frame->i_slice_count, 1, &frame->mutex ); + else + slice_count = frame->i_slice_count++; + if( slice_count >= h->param.i_slice_count_max ) + return -1; + } + return 0; +} + /* list operators */ void x264_frame_push( x264_frame_t **list, x264_frame_t *frame ) @@ -717,6 +749,7 @@ frame->b_scenecut = 1; frame->b_keyframe = 0; frame->b_corrupt = 0; + frame->i_slice_count = h->param.b_sliced_threads ? h->param.i_threads : 1; memset( frame->weight, 0, sizeof(frame->weight) ); memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
x264-snapshot-20130224-2245.tar.bz2/common/frame.h -> x264-snapshot-20130723-2245.tar.bz2/common/frame.h (changed)

@@ -152,6 +152,7 @@
     int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
     x264_pthread_mutex_t mutex;
     x264_pthread_cond_t cv;
+    int i_slice_count; /* Atomically written to/read from with slice threads */
 
     /* periodic intra refresh */
     float f_pir_position;
@@ -171,6 +172,10 @@
     /* user frame properties */
     uint8_t *mb_info;
     void (*mb_info_free)( void* );
+
+#if HAVE_OPENCL
+    x264_frame_opencl_t opencl;
+#endif
 } x264_frame_t;
 
 /* synchronized frame list */
@@ -230,6 +235,7 @@
 
 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed );
 void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed );
+int x264_frame_new_slice( x264_t *h, x264_frame_t *frame );
 
 void x264_threadslice_cond_broadcast( x264_t *h, int pass );
 void x264_threadslice_cond_wait( x264_t *h, int pass );
x264-snapshot-20130224-2245.tar.bz2/common/macroblock.c -> x264-snapshot-20130723-2245.tar.bz2/common/macroblock.c (changed)

@@ -122,8 +122,8 @@
         int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
         int i_mode = x264_size2pixel[height][width];
         intptr_t i_stride0 = 16, i_stride1 = 16;
-        ALIGNED_ARRAY_16( pixel, tmp0,[16*16] );
-        ALIGNED_ARRAY_16( pixel, tmp1,[16*16] );
+        ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
+        ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
         pixel *src0, *src1;
 
         MC_LUMA_BI( 0 );
@@ -387,7 +387,7 @@
     int scratch_size = 0;
     if( !b_lookahead )
     {
-        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
+        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48+32) * sizeof(int16_t);
         int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
         int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
         int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
x264-snapshot-20130224-2245.tar.bz2/common/mc.c -> x264-snapshot-20130723-2245.tar.bz2/common/mc.c (changed)

@@ -469,7 +469,7 @@
     }
 }
 
-void x264_mc_init( int cpu, x264_mc_functions_t *pf )
+void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
 {
     pf->mc_luma = mc_luma;
     pf->get_ref = get_ref;
@@ -534,6 +534,9 @@
 #if HAVE_ARMV6
     x264_mc_init_arm( cpu, pf );
 #endif
+
+    if( cpu_independent )
+        pf->mbtree_propagate_cost = mbtree_propagate_cost;
 }
 
 void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
x264-snapshot-20130224-2245.tar.bz2/common/mc.h -> x264-snapshot-20130723-2245.tar.bz2/common/mc.h (changed)

@@ -123,6 +123,6 @@
                                     uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 } x264_mc_functions_t;
 
-void x264_mc_init( int cpu, x264_mc_functions_t *pf );
+void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
 
 #endif
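Usage sketch for the widened x264_mc_init() signature (the caller below is illustrative): per the mc.c hunk above, a non-zero cpu_independent keeps mbtree_propagate_cost at the plain C implementation so lookahead cost propagation does not vary with the detected SIMD level.

#include "common/common.h"   /* x264 tree include: x264_mc_functions_t */

/* Fill the motion-compensation dispatch table; cpu_independent != 0 forces
 * the C mbtree_propagate_cost. */
static void setup_mc( int cpu, x264_mc_functions_t *mc, int cpu_independent )
{
    x264_mc_init( cpu, mc, cpu_independent );
}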
x264-snapshot-20130723-2245.tar.bz2/common/opencl (added: new directory)
x264-snapshot-20130723-2245.tar.bz2/common/opencl.c (added)
@@ -0,0 +1,718 @@ +/***************************************************************************** + * opencl.c: OpenCL initialization and kernel compilation + ***************************************************************************** + * Copyright (C) 2012-2013 x264 project + * + * Authors: Steve Borho <sborho@multicorewareinc.com> + * Anton Mitrofanov <BugMaster@narod.ru> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common.h" + +#ifdef _WIN32 +#include <windows.h> +#define ocl_open LoadLibrary( "OpenCL" ) +#define ocl_close FreeLibrary +#define ocl_address GetProcAddress +#else +#include <dlfcn.h> //dlopen, dlsym, dlclose +#if SYS_MACOSX +#define ocl_open dlopen( "/System/Library/Frameworks/OpenCL.framework/OpenCL", RTLD_NOW ) +#else +#define ocl_open dlopen( "libOpenCL.so", RTLD_NOW ) +#endif +#define ocl_close dlclose +#define ocl_address dlsym +#endif + +#define LOAD_OCL_FUNC(name, continue_on_fail)\ +{\ + ocl->name = (void*)ocl_address( ocl->library, #name );\ + if( !continue_on_fail && !ocl->name )\ + goto fail;\ +} + +/* load the library and functions we require from it */ +x264_opencl_function_t *x264_opencl_load_library( void ) +{ + x264_opencl_function_t *ocl; +#undef fail +#define fail fail0 + CHECKED_MALLOCZERO( ocl, sizeof(x264_opencl_function_t) ); +#undef fail +#define fail fail1 + ocl->library = ocl_open; + if( !ocl->library ) + goto fail; +#undef fail +#define fail fail2 + LOAD_OCL_FUNC( clBuildProgram, 0 ); + LOAD_OCL_FUNC( clCreateBuffer, 0 ); + LOAD_OCL_FUNC( clCreateCommandQueue, 0 ); + LOAD_OCL_FUNC( clCreateContext, 0 ); + LOAD_OCL_FUNC( clCreateImage2D, 0 ); + LOAD_OCL_FUNC( clCreateKernel, 0 ); + LOAD_OCL_FUNC( clCreateProgramWithBinary, 0 ); + LOAD_OCL_FUNC( clCreateProgramWithSource, 0 ); + LOAD_OCL_FUNC( clEnqueueCopyBuffer, 0 ); + LOAD_OCL_FUNC( clEnqueueMapBuffer, 0 ); + LOAD_OCL_FUNC( clEnqueueNDRangeKernel, 0 ); + LOAD_OCL_FUNC( clEnqueueReadBuffer, 0 ); + LOAD_OCL_FUNC( clEnqueueWriteBuffer, 0 ); + LOAD_OCL_FUNC( clFinish, 0 ); + LOAD_OCL_FUNC( clGetCommandQueueInfo, 0 ); + LOAD_OCL_FUNC( clGetDeviceIDs, 0 ); + LOAD_OCL_FUNC( clGetDeviceInfo, 0 ); + LOAD_OCL_FUNC( clGetKernelWorkGroupInfo, 0 ); + LOAD_OCL_FUNC( clGetPlatformIDs, 0 ); + LOAD_OCL_FUNC( clGetProgramBuildInfo, 0 ); + LOAD_OCL_FUNC( clGetProgramInfo, 0 ); + LOAD_OCL_FUNC( clGetSupportedImageFormats, 0 ); + LOAD_OCL_FUNC( clReleaseCommandQueue, 0 ); + LOAD_OCL_FUNC( clReleaseContext, 0 ); + LOAD_OCL_FUNC( clReleaseKernel, 0 ); + LOAD_OCL_FUNC( clReleaseMemObject, 0 ); + LOAD_OCL_FUNC( clReleaseProgram, 0 ); + LOAD_OCL_FUNC( clSetKernelArg, 0 ); + return ocl; +#undef fail +fail2: + ocl_close( 
ocl->library ); +fail1: + x264_free( ocl ); +fail0: + return NULL; +} + +void x264_opencl_close_library( x264_opencl_function_t *ocl ) +{ + if( !ocl ) + return; + ocl_close( ocl->library ); + x264_free( ocl ); +} + +/* define from recent cl_ext.h, copied here in case headers are old */ +#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042 + +/* Requires full include path in case of out-of-tree builds */ +#include "common/oclobj.h" + +static int x264_detect_switchable_graphics( void ); + +/* Try to load the cached compiled program binary, verify the device context is + * still valid before reuse */ +static cl_program x264_opencl_cache_load( x264_t *h, char *dev_name, char *dev_vendor, char *driver_version ) +{ + /* try to load cached program binary */ + FILE *fp = fopen( h->param.psz_clbin_file, "rb" ); + if( !fp ) + return NULL; + + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_program program = NULL; + uint8_t *binary = NULL; + + fseek( fp, 0, SEEK_END ); + size_t size = ftell( fp ); + rewind( fp ); + CHECKED_MALLOC( binary, size ); + + fread( binary, 1, size, fp ); + const uint8_t *ptr = (const uint8_t*)binary; + +#define CHECK_STRING( STR )\ + do {\ + size_t len = strlen( STR );\ + if( size <= len || strncmp( (char*)ptr, STR, len ) )\ + goto fail;\ + else {\ + size -= (len+1); ptr += (len+1);\ + }\ + } while( 0 ) + + CHECK_STRING( dev_name ); + CHECK_STRING( dev_vendor ); + CHECK_STRING( driver_version ); + CHECK_STRING( x264_opencl_source_hash ); +#undef CHECK_STRING + + cl_int status; + program = ocl->clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status ); + if( status != CL_SUCCESS ) + program = NULL; + +fail: + fclose( fp ); + x264_free( binary ); + return program; +} + +/* Save the compiled program binary to a file for later reuse. Device context + * is also saved in the cache file so we do not reuse stale binaries */ +static void x264_opencl_cache_save( x264_t *h, cl_program program, char *dev_name, char *dev_vendor, char *driver_version ) +{ + FILE *fp = fopen( h->param.psz_clbin_file, "wb" ); + if( !fp ) + { + x264_log( h, X264_LOG_INFO, "OpenCL: unable to open clbin file for write\n" ); + return; + } + + x264_opencl_function_t *ocl = h->opencl.ocl; + uint8_t *binary = NULL; + + size_t size = 0; + cl_int status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL ); + if( status != CL_SUCCESS || !size ) + { + x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary size, no cache file generated\n" ); + goto fail; + } + + CHECKED_MALLOC( binary, size ); + status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &binary, NULL ); + if( status != CL_SUCCESS ) + { + x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary, no cache file generated\n" ); + goto fail; + } + + fputs( dev_name, fp ); + fputc( '\n', fp ); + fputs( dev_vendor, fp ); + fputc( '\n', fp ); + fputs( driver_version, fp ); + fputc( '\n', fp ); + fputs( x264_opencl_source_hash, fp ); + fputc( '\n', fp ); + fwrite( binary, 1, size, fp ); + +fail: + fclose( fp ); + x264_free( binary ); + return; +} + +/* The OpenCL source under common/opencl will be merged into common/oclobj.h by + * the Makefile. It defines a x264_opencl_source byte array which we will pass + * to clCreateProgramWithSource(). We also attempt to use a cache file for the + * compiled binary, stored in the current working folder. 
*/ +static cl_program x264_opencl_compile( x264_t *h ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_program program = NULL; + char *build_log = NULL; + + char dev_name[64]; + char dev_vendor[64]; + char driver_version[64]; + cl_int status; + status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL ); + status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR, sizeof(dev_vendor), dev_vendor, NULL ); + status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driver_version), driver_version, NULL ); + if( status != CL_SUCCESS ) + return NULL; + + // Most AMD GPUs have vector registers + int vectorize = !strcmp( dev_vendor, "Advanced Micro Devices, Inc." ); + h->opencl.b_device_AMD_SI = 0; + + if( vectorize ) + { + /* Disable OpenCL on Intel/AMD switchable graphics devices */ + if( x264_detect_switchable_graphics() ) + { + x264_log( h, X264_LOG_INFO, "OpenCL acceleration disabled, switchable graphics detected\n" ); + return NULL; + } + + /* Detect AMD SouthernIsland or newer device (single-width registers) */ + cl_uint simdwidth = 4; + status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, sizeof(cl_uint), &simdwidth, NULL ); + if( status == CL_SUCCESS && simdwidth == 1 ) + { + vectorize = 0; + h->opencl.b_device_AMD_SI = 1; + } + } + + x264_log( h, X264_LOG_INFO, "OpenCL acceleration enabled with %s %s %s\n", dev_vendor, dev_name, h->opencl.b_device_AMD_SI ? "(SI)" : "" ); + + program = x264_opencl_cache_load( h, dev_name, dev_vendor, driver_version ); + if( !program ) + { + /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */ + x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" ); + const char *strptr = (const char*)x264_opencl_source; + size_t size = sizeof(x264_opencl_source); + program = ocl->clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status ); + if( status != CL_SUCCESS || !program ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: unable to create program\n" ); + return NULL; + } + } + + /* Build the program binary for the OpenCL device */ + const char *buildopts = vectorize ? "-DVECTORIZE=1" : ""; + status = ocl->clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL ); + if( status == CL_SUCCESS ) + { + x264_opencl_cache_save( h, program, dev_name, dev_vendor, driver_version ); + return program; + } + + /* Compile failure, should not happen with production code. 
*/ + + size_t build_log_len = 0; + status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_len ); + if( status != CL_SUCCESS || !build_log_len ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to query build log\n" ); + goto fail; + } + + build_log = x264_malloc( build_log_len ); + if( !build_log ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to alloc build log\n" ); + goto fail; + } + + status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL ); + if( status != CL_SUCCESS ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to get build log\n" ); + goto fail; + } + + FILE *log_file = fopen( "x264_kernel_build_log.txt", "w" ); + if( !log_file ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to create file x264_kernel_build_log.txt\n" ); + goto fail; + } + fwrite( build_log, 1, build_log_len, log_file ); + fclose( log_file ); + x264_log( h, X264_LOG_WARNING, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" ); + +fail: + x264_free( build_log ); + if( program ) + ocl->clReleaseProgram( program ); + return NULL; +} + +static int x264_opencl_lookahead_alloc( x264_t *h ) +{ + if( !h->param.rc.i_lookahead ) + return -1; + + static const char *kernelnames[] = { + "mb_intra_cost_satd_8x8", + "sum_intra_cost", + "downscale_hpel", + "downscale1", + "downscale2", + "memset_int16", + "weightp_scaled_images", + "weightp_hpel", + "hierarchical_motion", + "subpel_refine", + "mode_selection", + "sum_inter_cost" + }; + + cl_kernel *kernels[] = { + &h->opencl.intra_kernel, + &h->opencl.rowsum_intra_kernel, + &h->opencl.downscale_hpel_kernel, + &h->opencl.downscale_kernel1, + &h->opencl.downscale_kernel2, + &h->opencl.memset_kernel, + &h->opencl.weightp_scaled_images_kernel, + &h->opencl.weightp_hpel_kernel, + &h->opencl.hme_kernel, + &h->opencl.subpel_refine_kernel, + &h->opencl.mode_select_kernel, + &h->opencl.rowsum_inter_kernel + }; + + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_int status; + + h->opencl.lookahead_program = x264_opencl_compile( h ); + if( !h->opencl.lookahead_program ) + goto fail; + + for( int i = 0; i < ARRAY_SIZE(kernelnames); i++ ) + { + *kernels[i] = ocl->clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status ); + if( status != CL_SUCCESS ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to compile kernel '%s' (%d)\n", kernelnames[i], status ); + goto fail; + } + } + + h->opencl.page_locked_buffer = ocl->clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status ); + if( status != CL_SUCCESS ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate page-locked buffer, error '%d'\n", status ); + goto fail; + } + h->opencl.page_locked_ptr = ocl->clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, + 0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status ); + if( status != CL_SUCCESS ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to map page-locked buffer, error '%d'\n", status ); + goto fail; + } + + return 0; +fail: + x264_opencl_lookahead_delete( h ); + return -1; +} + +static void CL_CALLBACK x264_opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data ) +{ + /* Any error notification can be assumed to be fatal to the OpenCL context. 
+ * We need to stop using it immediately to prevent further damage. */ + x264_t *h = (x264_t*)user_data; + h->param.b_opencl = 0; + h->opencl.b_fatal_error = 1; + x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo ); + x264_log( h, X264_LOG_ERROR, "OpenCL: fatal error, aborting encode\n" ); +} + +int x264_opencl_lookahead_init( x264_t *h ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_platform_id *platforms = NULL; + cl_device_id *devices = NULL; + cl_image_format *imageType = NULL; + cl_context context = NULL; + int ret = -1; + + cl_uint numPlatforms = 0; + cl_int status = ocl->clGetPlatformIDs( 0, NULL, &numPlatforms ); + if( status != CL_SUCCESS || !numPlatforms ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" ); + goto fail; + } + platforms = (cl_platform_id*)x264_malloc( sizeof(cl_platform_id) * numPlatforms ); + if( !platforms ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: malloc of installed platforms buffer failed\n" ); + goto fail; + } + status = ocl->clGetPlatformIDs( numPlatforms, platforms, NULL ); + if( status != CL_SUCCESS ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" ); + goto fail; + } + + /* Select the first OpenCL platform with a GPU device that supports our + * required image (texture) formats */ + for( cl_uint i = 0; i < numPlatforms; i++ ) + { + cl_uint gpu_count = 0; + status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count ); + if( status != CL_SUCCESS || !gpu_count ) + continue; + + x264_free( devices ); + devices = x264_malloc( sizeof(cl_device_id) * gpu_count ); + if( !devices ) + continue; + + status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, gpu_count, devices, NULL ); + if( status != CL_SUCCESS ) + continue; + + /* Find a GPU device that supports our image formats */ + for( cl_uint gpu = 0; gpu < gpu_count; gpu++ ) + { + h->opencl.device = devices[gpu]; + + /* if the user has specified an exact device ID, skip all other + * GPUs. If this device matches, allow it to continue through the + * checks for supported images, etc. 
*/ + if( h->param.opencl_device_id && devices[gpu] != (cl_device_id)h->param.opencl_device_id ) + continue; + + cl_bool image_support = 0; + status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL ); + if( status != CL_SUCCESS || !image_support ) + continue; + + if( context ) + ocl->clReleaseContext( context ); + context = ocl->clCreateContext( NULL, 1, &h->opencl.device, (void*)x264_opencl_error_notify, (void*)h, &status ); + if( status != CL_SUCCESS || !context ) + continue; + + cl_uint imagecount = 0; + status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 0, NULL, &imagecount ); + if( status != CL_SUCCESS || !imagecount ) + continue; + + x264_free( imageType ); + imageType = x264_malloc( sizeof(cl_image_format) * imagecount ); + if( !imageType ) + continue; + + status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, imagecount, imageType, NULL ); + if( status != CL_SUCCESS ) + continue; + + int b_has_r = 0; + int b_has_rgba = 0; + for( cl_uint j = 0; j < imagecount; j++ ) + { + if( imageType[j].image_channel_order == CL_R && + imageType[j].image_channel_data_type == CL_UNSIGNED_INT32 ) + b_has_r = 1; + else if( imageType[j].image_channel_order == CL_RGBA && + imageType[j].image_channel_data_type == CL_UNSIGNED_INT8 ) + b_has_rgba = 1; + } + if( !b_has_r || !b_has_rgba ) + { + char dev_name[64]; + status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL ); + if( status == CL_SUCCESS ) + { + /* emit warning if we are discarding the user's explicit choice */ + int level = h->param.opencl_device_id ? X264_LOG_WARNING : X264_LOG_DEBUG; + x264_log( h, level, "OpenCL: %s does not support required image formats\n", dev_name ); + } + continue; + } + + /* user selection of GPU device, skip N first matches */ + if( h->param.i_opencl_device ) + { + h->param.i_opencl_device--; + continue; + } + + h->opencl.queue = ocl->clCreateCommandQueue( context, h->opencl.device, 0, &status ); + if( status != CL_SUCCESS || !h->opencl.queue ) + continue; + + h->opencl.context = context; + context = NULL; + + ret = 0; + break; + } + + if( !ret ) + break; + } + + if( !h->param.psz_clbin_file ) + h->param.psz_clbin_file = "x264_lookahead.clbin"; + + if( ret ) + x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to find a compatible device\n" ); + else + ret = x264_opencl_lookahead_alloc( h ); + +fail: + if( context ) + ocl->clReleaseContext( context ); + x264_free( imageType ); + x264_free( devices ); + x264_free( platforms ); + return ret; +} + +static void x264_opencl_lookahead_free( x264_t *h ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + +#define RELEASE( a, f ) do { if( a ) { ocl->f( a ); a = NULL; } } while( 0 ) + RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel ); + RELEASE( h->opencl.downscale_kernel1, clReleaseKernel ); + RELEASE( h->opencl.downscale_kernel2, clReleaseKernel ); + RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel ); + RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel ); + RELEASE( h->opencl.memset_kernel, clReleaseKernel ); + RELEASE( h->opencl.intra_kernel, clReleaseKernel ); + RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel ); + RELEASE( h->opencl.hme_kernel, clReleaseKernel ); + RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel ); + RELEASE( h->opencl.mode_select_kernel, clReleaseKernel ); + RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel ); + + RELEASE( 
h->opencl.lookahead_program, clReleaseProgram ); + + RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject ); + RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject ); + RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject ); + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject ); + RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject ); + RELEASE( h->opencl.row_satds[0], clReleaseMemObject ); + RELEASE( h->opencl.row_satds[1], clReleaseMemObject ); + RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject ); + RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject ); + RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject ); + RELEASE( h->opencl.mvp_buffer, clReleaseMemObject ); + RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject ); + RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject ); + RELEASE( h->opencl.frame_stats[0], clReleaseMemObject ); + RELEASE( h->opencl.frame_stats[1], clReleaseMemObject ); +#undef RELEASE +} + +void x264_opencl_lookahead_delete( x264_t *h ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + + if( !ocl ) + return; + + if( h->opencl.queue ) + ocl->clFinish( h->opencl.queue ); + + x264_opencl_lookahead_free( h ); + + if( h->opencl.queue ) + { + ocl->clReleaseCommandQueue( h->opencl.queue ); + h->opencl.queue = NULL; + } + if( h->opencl.context ) + { + ocl->clReleaseContext( h->opencl.context ); + h->opencl.context = NULL; + } +} + +void x264_opencl_frame_delete( x264_frame_t *frame ) +{ + x264_opencl_function_t *ocl = frame->opencl.ocl; + + if( !ocl ) + return; + +#define RELEASEBUF(mem) do { if( mem ) { ocl->clReleaseMemObject( mem ); mem = NULL; } } while( 0 ) + for( int j = 0; j < NUM_IMAGE_SCALES; j++ ) + RELEASEBUF( frame->opencl.scaled_image2Ds[j] ); + RELEASEBUF( frame->opencl.luma_hpel ); + RELEASEBUF( frame->opencl.inv_qscale_factor ); + RELEASEBUF( frame->opencl.intra_cost ); + RELEASEBUF( frame->opencl.lowres_mvs0 ); + RELEASEBUF( frame->opencl.lowres_mvs1 ); + RELEASEBUF( frame->opencl.lowres_mv_costs0 ); + RELEASEBUF( frame->opencl.lowres_mv_costs1 ); +#undef RELEASEBUF +} + +/* OpenCL misbehaves on hybrid laptops with Intel iGPU and AMD dGPU, so + * we consult AMD's ADL interface to detect this situation and disable + * OpenCL on these machines (Linux and Windows) */ +#ifdef _WIN32 +#define ADL_API_CALL +#define ADL_CALLBACK __stdcall +#define adl_close FreeLibrary +#define adl_address GetProcAddress +#else +#define ADL_API_CALL +#define ADL_CALLBACK +#define adl_close dlclose +#define adl_address dlsym +#endif + +typedef void* ( ADL_CALLBACK *ADL_MAIN_MALLOC_CALLBACK )( int ); +typedef int ( ADL_API_CALL *ADL_MAIN_CONTROL_CREATE )( ADL_MAIN_MALLOC_CALLBACK, int ); +typedef int ( ADL_API_CALL *ADL_ADAPTER_NUMBEROFADAPTERS_GET )( int * ); +typedef int ( ADL_API_CALL *ADL_POWERXPRESS_SCHEME_GET )( int, int *, int *, int * ); +typedef int ( ADL_API_CALL *ADL_MAIN_CONTROL_DESTROY )( void ); + +#define ADL_OK 0 +#define ADL_PX_SCHEME_DYNAMIC 2 + +static void* ADL_CALLBACK adl_malloc_wrapper( int iSize ) +{ + return x264_malloc( iSize ); +} + +static int x264_detect_switchable_graphics( void ) +{ + void *hDLL; + ADL_MAIN_CONTROL_CREATE ADL_Main_Control_Create; + ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get; + ADL_POWERXPRESS_SCHEME_GET ADL_PowerXpress_Scheme_Get; + ADL_MAIN_CONTROL_DESTROY ADL_Main_Control_Destroy; + int ret = 0; + +#ifdef _WIN32 + hDLL = LoadLibrary( "atiadlxx.dll" ); + if( !hDLL ) + hDLL = LoadLibrary( "atiadlxy.dll" ); +#else + 
hDLL = dlopen( "libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL ); +#endif + if( !hDLL ) + goto fail0; + + ADL_Main_Control_Create = (ADL_MAIN_CONTROL_CREATE)adl_address(hDLL, "ADL_Main_Control_Create"); + ADL_Main_Control_Destroy = (ADL_MAIN_CONTROL_DESTROY)adl_address(hDLL, "ADL_Main_Control_Destroy"); + ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET)adl_address(hDLL, "ADL_Adapter_NumberOfAdapters_Get"); + ADL_PowerXpress_Scheme_Get = (ADL_POWERXPRESS_SCHEME_GET)adl_address(hDLL, "ADL_PowerXpress_Scheme_Get"); + if( !ADL_Main_Control_Destroy || !ADL_Main_Control_Destroy || !ADL_Adapter_NumberOfAdapters_Get || + !ADL_PowerXpress_Scheme_Get ) + goto fail1; + + if( ADL_OK != ADL_Main_Control_Create( adl_malloc_wrapper, 1 ) ) + goto fail1; + + int numAdapters = 0; + if( ADL_OK != ADL_Adapter_NumberOfAdapters_Get( &numAdapters ) ) + goto fail2; + + for( int i = 0; i < numAdapters; i++ ) + { + int PXSchemeRange, PXSchemeCurrentState, PXSchemeDefaultState; + if( ADL_OK != ADL_PowerXpress_Scheme_Get( i, &PXSchemeRange, &PXSchemeCurrentState, &PXSchemeDefaultState) ) + break; + + if( PXSchemeRange >= ADL_PX_SCHEME_DYNAMIC ) + { + ret = 1; + break; + } + } + +fail2: + ADL_Main_Control_Destroy(); +fail1: + adl_close( hDLL ); +fail0: + return ret; +}
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl.h
Added
@@ -0,0 +1,804 @@ +/***************************************************************************** + * opencl.h: OpenCL structures and defines + ***************************************************************************** + * Copyright (C) 2012-2013 x264 project + * + * Authors: Steve Borho <sborho@multicorewareinc.com> + * Anton Mitrofanov <BugMaster@narod.ru> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_OPENCL_H +#define X264_OPENCL_H + +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS +#include "extras/cl.h" + +#define OCL_API(ret, attr, name) typedef ret (attr *name##_func) + +/* Platform API */ +OCL_API(cl_int, CL_API_CALL, clGetPlatformIDs) +( cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + +OCL_API(cl_int, CL_API_CALL, clGetPlatformInfo) +( cl_platform_id /* platform */, + cl_platform_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Device APIs */ +OCL_API(cl_int, CL_API_CALL, clGetDeviceIDs) +( cl_platform_id /* platform */, + cl_device_type /* device_type */, + cl_uint /* num_entries */, + cl_device_id * /* devices */, + cl_uint * /* num_devices */); + +OCL_API(cl_int, CL_API_CALL, clGetDeviceInfo) +( cl_device_id /* device */, + cl_device_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_int, CL_API_CALL, clCreateSubDevices) +( cl_device_id /* in_device */, + const cl_device_partition_property * /* properties */, + cl_uint /* num_devices */, + cl_device_id * /* out_devices */, + cl_uint * /* num_devices_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainDevice) +( cl_device_id /* device */); + +OCL_API(cl_int, CL_API_CALL, clReleaseDevice) +( cl_device_id /* device */); + +/* Context APIs */ +OCL_API(cl_context, CL_API_CALL, clCreateContext) +( const cl_context_properties * /* properties */, + cl_uint /* num_devices */, + const cl_device_id * /* devices */, + void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */); + +OCL_API(cl_context, CL_API_CALL, clCreateContextFromType) +( const cl_context_properties * /* properties */, + cl_device_type /* device_type */, + void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainContext) +( cl_context /* context */); + +OCL_API(cl_int, CL_API_CALL, clReleaseContext) +( cl_context /* context */); + +OCL_API(cl_int, CL_API_CALL, 
clGetContextInfo) +( cl_context /* context */, + cl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Command Queue APIs */ +OCL_API(cl_command_queue, CL_API_CALL, clCreateCommandQueue) +( cl_context /* context */, + cl_device_id /* device */, + cl_command_queue_properties /* properties */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainCommandQueue) +( cl_command_queue /* command_queue */); + +OCL_API(cl_int, CL_API_CALL, clReleaseCommandQueue) +( cl_command_queue /* command_queue */); + +OCL_API(cl_int, CL_API_CALL, clGetCommandQueueInfo) +( cl_command_queue /* command_queue */, + cl_command_queue_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Memory Object APIs */ +OCL_API(cl_mem, CL_API_CALL, clCreateBuffer) +( cl_context /* context */, + cl_mem_flags /* flags */, + size_t /* size */, + void * /* host_ptr */, + cl_int * /* errcode_ret */); + +OCL_API(cl_mem, CL_API_CALL, clCreateSubBuffer) +( cl_mem /* buffer */, + cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */); + +OCL_API(cl_mem, CL_API_CALL, clCreateImage) +( cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + const cl_image_desc * /* image_desc */, + void * /* host_ptr */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainMemObject) +( cl_mem /* memobj */); + +OCL_API(cl_int, CL_API_CALL, clReleaseMemObject) +( cl_mem /* memobj */); + +OCL_API(cl_int, CL_API_CALL, clGetSupportedImageFormats) +( cl_context /* context */, + cl_mem_flags /* flags */, + cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, + cl_image_format * /* image_formats */, + cl_uint * /* num_image_formats */); + +OCL_API(cl_int, CL_API_CALL, clGetMemObjectInfo) +( cl_mem /* memobj */, + cl_mem_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_int, CL_API_CALL, clGetImageInfo) +( cl_mem /* image */, + cl_image_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_int, CL_API_CALL, clSetMemObjectDestructorCallback) +( cl_mem /* memobj */, + void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ); + +/* Sampler APIs */ +OCL_API(cl_sampler, CL_API_CALL, clCreateSampler) +( cl_context /* context */, + cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, + cl_filter_mode /* filter_mode */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainSampler) +( cl_sampler /* sampler */); + +OCL_API(cl_int, CL_API_CALL, clReleaseSampler) +( cl_sampler /* sampler */); + +OCL_API(cl_int, CL_API_CALL, clGetSamplerInfo) +( cl_sampler /* sampler */, + cl_sampler_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Program Object APIs */ +OCL_API(cl_program, CL_API_CALL, clCreateProgramWithSource) +( cl_context /* context */, + cl_uint /* count */, + const char ** /* strings */, + const size_t * /* lengths */, + cl_int * /* errcode_ret */); + +OCL_API(cl_program, CL_API_CALL, clCreateProgramWithBinary) +( cl_context /* context */, + cl_uint /* num_devices */, + const 
cl_device_id * /* device_list */, + const size_t * /* lengths */, + const unsigned char ** /* binaries */, + cl_int * /* binary_status */, + cl_int * /* errcode_ret */); + +OCL_API(cl_program, CL_API_CALL, clCreateProgramWithBuiltInKernels) +( cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* kernel_names */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainProgram) +( cl_program /* program */); + +OCL_API(cl_int, CL_API_CALL, clReleaseProgram) +( cl_program /* program */); + +OCL_API(cl_int, CL_API_CALL, clBuildProgram) +( cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */); + +OCL_API(cl_int, CL_API_CALL, clCompileProgram) +( cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_headers */, + const cl_program * /* input_headers */, + const char ** /* header_include_names */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */); + +OCL_API(cl_program, CL_API_CALL, clLinkProgram) +( cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_programs */, + const cl_program * /* input_programs */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */, + cl_int * /* errcode_ret */ ); + + +OCL_API(cl_int, CL_API_CALL, clUnloadPlatformCompiler) +( cl_platform_id /* platform */); + +OCL_API(cl_int, CL_API_CALL, clGetProgramInfo) +( cl_program /* program */, + cl_program_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_int, CL_API_CALL, clGetProgramBuildInfo) +( cl_program /* program */, + cl_device_id /* device */, + cl_program_build_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Kernel Object APIs */ +OCL_API(cl_kernel, CL_API_CALL, clCreateKernel) +( cl_program /* program */, + const char * /* kernel_name */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clCreateKernelsInProgram) +( cl_program /* program */, + cl_uint /* num_kernels */, + cl_kernel * /* kernels */, + cl_uint * /* num_kernels_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainKernel) +( cl_kernel /* kernel */); + +OCL_API(cl_int, CL_API_CALL, clReleaseKernel) +( cl_kernel /* kernel */); + +OCL_API(cl_int, CL_API_CALL, clSetKernelArg) +( cl_kernel /* kernel */, + cl_uint /* arg_index */, + size_t /* arg_size */, + const void * /* arg_value */); + +OCL_API(cl_int, CL_API_CALL, clGetKernelInfo) +( cl_kernel /* kernel */, + cl_kernel_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_int, CL_API_CALL, clGetKernelArgInfo) +( cl_kernel /* kernel */, + cl_uint /* arg_indx */, + cl_kernel_arg_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_int, CL_API_CALL, clGetKernelWorkGroupInfo) +( cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_work_group_info /* param_name */, + size_t /* 
param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Event Object APIs */ +OCL_API(cl_int, CL_API_CALL, clWaitForEvents) +( cl_uint /* num_events */, + const cl_event * /* event_list */); + +OCL_API(cl_int, CL_API_CALL, clGetEventInfo) +( cl_event /* event */, + cl_event_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +OCL_API(cl_event, CL_API_CALL, clCreateUserEvent) +( cl_context /* context */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clRetainEvent) +( cl_event /* event */); + +OCL_API(cl_int, CL_API_CALL, clReleaseEvent) +( cl_event /* event */); + +OCL_API(cl_int, CL_API_CALL, clSetUserEventStatus) +( cl_event /* event */, + cl_int /* execution_status */); + +OCL_API(cl_int, CL_API_CALL, clSetEventCallback) +( cl_event /* event */, + cl_int /* command_exec_callback_type */, + void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */); + +/* Profiling APIs */ +OCL_API(cl_int, CL_API_CALL, clGetEventProfilingInfo) +( cl_event /* event */, + cl_profiling_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */); + +/* Flush and Finish APIs */ +OCL_API(cl_int, CL_API_CALL, clFlush) +( cl_command_queue /* command_queue */); + +OCL_API(cl_int, CL_API_CALL, clFinish) +( cl_command_queue /* command_queue */); + +/* Enqueued Commands APIs */ +OCL_API(cl_int, CL_API_CALL, clEnqueueReadBuffer) +( cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + size_t /* offset */, + size_t /* size */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueReadBufferRect) +( cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueWriteBuffer) +( cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + size_t /* offset */, + size_t /* size */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueWriteBufferRect) +( cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueFillBuffer) +( cl_command_queue /* command_queue */, + cl_mem /* buffer */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, 
clEnqueueCopyBuffer) +( cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + size_t /* src_offset */, + size_t /* dst_offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueCopyBufferRect) +( cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin */, + const size_t * /* dst_origin */, + const size_t * /* region */, + size_t /* src_row_pitch */, + size_t /* src_slice_pitch */, + size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueReadImage) +( cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_read */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* row_pitch */, + size_t /* slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueWriteImage) +( cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_write */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueFillImage) +( cl_command_queue /* command_queue */, + cl_mem /* image */, + const void * /* fill_color */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueCopyImage) +( cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_image */, + const size_t * /* src_origin[3] */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueCopyImageToBuffer) +( cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin[3] */, + const size_t * /* region[3] */, + size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueCopyBufferToImage) +( cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_image */, + size_t /* src_offset */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(void *, CL_API_CALL, clEnqueueMapBuffer) +( cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */); + +OCL_API(void *, CL_API_CALL, clEnqueueMapImage) +( cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags 
*/, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t * /* image_row_pitch */, + size_t * /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueUnmapMemObject) +( cl_command_queue /* command_queue */, + cl_mem /* memobj */, + void * /* mapped_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueMigrateMemObjects) +( cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueNDRangeKernel) +( cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueTask) +( cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueNativeKernel) +( cl_command_queue /* command_queue */, + void (CL_CALLBACK * /*user_func*/)(void *), + void * /* args */, + size_t /* cb_args */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_list */, + const void ** /* args_mem_loc */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueMarkerWithWaitList) +( cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueBarrierWithWaitList) +( cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); + + +/* Extension function access +* +* Returns the extension function address for the given function name, +* or NULL if a valid function can not be found. The client must +* check to make sure the address is not NULL, before using or +* calling the returned function address. 
+*/ +OCL_API(void *, CL_API_CALL, clGetExtensionFunctionAddressForPlatform) +( cl_platform_id /* platform */, + const char * /* func_name */); + + +// Deprecated OpenCL 1.1 APIs +OCL_API(cl_mem, CL_API_CALL, clCreateImage2D) +( cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_row_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */); + +OCL_API(cl_mem, CL_API_CALL, clCreateImage3D) +( cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_depth */, + size_t /* image_row_pitch */, + size_t /* image_slice_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueMarker) +( cl_command_queue /* command_queue */, + cl_event * /* event */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueWaitForEvents) +( cl_command_queue /* command_queue */, + cl_uint /* num_events */, + const cl_event * /* event_list */); + +OCL_API(cl_int, CL_API_CALL, clEnqueueBarrier) +( cl_command_queue /* command_queue */); + +OCL_API(cl_int, CL_API_CALL, clUnloadCompiler) +( void); + +OCL_API(void *, CL_API_CALL, clGetExtensionFunctionAddress) +( const char * /* func_name */); + +#define OCL_DECLARE_FUNC(name) name##_func name + +typedef struct +{ + void *library; + + OCL_DECLARE_FUNC( clBuildProgram ); + OCL_DECLARE_FUNC( clCreateBuffer ); + OCL_DECLARE_FUNC( clCreateCommandQueue ); + OCL_DECLARE_FUNC( clCreateContext ); + OCL_DECLARE_FUNC( clCreateImage2D ); + OCL_DECLARE_FUNC( clCreateKernel ); + OCL_DECLARE_FUNC( clCreateProgramWithBinary ); + OCL_DECLARE_FUNC( clCreateProgramWithSource ); + OCL_DECLARE_FUNC( clEnqueueCopyBuffer ); + OCL_DECLARE_FUNC( clEnqueueMapBuffer ); + OCL_DECLARE_FUNC( clEnqueueNDRangeKernel ); + OCL_DECLARE_FUNC( clEnqueueReadBuffer ); + OCL_DECLARE_FUNC( clEnqueueWriteBuffer ); + OCL_DECLARE_FUNC( clFinish ); + OCL_DECLARE_FUNC( clGetCommandQueueInfo ); + OCL_DECLARE_FUNC( clGetDeviceIDs ); + OCL_DECLARE_FUNC( clGetDeviceInfo ); + OCL_DECLARE_FUNC( clGetKernelWorkGroupInfo ); + OCL_DECLARE_FUNC( clGetPlatformIDs ); + OCL_DECLARE_FUNC( clGetProgramBuildInfo ); + OCL_DECLARE_FUNC( clGetProgramInfo ); + OCL_DECLARE_FUNC( clGetSupportedImageFormats ); + OCL_DECLARE_FUNC( clReleaseCommandQueue ); + OCL_DECLARE_FUNC( clReleaseContext ); + OCL_DECLARE_FUNC( clReleaseKernel ); + OCL_DECLARE_FUNC( clReleaseMemObject ); + OCL_DECLARE_FUNC( clReleaseProgram ); + OCL_DECLARE_FUNC( clSetKernelArg ); +} x264_opencl_function_t; + +/* Number of downscale resolutions to use for motion search */ +#define NUM_IMAGE_SCALES 4 + +/* Number of PCIe copies that can be queued before requiring a flush */ +#define MAX_FINISH_COPIES 1024 + +/* Size (in bytes) of the page-locked buffer used for PCIe xfers */ +#define PAGE_LOCKED_BUF_SIZE 32 * 1024 * 1024 + +typedef struct +{ + x264_opencl_function_t *ocl; + + cl_context context; + cl_device_id device; + cl_command_queue queue; + + cl_program lookahead_program; + cl_int last_buf; + + cl_mem page_locked_buffer; + char *page_locked_ptr; + int pl_occupancy; + + struct + { + void *src; + void *dest; + int bytes; + } copies[MAX_FINISH_COPIES]; + int num_copies; + + int b_device_AMD_SI; + int b_fatal_error; + int lookahead_thread_pri; + int opencl_thread_pri; + + /* downscale lowres luma */ + cl_kernel downscale_hpel_kernel; + cl_kernel downscale_kernel1; + cl_kernel 
downscale_kernel2; + cl_mem luma_16x16_image[2]; + + /* weightp filtering */ + cl_kernel weightp_hpel_kernel; + cl_kernel weightp_scaled_images_kernel; + cl_mem weighted_scaled_images[NUM_IMAGE_SCALES]; + cl_mem weighted_luma_hpel; + + /* intra */ + cl_kernel memset_kernel; + cl_kernel intra_kernel; + cl_kernel rowsum_intra_kernel; + cl_mem row_satds[2]; + + /* hierarchical motion estimation */ + cl_kernel hme_kernel; + cl_kernel subpel_refine_kernel; + cl_mem mv_buffers[2]; + cl_mem lowres_mv_costs; + cl_mem mvp_buffer; + + /* bidir */ + cl_kernel mode_select_kernel; + cl_kernel rowsum_inter_kernel; + cl_mem lowres_costs[2]; + cl_mem frame_stats[2]; /* cost_est, cost_est_aq, intra_mbs */ +} x264_opencl_t; + +typedef struct +{ + x264_opencl_function_t *ocl; + + cl_mem scaled_image2Ds[NUM_IMAGE_SCALES]; + cl_mem luma_hpel; + cl_mem inv_qscale_factor; + cl_mem intra_cost; + cl_mem lowres_mvs0; + cl_mem lowres_mvs1; + cl_mem lowres_mv_costs0; + cl_mem lowres_mv_costs1; +} x264_frame_opencl_t; + +typedef struct x264_frame x264_frame; + +x264_opencl_function_t *x264_opencl_load_library( void ); +void x264_opencl_close_library( x264_opencl_function_t *ocl ); + +int x264_opencl_lookahead_init( x264_t *h ); +void x264_opencl_lookahead_delete( x264_t *h ); + +void x264_opencl_frame_delete( x264_frame *frame ); + +#endif
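The header above avoids a hard link against libOpenCL: OCL_API() declares a <name>_func pointer typedef for every entry point, and OCL_DECLARE_FUNC() adds a member of that type, named after the symbol, to x264_opencl_function_t, which x264_opencl_load_library() then fills through dlsym()/GetProcAddress(). A reduced illustration of what the two macros expand to, assuming the Khronos CL/cl.h header is available (x264 bundles its own copy as extras/cl.h) and with the struct name shortened:

#include <stddef.h>
#include <CL/cl.h>

/* OCL_API(cl_int, CL_API_CALL, clGetPlatformIDs)( ... ); expands to: */
typedef cl_int (CL_API_CALL *clGetPlatformIDs_func)( cl_uint num_entries,
                                                     cl_platform_id *platforms,
                                                     cl_uint *num_platforms );

/* OCL_DECLARE_FUNC( clGetPlatformIDs ); inside the struct expands to a member
 * named after the symbol, so call sites read like the ordinary OpenCL API: */
typedef struct
{
    void *library;                           /* handle from dlopen()/LoadLibrary() */
    clGetPlatformIDs_func clGetPlatformIDs;  /* used as ocl->clGetPlatformIDs( ... ) */
} mini_opencl_function_t;

/* example call through the table; the pointer stays NULL until a loader fills it in */
static cl_uint count_platforms( const mini_opencl_function_t *ocl )
{
    cl_uint n = 0;
    if( !ocl->clGetPlatformIDs || ocl->clGetPlatformIDs( 0, NULL, &n ) != CL_SUCCESS )
        n = 0;
    return n;
}

int main( void )
{
    mini_opencl_function_t ocl = { 0 };   /* no runtime loaded: count is 0 */
    return (int)count_platforms( &ocl );
}

Because each member keeps the name of the real symbol, the lookahead code reads like ordinary OpenCL calls while the encoder still starts cleanly on machines that have no OpenCL runtime installed.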
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/bidir.cl
Added
@@ -0,0 +1,265 @@ +/* Mode selection routines, select the least SATD cost mode for each lowres + * macroblock. When measuring B slices, this includes measuring the cost of + * three bidir modes. */ + +/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */ +int bidir_satd_8x8_ii_coop4( read_only image2d_t fenc_lowres, + int2 fencpos, + read_only image2d_t fref0_planes, + int2 qpos0, + read_only image2d_t fref1_planes, + int2 qpos1, + int weight, + local sum2_t *tmpp, + int idx ) +{ + volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp; + sum2_t b0, b1, b2, b3; + sum2_t sum = 0; + + // fencpos is full-pel position of original MB + // qpos0 is qpel position within reference frame 0 + // qpos1 is qpel position within reference frame 1 + + int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2); + int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2); + + int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1)); + int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2); + int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2); + + int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2); + int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2); + + int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1)); + int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2); + int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2); + + uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B; + uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B; + + uint vA, vB; + uint enc, ref0, ref1; + uint a0, a1; + const int weight2 = 64 - weight; + +#define READ_BIDIR_DIFF( OUT, X )\ + enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\ + vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\ + vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\ + ref0 = rhadd( vA, vB );\ + vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\ + vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\ + ref1 = rhadd( vA, vB );\ + OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6); + +#define READ_DIFF_EX( OUT, a, b )\ + READ_BIDIR_DIFF( a0, a );\ + READ_BIDIR_DIFF( a1, b );\ + OUT = a0 + (a1<<BITS_PER_SUM); + +#define ROW_8x4_SATD( a, b, c )\ + fencpos.y += a;\ + fref0Apos.y += b;\ + fref0Bpos.y += b;\ + fref1Apos.y += c;\ + fref1Bpos.y += c;\ + READ_DIFF_EX( b0, 0, 4 );\ + READ_DIFF_EX( b1, 1, 5 );\ + READ_DIFF_EX( b2, 2, 6 );\ + READ_DIFF_EX( b3, 3, 7 );\ + HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\ + HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\ + sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 ); + + ROW_8x4_SATD( 0, 0, 0 ); + ROW_8x4_SATD( 4, 4, 4 ); + +#undef READ_BIDIR_DIFF +#undef READ_DIFF_EX +#undef ROW_8x4_SATD + + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1; +} + +/* + * mode selection - pick the least cost partition type for each 8x8 macroblock. + * Intra, list0 or list1. When measuring a B slice, also test three bidir + * possibilities. + * + * fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that + * hold many frames worth of motion vectors. 
We must offset into the correct + * location for this frame's vectors: + * + * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1] + * GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count] + * + * global launch dimensions for P slice estimate: [mb_width, mb_height] + * global launch dimensions for B slice estimate: [mb_width * 4, mb_height] + */ +kernel void mode_selection( read_only image2d_t fenc_lowres, + read_only image2d_t fref0_planes, + read_only image2d_t fref1_planes, + const global short2 *fenc_lowres_mvs0, + const global short2 *fenc_lowres_mvs1, + const global short2 *fref1_lowres_mvs0, + const global int16_t *fenc_lowres_mv_costs0, + const global int16_t *fenc_lowres_mv_costs1, + const global uint16_t *fenc_intra_cost, + global uint16_t *lowres_costs, + global int *frame_stats, + local int16_t *cost_local, + local sum2_t *satd_local, + int mb_width, + int bipred_weight, + int dist_scale_factor, + int b, + int p0, + int p1, + int lambda ) +{ + int mb_x = get_global_id( 0 ); + int b_bidir = b < p1; + if( b_bidir ) + { + /* when mode_selection is run for B frames, it must perform BIDIR SATD + * measurements, so it is launched with four times as many threads in + * order to spread the work around more of the GPU. And it can add + * padding threads in the X direction. */ + mb_x >>= 2; + if( mb_x >= mb_width ) + return; + } + int mb_y = get_global_id( 1 ); + int mb_height = get_global_size( 1 ); + int mb_count = mb_width * mb_height; + int mb_xy = mb_x + mb_y * mb_width; + + /* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */ + if( mb_x < 4 && mb_y == 0 ) + frame_stats[mb_x] = 0; + + int bcost = COST_MAX; + int list_used = 0; + + if( !b_bidir ) + { + int icost = fenc_intra_cost[mb_xy]; + COPY2_IF_LT( bcost, icost, list_used, 0 ); + } + if( b != p0 ) + { + int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy]; + COPY2_IF_LT( bcost, mv_cost0, list_used, 1 ); + } + if( b != p1 ) + { + int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy]; + COPY2_IF_LT( bcost, mv_cost1, list_used, 2 ); + } + + if( b_bidir ) + { + int2 coord = (int2)(mb_x, mb_y) << 3; + int mb_i = get_global_id( 0 ) & 3; + int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); + cost_local += mb_in_group * 4; + satd_local += mb_in_group * 16; + +#define TRY_BIDIR( mv0, mv1, penalty )\ +{\ + int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\ + int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\ + cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\ + int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\ + COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\ +} + + /* temporal prediction */ + short2 dmv0, dmv1; + short2 mvr = fref1_lowres_mvs0[mb_xy]; + dmv0 = (mvr * (short) dist_scale_factor + (short) 128) >> (short) 8; + dmv1 = dmv0 - mvr; + TRY_BIDIR( dmv0, dmv1, 0 ) + + if( as_uint( dmv0 ) || as_uint( dmv1 ) ) + { + /* B-direct prediction */ + dmv0 = 0; dmv1 = 0; + TRY_BIDIR( dmv0, dmv1, 0 ); + } + + /* L0+L1 prediction */ + dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy]; + dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy]; + TRY_BIDIR( dmv0, dmv1, 5 ); +#undef TRY_BIDIR + } + + lowres_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); +} + +/* + * parallel sum inter costs + * + * global launch dimensions: [256, mb_height] + */ +kernel void 
sum_inter_cost( const global uint16_t *fenc_lowres_costs, + const global uint16_t *inv_qscale_factor, + global int *fenc_row_satds, + global int *frame_stats, + int mb_width, + int bframe_bias, + int b, + int p0, + int p1 ) +{ + int y = get_global_id( 1 ); + int mb_height = get_global_size( 1 ); + + int row_satds = 0; + int cost_est = 0; + int cost_est_aq = 0; + int intra_mbs = 0; + + for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 )) + { + int mb_xy = x + y * mb_width; + int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK; + int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT; + int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2; + + if( list == 0 && b_frame_score_mb ) + intra_mbs++; + + int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8; + + row_satds += cost_aq; + + if( b_frame_score_mb ) + { + cost_est += cost; + cost_est_aq += cost_aq; + } + } + + local int buffer[256]; + int x = get_global_id( 0 ); + + row_satds = parallel_sum( row_satds, x, buffer ); + cost_est = parallel_sum( cost_est, x, buffer ); + cost_est_aq = parallel_sum( cost_est_aq, x, buffer ); + intra_mbs = parallel_sum( intra_mbs, x, buffer ); + + if( b != p1 ) + // Use floating point math to avoid 32bit integer overflow conditions + cost_est = (int)((float)cost_est * 100.0f / (120.0f + (float)bframe_bias)); + + if( get_global_id( 0 ) == 0 ) + { + fenc_row_satds[y] = row_satds; + atomic_add( frame_stats + COST_EST, cost_est ); + atomic_add( frame_stats + COST_EST_AQ, cost_est_aq ); + atomic_add( frame_stats + INTRA_MBS, intra_mbs ); + } +}
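Two scalar operations underpin the kernels above: the weighted bidirectional blend inside READ_BIDIR_DIFF, which mixes the list0 and list1 references with a 6-bit weight and rounds, and the COPY2_IF_LT idiom that mode_selection() uses to keep the cheapest cost together with the prediction list that produced it. Below is a small host-side C model of both; the macro definition and the sample costs are illustrative stand-ins consistent with how the kernels use them, not code taken from x264.

#include <stdio.h>

/* weight is the bipred weight in 0..64; 64-weight goes to the list1 reference */
static int bidir_blend( int ref0, int ref1, int weight )
{
    return ( ref0 * weight + ref1 * (64 - weight) + (1 << 5) ) >> 6;
}

/* keep the smaller cost and remember which list produced it */
#define COPY2_IF_LT( cost, new_cost, list, new_list )\
    if( (new_cost) < (cost) )\
    {\
        (cost) = (new_cost);\
        (list) = (new_list);\
    }

int main( void )
{
    int bcost = 1 << 28, list_used = 0;        /* stand-in for COST_MAX */
    COPY2_IF_LT( bcost, 900, list_used, 0 );   /* intra cost */
    COPY2_IF_LT( bcost, 750, list_used, 1 );   /* list0 cost */
    COPY2_IF_LT( bcost, 820, list_used, 2 );   /* list1 cost */
    printf( "best cost %d from list %d, blended sample %d\n",
            bcost, list_used, bidir_blend( 200, 100, 32 ) );
    return 0;
}

The run keeps cost 750 from list 1 and prints 150 for the blended pixel, i.e. (200*32 + 100*32 + 32) >> 6, matching the rounding the kernel applies before computing the SATD residual.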
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/downscale.cl
Added
@@ -0,0 +1,135 @@ +/* + * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image + * + * -- + * + * fenc_img is an output image (area of memory referenced through a texture + * cache). A read of any pixel location (x,y) returns four pixel values: + * + * val.s0 = P(x,y) + * val.s1 = P(x+1,y) + * val.s2 = P(x+2,y) + * val.s3 = P(x+3,y) + * + * This is a 4x replication of the lowres pixels, a trade-off between memory + * size and read latency. + * + * -- + * + * hpel_planes is an output image that contains the four HPEL planes used for + * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with + * the four planar values C | V | H | F + * + * launch dimensions: [lowres-width, lowres-height] + */ +kernel void downscale_hpel( const global pixel *fenc, + write_only image2d_t fenc_img, + write_only image2d_t hpel_planes, + int stride ) +{ + int x = get_global_id( 0 ); + int y = get_global_id( 1 ); + uint4 values; + + fenc += y * stride * 2; + const global pixel *src1 = fenc + stride; + const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride; + int2 pos = (int2)(x, y); + pixel right, left; + + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s0 = rhadd( right, left ); // F + + right = rhadd( fenc[2*x+1], src1[2*x+1] ); + left = rhadd( fenc[2*x+2], src1[2*x+2] ); + values.s1 = rhadd( right, left ); // H + + right = rhadd( src1[2*x], src2[2*x] ); + left = rhadd( src1[2*x+1], src2[2*x+1] ); + values.s2 = rhadd( right, left ); // V + + right = rhadd( src1[2*x+1], src2[2*x+1] ); + left = rhadd( src1[2*x+2], src2[2*x+2] ); + values.s3 = rhadd( right, left ); // C + + uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff); + write_imageui( hpel_planes, pos, val ); + + x = select( x, x+1, x+1 < get_global_size( 0 ) ); + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s1 = rhadd( right, left ); + + x = select( x, x+1, x+1 < get_global_size( 0 ) ); + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s2 = rhadd( right, left ); + + x = select( x, x+1, x+1 < get_global_size( 0 ) ); + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s3 = rhadd( right, left ); + + write_imageui( fenc_img, pos, values ); +} + +/* + * downscale lowres hierarchical motion search image, copy from one image to + * another decimated image. This kernel is called iteratively to generate all + * of the downscales. + * + * launch dimensions: [lower_res width, lower_res height] + */ +kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res ) +{ + int x = get_global_id( 0 ); + int y = get_global_id( 1 ); + int2 pos = (int2)(x, y); + int gs = get_global_size( 0 ); + uint4 top, bot, values; + top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) ); + values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ); + + /* these select statements appear redundant, and they should be, but tests break when + * they are not here. 
I believe this was caused by a driver bug + */ + values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) ); + top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) ); + values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) ); + values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) ); + write_imageui( lower_res, pos, (uint4)(values) ); +} + +/* + * Second copy of downscale kernel, no differences. This is a (no perf loss) + * workaround for a scheduling bug in current Tahiti drivers. This bug has + * theoretically been fixed in the July 2012 driver release from AMD. + */ +kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res ) +{ + int x = get_global_id( 0 ); + int y = get_global_id( 1 ); + int2 pos = (int2)(x, y); + int gs = get_global_size( 0 ); + uint4 top, bot, values; + top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) ); + values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ); + + // see comment in above function copy + values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) ); + top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) ); + values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) ); + values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) ); + write_imageui( lower_res, pos, (uint4)(values) ); +} + +/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */ +kernel void memset_int16( global int16_t *buf, int16_t value ) +{ + buf[get_global_id( 0 )] = value; +}
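downscale_hpel and downscale1/downscale2 above all build each lowres pixel as the rounded average of a 2x2 block, composed from two rhadd() (rounded halving add) steps. A plain C reference of that decimation, handy for checking kernel output on the host, is sketched below; buffer sizes and names are illustrative.

#include <stdint.h>
#include <stdio.h>

static uint8_t rhadd_u8( uint8_t a, uint8_t b )
{
    return (uint8_t)( ( a + b + 1 ) >> 1 );   /* same rounding as OpenCL rhadd() */
}

/* each destination pixel is the rhadd of two vertical rhadd pairs, as in the kernels */
static void downscale_2x2( const uint8_t *src, int src_stride,
                           uint8_t *dst, int dst_stride,
                           int dst_w, int dst_h )
{
    for( int y = 0; y < dst_h; y++ )
        for( int x = 0; x < dst_w; x++ )
        {
            const uint8_t *p = src + 2*y*src_stride + 2*x;
            uint8_t c0 = rhadd_u8( p[0], p[src_stride] );     /* left column  */
            uint8_t c1 = rhadd_u8( p[1], p[src_stride+1] );   /* right column */
            dst[y*dst_stride + x] = rhadd_u8( c0, c1 );
        }
}

int main( void )
{
    uint8_t src[4*4], dst[2*2];
    for( int i = 0; i < 16; i++ )
        src[i] = (uint8_t)( i * 10 );
    downscale_2x2( src, 4, dst, 2, 2, 2 );
    printf( "%d %d / %d %d\n", dst[0], dst[1], dst[2], dst[3] );   /* 25 45 / 105 125 */
    return 0;
}

Chaining two rhadd() calls rounds twice, exactly as the kernels do; the result can differ by one from a single (a+b+c+d+2)>>2 average, so any host-side verification has to mirror the same order of operations.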
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/intra.cl
Added
@@ -0,0 +1,1072 @@ +/* Lookahead lowres intra analysis + * + * Each intra analysis function has been implemented twice, once for scalar GPUs + * (NV) and once for vectorized GPUs (AMD pre-Southern Islands). x264 detects + * the GPU type and sets the -DVECTORIZE compile flag accordingly. + * + * All the intra analysis functions were based on their C versions in pixel.c + * and produce the exact same results. + */ + +/* force all clamp arguments and return value to int, prevent ambiguous types */ +#define clamp_int( X, MIN, MAX ) (int) clamp( (int)(X), (int)(MIN), (int)(MAX) ) + +#if VECTORIZE +int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 ) +{ + int8 a_v, d_v; + int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13; + int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33; + + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr0).s04152637; + HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + data += data_stride; + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr1).s04152637; + HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + data += data_stride; + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr2).s04152637; + HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + data += data_stride; + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr3).s04152637; + HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + uint8 sum_v; + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 ); + sum_v = abs( a_v ); + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 ); + sum_v += abs( a_v ); + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 ); + sum_v += abs( a_v ); + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 ); + sum_v += abs( a_v ); + + uint4 sum2 = sum_v.hi + sum_v.lo; + uint2 sum3 = sum2.hi + sum2.lo; + return ( sum3.hi + sum3.lo ) >> 1; +} +#else +SATD_C_8x4_Q( satd_8x4_lp, const local, private ) +#endif + +/**************************************************************************** + * 8x8 prediction for intra luma block + ****************************************************************************/ + +#define F1 rhadd +#define F2( a, b, c ) ( a+2*b+c+2 )>>2 + +#if VECTORIZE +int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2; + pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2; + pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2; + pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + + pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2; + pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2; + pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + + pr2.s0 = ( 2 + top[2] + 
2*top[3] + top[4] ) >> 2; + pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + + pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + + pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; + + pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; + pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2; + + pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; + pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2; + pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2; + + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr3.s0 = F2( left[1], left[2], left[3] ); + pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] ); + pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top ); + pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] ); + pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] ); + pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] ); + pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] ); + pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] ); + pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] ); + pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] ); + pr0.s7 = F2( 
top[5], top[6], top[7] ); + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr3.s0 = F2( left[5], left[6], left[7] ); + pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] ); + pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] ); + pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] ); + pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] ); + pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] ); + pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top ); + pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] ); + pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] ); + pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] ); + pr0.s7 = F2( top[1], top[2], top[3] ); + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr2.s0 = F2( left[1], left[0], left_top ); + pr3.s0 = F2( left[2], left[1], left[0] ); + pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] ); + pr0.s0 = pr2.s1 = F1( left_top, top[0] ); + pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] ); + pr0.s1 = pr2.s2 = F1( top[0], top[1] ); + pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] ); + pr0.s2 = pr2.s3 = F1( top[1], top[2] ); + pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] ); + pr0.s3 = pr2.s4 = F1( top[2], top[3] ); + pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] ); + pr0.s4 = pr2.s5 = F1( top[3], top[4] ); + pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] ); + pr0.s5 = pr2.s6 = F1( top[4], top[5] ); + pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] ); + pr0.s6 = pr2.s7 = F1( top[5], top[6] ); + pr1.s7 = F2( top[5], top[6], top[7] ); + pr0.s7 = F1( top[6], top[7] ); + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr2.s0 = F2( left[5], left[4], left[3] ); + pr3.s0 = F2( left[6], left[5], left[4] ); + pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] ); + pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] ); + pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top ); + pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] ); + pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] ); + pr0.s2 = pr2.s3 = F1( left_top, top[0] ); + pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] ); + pr0.s3 = pr2.s4 = F1( top[0], top[1] ); + pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] ); + pr0.s4 = pr2.s5 = F1( top[1], top[2] ); + pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] ); + pr0.s5 = pr2.s6 = F1( top[2], top[3] ); + pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] ); + pr0.s6 = pr2.s7 = F1( top[3], top[4] ); + pr1.s7 = F2( top[3], top[4], top[5] ); + pr0.s7 = F1( top[4], top[5] ); + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); +#undef PRED +} + +int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2; + pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] ); + pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] ); + pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] ); + + pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) 
>> 2; + pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2; + pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] ); + pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] ); + + pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; + pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2; + pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2; + pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] ); + + pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; + pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; + pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2; + pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2; + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; + pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; + pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2; + + pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; + pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; + + pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; + pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; + + pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; + pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; + pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr0.s0 = F1( top[0], top[1] ); + pr1.s0 = F2( top[0], top[1], top[2] ); + pr2.s0 = pr0.s1 = F1( top[1], top[2] ); + pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] ); + pr2.s1 = pr0.s2 = F1( top[2], top[3] ); + pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] ); + pr2.s2 = pr0.s3 = F1( top[3], top[4] ); + pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] ); + pr2.s3 = pr0.s4 = F1( top[4], top[5] ); + pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] ); + pr2.s4 = pr0.s5 = F1( top[5], top[6] ); + pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] ); + pr2.s5 = pr0.s6 = F1( top[6], top[7] ); + pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] ); + pr2.s6 = pr0.s7 = F1( top[7], top[8] ); + pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] ); + pr2.s7 = F1( top[8], top[9] ); + pr3.s7 = F2( top[8], top[9], top[10] ); + int satd = satd_8x4_intra_lr( src, 
src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr0.s0 = F1( top[2], top[3] ); + pr1.s0 = F2( top[2], top[3], top[4] ); + pr2.s0 = pr0.s1 = F1( top[3], top[4] ); + pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] ); + pr2.s1 = pr0.s2 = F1( top[4], top[5] ); + pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] ); + pr2.s2 = pr0.s3 = F1( top[5], top[6] ); + pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] ); + pr2.s3 = pr0.s4 = F1( top[6], top[7] ); + pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] ); + pr2.s4 = pr0.s5 = F1( top[7], top[8] ); + pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] ); + pr2.s5 = pr0.s6 = F1( top[8], top[9] ); + pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] ); + pr2.s6 = pr0.s7 = F1( top[9], top[10] ); + pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] ); + pr2.s7 = F1( top[10], top[11] ); + pr3.s7 = F2( top[10], top[11], top[12] ); + return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; + pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; + pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + + pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; + pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; + + pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; + pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; + pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; + + pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; + pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; + pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; + pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2; + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; + pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; + pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2; + pr0.s6 = left[7]; pr0.s7 = left[7]; + + pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; + pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2; + pr1.s4 = left[7]; pr1.s5 = left[7]; + pr1.s6 = left[7]; pr1.s7 = left[7]; + + pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2; + pr2.s2 = left[7]; pr2.s3 = left[7]; + pr2.s4 = left[7]; pr2.s5 = left[7]; + pr2.s6 = left[7]; pr2.s7 = left[7]; + + pr3 = (int8)left[7]; + + return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8c_h( const local 
pixel *src, int src_stride ) +{ + const local pixel *src_l = src; + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr0 = (int8)src[-1]; src += src_stride; + pr1 = (int8)src[-1]; src += src_stride; + pr2 = (int8)src[-1]; src += src_stride; + pr3 = (int8)src[-1]; src += src_stride; + int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 ); + + //Lower half of pred[] + pr0 = (int8)src[-1]; src += src_stride; + pr1 = (int8)src[-1]; src += src_stride; + pr2 = (int8)src[-1]; src += src_stride; + pr3 = (int8)src[-1]; + return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8c_v( const local pixel *src, int src_stride ) +{ + int8 pred = convert_int8( vload8( 0, &src[-src_stride] )); + return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) + + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred ); +} + +int x264_predict_8x8c_p( const local pixel *src, int src_stride ) +{ + int H = 0, V = 0; + for( int i = 0; i < 4; i++ ) + { + H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]); + V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]); + } + + int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]); + int b = (17 * H + 16) >> 5; + int c = (17 * V + 16) >> 5; + int i00 = a - 3 * b - 3 * c + 16; + + // Upper half of pred[] + int pix = i00; + int8 pr0, pr1, pr2, pr3; + pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + + pix = i00; + pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + + pix = i00; + pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + + pix = i00; + pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + //Lower half of pred[] + pix = i00; + pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr0.s6 = x264_clip_pixel( pix >> 5 ); pix 
+= b; + pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + + pix = i00; + pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + + pix = i00; + pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + + pix = i00; + pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b; + pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; + return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8c_dc( const local pixel *src, int src_stride ) +{ + int s0 = 0, s1 = 0, s2 = 0, s3 = 0; + for( int i = 0; i < 4; i++ ) + { + s0 += src[i - src_stride]; + s1 += src[i + 4 - src_stride]; + s2 += src[-1 + i * src_stride]; + s3 += src[-1 + (i+4)*src_stride]; + } + + // Upper half of pred[] + int8 dc0; + dc0.lo = (int4)( (s0 + s2 + 4) >> 3 ); + dc0.hi = (int4)( (s1 + 2) >> 2 ); + int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 ); + + // Lower half of pred[] + dc0.lo = (int4)( (s3 + 2) >> 2 ); + dc0.hi = (int4)( (s1 + s3 + 4) >> 3 ); + return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 ); +} + +#else /* not vectorized: private is cheap registers are scarce */ + +int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top ) +{ + private pixel pred[32]; + + // Upper half of pred[] + for( int y = 0; y < 4; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 ); + pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2; + } + } + int satd = satd_8x4_lp( src, src_stride, pred, 8 ); + //Lower half of pred[] + for( int y = 4; y < 8; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 ); + pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2; + } + } + pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2; + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +} + +int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + private pixel pred[32]; +#define PRED( x, y ) pred[(x) + (y)*8] + // Upper half of pred[] + PRED( 0, 3 ) = F2( left[1], left[2], left[3] ); + PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] ); + PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top ); + PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] ); + PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( 
left_top, top[0], top[1] ); + PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] ); + PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] ); + PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] ); + PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] ); + PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] ); + PRED( 7, 0 ) = F2( top[5], top[6], top[7] ); + int satd = satd_8x4_lp( src, src_stride, pred, 8 ); + + // Lower half of pred[] + PRED( 0, 3 ) = F2( left[5], left[6], left[7] ); + PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] ); + PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] ); + PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] ); + PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] ); + PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] ); + PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top ); + PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] ); + PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] ); + PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] ); + PRED( 7, 0 ) = F2( top[1], top[2], top[3] ); + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +#undef PRED +} + +int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + private pixel pred[32]; +#define PRED( x, y ) pred[(x) + (y)*8] + // Upper half of pred[] + PRED( 0, 2 ) = F2( left[1], left[0], left_top ); + PRED( 0, 3 ) = F2( left[2], left[1], left[0] ); + PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] ); + PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] ); + PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] ); + PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] ); + PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] ); + PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] ); + PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] ); + PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] ); + PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] ); + PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] ); + PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] ); + PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] ); + PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] ); + PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] ); + PRED( 7, 1 ) = F2( top[5], top[6], top[7] ); + PRED( 7, 0 ) = F1( top[6], top[7] ); + int satd = satd_8x4_lp( src, src_stride, pred, 8 ); + + //Lower half of pred[] + PRED( 0, 2 ) = F2( left[5], left[4], left[3] ); + PRED( 0, 3 ) = F2( left[6], left[5], left[4] ); + PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] ); + PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] ); + PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top ); + PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] ); + PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] ); + PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] ); + PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] ); + PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] ); + PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] ); + PRED( 4, 0 ) = PRED( 5, 2 ) = F1( 
top[1], top[2] ); + PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] ); + PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] ); + PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] ); + PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] ); + PRED( 7, 1 ) = F2( top[3], top[4], top[5] ); + PRED( 7, 0 ) = F1( top[4], top[5] ); + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +#undef PRED +} + +inline uint32_t pack16to32( uint32_t a, uint32_t b ) +{ + return a + (b << 16); +} + +inline uint32_t pack8to16( uint32_t a, uint32_t b ) +{ + return a + (b << 8); +} + +int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + private pixel pred[32]; + int satd; + int p1 = pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) ); + int p2 = pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) ); + int p3 = pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) ); + int p4 = pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) ); + int p5 = pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) ); + int p6 = pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) ); + int p7 = pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) ); + int p8 = pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) ); + int p9 = pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) ); + int p10 = pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) ); + int p11 = pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) ); + // Upper half of pred[] + vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] ); + vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] ); + vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] ); + vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] ); + vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] ); + vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] ); + vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] ); + vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] ); + satd = satd_8x4_lp( src, src_stride, pred, 8 ); + // Lower half of pred[] + vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] ); + vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] ); + vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] ); + vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] ); + vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] ); + vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] ); + vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] ); + vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] ); + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +} + +int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top ) +{ + private pixel pred[32]; + int satd; +#define PRED( x, y ) pred[(x) + (y)*8] + // Upper half of pred[] + PRED( 0, 0 ) = F1( top[0], top[1] ); + PRED( 0, 1 ) = F2( top[0], top[1], top[2] ); + PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] ); + PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] ); + PRED( 1, 2 ) = PRED( 2, 0 ) 
= F1( top[2], top[3] ); + PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] ); + PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] ); + PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] ); + PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] ); + PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] ); + PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] ); + PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] ); + PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] ); + PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] ); + PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] ); + PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] ); + PRED( 7, 2 ) = F1( top[8], top[9] ); + PRED( 7, 3 ) = F2( top[8], top[9], top[10] ); + satd = satd_8x4_lp( src, src_stride, pred, 8 ); + // Lower half of pred[] + PRED( 0, 0 ) = F1( top[2], top[3] ); + PRED( 0, 1 ) = F2( top[2], top[3], top[4] ); + PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] ); + PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] ); + PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] ); + PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] ); + PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] ); + PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] ); + PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] ); + PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] ); + PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] ); + PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] ); + PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] ); + PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] ); + PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] ); + PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] ); + PRED( 7, 2 ) = F1( top[10], top[11] ); + PRED( 7, 3 ) = F2( top[10], top[11], top[12] ); + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +#undef PRED +} + +int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left ) +{ + private pixel pred[32]; + int satd; + int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) ); + int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) ); + int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) ); + int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) ); + int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) ); + int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) ); + int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) ); + int p8 = pack8to16( left[7], left[7] ); + // Upper half of pred[] + vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] ); + vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] ); + vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] ); + vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] ); + vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] ); + vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] ); + vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] ); + vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] ); + satd = satd_8x4_lp( src, src_stride, pred, 8 ); + // Lower half of pred[] + vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) 
+ ( 0 ) * 8] ); + vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] ); + vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] ); + vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] ); + vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] ); + vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] ); + vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] ); + vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] ); + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +} + +int x264_predict_8x8c_h( const local pixel *src, int src_stride ) +{ + private pixel pred[32]; + const local pixel *src_l = src; + + // Upper half of pred[] + vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride; + vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride; + vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride; + vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride; + int satd = satd_8x4_lp( src_l, src_stride, pred, 8 ); + + // Lower half of pred[] + vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride; + vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride; + vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride; + vstore8( (uchar8)(src[-1]), 3, pred ); + return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 ); +} + +int x264_predict_8x8c_v( const local pixel *src, int src_stride ) +{ + private pixel pred[32]; + uchar16 v16; + v16.lo = vload8( 0, &src[-src_stride] ); + v16.hi = vload8( 0, &src[-src_stride] ); + + vstore16( v16, 0, pred ); + vstore16( v16, 1, pred ); + + return satd_8x4_lp( src, src_stride, pred, 8 ) + + satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 ); +} + +int x264_predict_8x8c_p( const local pixel *src, int src_stride ) +{ + int H = 0, V = 0; + private pixel pred[32]; + int satd; + + for( int i = 0; i < 4; i++ ) + { + H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]); + V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]); + } + + int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]); + int b = (17 * H + 16) >> 5; + int c = (17 * V + 16) >> 5; + int i00 = a - 3 * b - 3 * c + 16; + + // Upper half of pred[] + for( int y = 0; y < 4; y++ ) + { + int pix = i00; + for( int x = 0; x < 8; x++ ) + { + pred[x + y*8] = x264_clip_pixel( pix >> 5 ); + pix += b; + } + i00 += c; + } + satd = satd_8x4_lp( src, src_stride, pred, 8 ); + // Lower half of pred[] + for( int y = 0; y < 4; y++ ) + { + int pix = i00; + for( int x = 0; x < 8; x++ ) + { + pred[x + y*8] = x264_clip_pixel( pix >> 5 ); + pix += b; + } + i00 += c; + } + satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); + return satd; +} + +int x264_predict_8x8c_dc( const local pixel *src, int src_stride ) +{ + private pixel pred[32]; + int s0 = 0, s1 = 0, s2 = 0, s3 = 0; + for( int i = 0; i < 4; i++ ) + { + s0 += src[i - src_stride]; + s1 += src[i + 4 - src_stride]; + s2 += src[-1 + i * src_stride]; + s3 += src[-1 + (i+4)*src_stride]; + } + + // Upper half of pred[] + uchar8 dc0; + dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 ); + dc0.hi = (uchar4)( (s1 + 2) >> 2 ); + vstore8( dc0, 0, pred ); + vstore8( dc0, 1, pred ); + vstore8( dc0, 2, pred ); + vstore8( dc0, 3, pred ); + int satd = satd_8x4_lp( src, src_stride, pred, 8 ); + + // Lower half of pred[] + dc0.lo = (uchar4)( (s3 + 2) >> 2 ); + dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 ); + vstore8( dc0, 0, pred ); 
+ vstore8( dc0, 1, pred ); + vstore8( dc0, 2, pred ); + vstore8( dc0, 3, pred ); + return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); +} +#endif + +/* Find the least cost intra mode for 32 8x8 macroblocks per workgroup + * + * Loads 33 macroblocks plus the pixels directly above them into local memory, + * padding where necessary with edge pixels. It then cooperatively calculates + * smoothed top and left pixels for use in some of the analysis. + * + * Then groups of 32 threads each calculate a single intra mode for each 8x8 + * block. Since consecutive threads are calculating the same intra mode there + * is no code-path divergence. 8 intra costs are calculated simultaneously. If + * the "slow" argument is not zero, the final two (least likely) intra modes are + * tested in a second pass. The slow mode is only enabled for presets slow, + * slower, and placebo. + * + * This allows all of the pixels functions to read pixels from local memory, and + * avoids re-fetching edge pixels from global memory. And it allows us to + * calculate all of the intra mode costs simultaneously without branch divergence. + * + * Local dimension: [ 32, 8 ] + * Global dimensions: [ paddedWidth, height ] */ +kernel void mb_intra_cost_satd_8x8( read_only image2d_t fenc, + global uint16_t *fenc_intra_cost, + global int *frame_stats, + int lambda, + int mb_width, + int slow ) +{ +#define CACHE_STRIDE 265 +#define BLOCK_OFFSET 266 + local pixel cache[2385]; + local int cost_buf[32]; + local pixel top[32 * 16]; + local pixel left[32 * 8]; + local pixel left_top[32]; + + int lx = get_local_id( 0 ); + int ly = get_local_id( 1 ); + int gx = get_global_id( 0 ); + int gy = get_global_id( 1 ); + int gidx = get_group_id( 0 ); + int gidy = get_group_id( 1 ); + int linear_id = ly * get_local_size( 0 ) + lx; + int satd = COST_MAX; + int basex = gidx << 8; + int basey = (gidy << 3) - 1; + + /* Load 33 8x8 macroblocks and the pixels above them into local cache */ + for( int y = 0; y < 9 && linear_id < (33<<3)>>2; y++ ) + { + int x = linear_id << 2; + uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) ); + cache[y * CACHE_STRIDE + 1 + x] = data.s0; + cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1; + cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2; + cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3; + } + /* load pixels on left edge */ + if( linear_id < 9 ) + cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0; + + barrier( CLK_LOCAL_MEM_FENCE ); + + // Cooperatively build the top edge for the macroblock using lowpass filter + int j = ly; + top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] + + 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] + + cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2; + j += 8; + top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] + + 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] + + cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2; + // Cooperatively build the left edge for the macroblock using lowpass filter + left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] + + 2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] + + cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp((ly + 1), 0, 7 )] + 2 ) >> 2; + // One left_top per macroblock + if( 0 == ly ) + { + left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 
2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] + + cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2; + cost_buf[lx] = COST_MAX; + } + barrier( CLK_LOCAL_MEM_FENCE ); + + // each warp/wavefront generates a different prediction type; no divergence + switch( ly ) + { + case 0: + satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE ); + break; + case 1: + satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE ); + break; + case 2: + satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE ); + break; + case 3: + satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE ); + break; + case 4: + satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] ); + break; + case 5: + satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] ); + break; + case 6: + satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] ); + break; + case 7: + satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] ); + break; + default: + break; + } + atom_min( &cost_buf[lx], satd ); + if( slow ) + { + // Do the remaining two (least likely) prediction modes + switch( ly ) + { + case 0: // DDL + satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] ); + atom_min( &cost_buf[lx], satd ); + break; + case 1: // VL + satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] ); + atom_min( &cost_buf[lx], satd ); + break; + default: + break; + } + } + barrier( CLK_LOCAL_MEM_FENCE ); + + if( (0 == ly) && (gx < mb_width) ) + fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda; + + // initialize the frame_stats[2] buffer for kernel sum_intra_cost(). + if( gx < 2 && gy == 0 ) + frame_stats[gx] = 0; +#undef CACHE_STRIDE +#undef BLOCK_OFFSET +} + +/* + * parallel sum intra costs + * + * global launch dimensions: [256, mb_height] + */ +kernel void sum_intra_cost( const global uint16_t *fenc_intra_cost, + const global uint16_t *inv_qscale_factor, + global int *fenc_row_satds, + global int *frame_stats, + int mb_width ) +{ + int y = get_global_id( 1 ); + int mb_height = get_global_size( 1 ); + + int row_satds = 0; + int cost_est = 0; + int cost_est_aq = 0; + + for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 )) + { + int mb_xy = x + y * mb_width; + int cost = fenc_intra_cost[mb_xy]; + int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8; + int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2; + + row_satds += cost_aq; + if( b_frame_score_mb ) + { + cost_est += cost; + cost_est_aq += cost_aq; + } + } + + local int buffer[256]; + int x = get_global_id( 0 ); + + row_satds = parallel_sum( row_satds, x, buffer ); + cost_est = parallel_sum( cost_est, x, buffer ); + cost_est_aq = parallel_sum( cost_est_aq, x, buffer ); + + if( get_global_id( 0 ) == 0 ) + { + fenc_row_satds[y] = row_satds; + atomic_add( frame_stats + COST_EST, cost_est ); + atomic_add( frame_stats + COST_EST_AQ, cost_est_aq ); + } +}
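The header comment above notes that every lowres intra function is compiled twice and that the host selects the scalar or vectorized path by defining VECTORIZE when the OpenCL program is built. The sketch below shows one hedged way a host could pass that flag through clBuildProgram; the helper name and the preferred-vector-width heuristic are illustrative assumptions, not the detection logic x264 itself uses.

/* Hypothetical host-side sketch: choose -DVECTORIZE from the device's
 * preferred int vector width (assumed heuristic, for illustration only). */
#include <CL/cl.h>

cl_int build_lookahead_program( cl_program prog, cl_device_id dev )
{
    cl_uint width = 1;
    clGetDeviceInfo( dev, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
                     sizeof(width), &width, NULL );
    /* wide vector ALUs (e.g. AMD pre-Southern Islands) favor the int8 paths */
    const char *opts = (width > 1) ? "-DVECTORIZE=1" : "";
    return clBuildProgram( prog, 1, &dev, opts, NULL, NULL );
}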
x264-snapshot-20130723-2245.tar.bz2/common/opencl/motionsearch.cl
Added
@@ -0,0 +1,249 @@ +/* Hierarchical (iterative) OpenCL lowres motion search */ + +inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height ) +{ + /* edge macroblocks might not have a direct descendant, use nearest */ + x = select( x >> 1, (x - (mb_width&1)) >> 1, x == mb_width-1 ); + y = select( y >> 1, (y - (mb_height&1)) >> 1, y == mb_height-1 ); + return (mb_width>>1) * y + x; +} + +/* Four threads calculate an 8x8 SAD. Each does two rows */ +int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs ) +{ + frefpos.y += idx << 1; + fencpos.y += idx << 1; + int cost = 0; + if( frefpos.x < 0 ) + { + /* slow path when MV goes past left edge. The GPU clamps reads from + * (-1, 0) to (0,0), so you get pixels [0, 1, 2, 3] when what you really + * want are [0, 0, 1, 2] + */ + for( int y = 0; y < 2; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0; + pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0; + cost += abs_diff( enc, ref ); + } + } + } + else + { + uint4 enc, ref, costs = 0; + enc = read_imageui( fenc, sampler, fencpos ); + ref = read_imageui( fref, sampler, frefpos ); + costs += abs_diff( enc, ref ); + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) ); + costs += abs_diff( enc, ref ); + enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) ); + costs += abs_diff( enc, ref ); + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) ); + costs += abs_diff( enc, ref ); + cost = costs.s0 + costs.s1 + costs.s2 + costs.s3; + } + costs[idx] = cost; + return costs[0] + costs[1] + costs[2] + costs[3]; +} + +/* One thread performs 8x8 SAD */ +int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos ) +{ + if( frefpos.x < 0 ) + { + /* slow path when MV goes past left edge */ + int cost = 0; + for( int y = 0; y < 8; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0; + uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0; + cost += abs_diff( enc, ref ); + } + } + return cost; + } + else + { + uint4 enc, ref, cost = 0; + for( int y = 0; y < 8; y++ ) + { + for( int x = 0; x < 8; x += 4 ) + { + enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ); + cost += abs_diff( enc, ref ); + } + } + return cost.s0 + cost.s1 + cost.s2 + cost.s3; + } +} +/* + * hierarchical motion estimation + * + * Each kernel launch is a single iteration + * + * MB per work group is determined by lclx / 4 * lcly + * + * global launch dimensions: [mb_width * 4, mb_height] + */ +kernel void hierarchical_motion( read_only image2d_t fenc, + read_only image2d_t fref, + const global short2 *in_mvs, + global short2 *out_mvs, + global int16_t *out_mv_costs, + global short2 *mvp_buffer, + local int16_t *cost_local, + local short2 *mvc_local, + int mb_width, + int lambda, + int me_range, + int scale, + int b_shift_index, + int b_first_iteration, + int b_reverse_references ) +{ + int mb_x = get_global_id( 0 ) >> 2; + if( mb_x >= mb_width ) + return; + int mb_height = get_global_size( 1 ); + int mb_i = get_global_id( 0 ) & 3; + int mb_y = get_global_id( 1 ); + int mb_xy = mb_y * 
mb_width + mb_x; + const int mb_size = 8; + int2 coord = (int2)(mb_x, mb_y) * mb_size; + + const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); + cost_local += 4 * mb_in_group; + + int i_mvc = 0; + mvc_local += 4 * mb_in_group; + mvc_local[mb_i] = 0; + int2 mvp =0; + + if( !b_first_iteration ) + { +#define MVC( DX, DY )\ + {\ + int px = mb_x + DX;\ + int py = mb_y + DY;\ + mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \ + in_mvs[mb_width * py + px];\ + mvc_local[i_mvc] >>= (short) scale;\ + i_mvc++;\ + } + /* Find MVP from median of MVCs */ + if( b_reverse_references ) + { + /* odd iterations: derive MVP from down and right */ + if( mb_x < mb_width - 1 ) + MVC( 1, 0 ); + if( mb_y < mb_height - 1 ) + { + MVC( 0, 1 ); + if( mb_x > b_shift_index ) + MVC( -1, 1 ); + if( mb_x < mb_width - 1 ) + MVC( 1, 1 ); + } + } + else + { + /* even iterations: derive MVP from up and left */ + if( mb_x > 0 ) + MVC( -1, 0 ); + if( mb_y > 0 ) + { + MVC( 0, -1 ); + if( mb_x < mb_width - 1 ) + MVC( 1, -1 ); + if( mb_x > b_shift_index ) + MVC( -1, -1 ); + } + } +#undef MVC + mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] ); + } + /* current mvp matches the previous mvp and we have not changed scale. We know + * we're going to arrive at the same MV again, so just copy the previous + * result to our output. */ + if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y ) + { + out_mvs[mb_xy] = in_mvs[mb_xy]; + return; + } + mvp_buffer[mb_xy] = convert_short2_sat(mvp); + int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4; + int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4; + + int2 bestmv = clamp(mvp, mv_min, mv_max); + int2 refcrd = coord + bestmv; + + /* measure cost at bestmv */ + int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) + + lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) ); + + do + { + /* measure costs at offsets from bestmv */ + refcrd = coord + bestmv + dia_offs[mb_i]; + int2 trymv = bestmv + dia_offs[mb_i]; + int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) + + lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) ); + + cost_local[mb_i] = (cost<<2) | mb_i; + cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) ); + + if( (cost >> 2) >= bcost ) + break; + + bestmv += dia_offs[cost&3]; + bcost = cost>>2; + + if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y ) + break; + } + while( --me_range > 0 ); + + int2 trymv = 0, diff = 0; + +#define COST_MV_NO_PAD( L )\ + trymv = clamp( trymv, mv_min, mv_max );\ + diff = convert_int2_sat(abs_diff( mvp, trymv ));\ + if( diff.x > 1 || diff.y > 1 ) {\ + int2 refcrd = coord + trymv;\ + int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\ + L * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );\ + if( cost < bcost ) { bcost = cost; bestmv = trymv; } } + + COST_MV_NO_PAD( 0 ); + + if( !b_first_iteration ) + { + /* try cost at previous iteration's MV, if MVP was too far away */ + int2 prevmv = b_shift_index ? 
convert_int2_sat(in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )]) : convert_int2_sat(in_mvs[mb_xy]); + prevmv >>= scale; + trymv = prevmv; + COST_MV_NO_PAD( lambda ); + } + + for( int i = 0; i < i_mvc; i++ ) + { + /* try cost at each candidate MV, if MVP was too far away */ + trymv = convert_int2_sat( mvc_local[i] ); + COST_MV_NO_PAD( lambda ); + } + + if( mb_i == 0 ) + { + bestmv <<= scale; + out_mvs[mb_xy] = convert_short2_sat(bestmv); + out_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ); + } +}
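Inside the refinement loop of hierarchical_motion, each of the four cooperating threads packs its diamond-candidate index into the two low bits of its SAD cost, so a single min() across cost_local recovers both the best cost and the winning offset. A standalone scalar sketch of that packed-min trick, with made-up cost values, is shown below.

/* Scalar sketch of the (cost << 2) | idx packing used by the diamond search;
 * the SAD values here are hypothetical. */
#include <stdio.h>

int main( void )
{
    static const int dia_dx[4] = {  0, -1, 1, 0 };
    static const int dia_dy[4] = { -1,  0, 0, 1 };   /* same order as dia_offs[] */
    int costs[4] = { 412, 397, 455, 403 };
    int best = (costs[0] << 2) | 0;
    for( int i = 1; i < 4; i++ )
        if( ((costs[i] << 2) | i) < best )
            best = (costs[i] << 2) | i;
    printf( "best cost %d at offset (%d,%d)\n",
            best >> 2, dia_dx[best & 3], dia_dy[best & 3] );
    return 0;
}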
x264-snapshot-20130723-2245.tar.bz2/common/opencl/subpel.cl
Added
@@ -0,0 +1,242 @@ +/* OpenCL lowres subpel Refine */ + +/* Each thread performs 8x8 SAD. 4 threads per MB, so the 4 DIA HPEL offsets are + * calculated simultaneously */ +int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos ) +{ + int2 frefpos = qpos >> 2; + int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2); + uint mask_shift = 8 * hpel_idx; + + uint4 cost4 = 0; + + for( int y = 0; y < 8; y++ ) + { + uint4 enc, val4; + enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y)); + val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF; + val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF; + val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF; + val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF; + cost4 += abs_diff( enc, val4 ); + + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y)); + val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF; + val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF; + val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF; + val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF; + cost4 += abs_diff( enc, val4 ); + } + + return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3; +} + +/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */ +int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos ) +{ + int2 frefApos = qpos >> 2; + int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2); + + int2 qposB = qpos + ((qpos & 1) << 1); + int2 frefBpos = qposB >> 2; + int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2); + + uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB; + + int cost = 0; + + for( int y = 0; y < 8; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0; + uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF; + uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF; + cost += abs_diff( enc, rhadd( vA, vB ) ); + } + } + + return cost; +} + +/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane + * + * Each thread collects 1/4 of the rows of diffs and processes one quarter of + * the transforms + */ +int satd_8x8_ii_qpel_coop4( read_only image2d_t fenc, + int2 fencpos, + read_only image2d_t fref_planes, + int2 qpos, + local sum2_t *tmpp, + int idx ) +{ + volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp; + sum2_t b0, b1, b2, b3; + + // fencpos is full-pel position of original MB + // qpos is qpel position within reference frame + int2 frefApos = qpos >> 2; + int hpelA = ((qpos.x&2)>>1) + (qpos.y&2); + + int2 qposB = qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1)); + int2 frefBpos = qposB >> 2; + int hpelB = ((qposB.x&2)>>1) + (qposB.y&2); + + uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB; + + uint vA, vB; + uint a0, a1; + uint enc; + sum2_t sum = 0; + +#define READ_DIFF( OUT, X )\ + enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\ + vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\ + vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) 
).s0 >> mask_shift1) & 0xFF;\ + OUT = enc - rhadd( vA, vB ); + +#define READ_DIFF_EX( OUT, a, b )\ + {\ + READ_DIFF( a0, a );\ + READ_DIFF( a1, b );\ + OUT = a0 + (a1<<BITS_PER_SUM);\ + } +#define ROW_8x4_SATD( a, b )\ + {\ + fencpos.y += a;\ + frefApos.y += b;\ + frefBpos.y += b;\ + READ_DIFF_EX( b0, 0, 4 );\ + READ_DIFF_EX( b1, 1, 5 );\ + READ_DIFF_EX( b2, 2, 6 );\ + READ_DIFF_EX( b3, 3, 7 );\ + HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\ + HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\ + sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\ + } + ROW_8x4_SATD( 0, 0 ); + ROW_8x4_SATD( 4, 4 ); + +#undef READ_DIFF +#undef READ_DIFF_EX +#undef ROW_8x4_SATD + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1; +} + +constant int2 hpoffs[4] = +{ + {0, -2}, {-2, 0}, {2, 0}, {0, 2} +}; + +/* sub pixel refinement of motion vectors, output MVs and costs are moved from + * temporary buffers into final per-frame buffer + * + * global launch dimensions: [mb_width * 4, mb_height] + * + * With X being the source 16x16 pixels, F is the lowres pixel used by the + * motion search. We will now utilize the H V and C pixels (stored in separate + * planes) to search at half-pel increments. + * + * X X X X X X + * F H F H F + * X X X X X X + * V C V C V + * X X X X X X + * F H F H F + * X X X X X X + * + * The YX HPEL bits of the motion vector selects the plane we search in. The + * four planes are packed in the fref_planes 2D image buffer. Each sample + * returns: s0 = F, s1 = H, s2 = V, s3 = C */ +kernel void subpel_refine( read_only image2d_t fenc, + read_only image2d_t fref_planes, + const global short2 *in_mvs, + const global int16_t *in_sad_mv_costs, + local int16_t *cost_local, + local sum2_t *satd_local, + local short2 *mvc_local, + global short2 *fenc_lowres_mv, + global int16_t *fenc_lowres_mv_costs, + int mb_width, + int lambda, + int b, + int ref, + int b_islist1 ) +{ + int mb_x = get_global_id( 0 ) >> 2; + if( mb_x >= mb_width ) + return; + int mb_height = get_global_size( 1 ); + + int mb_i = get_global_id( 0 ) & 3; + int mb_y = get_global_id( 1 ); + int mb_xy = mb_y * mb_width + mb_x; + + /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that + * hold many frames worth of motion vectors. We must offset into the correct + * location for this frame's vectors. The kernel will be passed the correct + * directional buffer for the direction of the search: list1 or list0 + * + * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1] + * GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count] */ + fenc_lowres_mv += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height; + fenc_lowres_mv_costs += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height; + + /* Adjust pointers into local memory buffers for this thread's data */ + int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); + cost_local += mb_in_group * 4; + satd_local += mb_in_group * 16; + mvc_local += mb_in_group * 4; + + int i_mvc = 0; + + mvc_local[0] = mvc_local[1] = mvc_local[2] = mvc_local[3] = 0; + +#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)]; + if( mb_x > 0 ) + MVC( -1, 0 ); + if( mb_y > 0 ) + { + MVC( 0, -1 ); + if( mb_x < mb_width - 1 ) + MVC( 1, -1 ); + if( mb_x > 0 ) + MVC( -1, -1 ); + } +#undef MVC + int2 mvp = (i_mvc <= 1) ? 
convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] ); + + int bcost = in_sad_mv_costs[mb_xy]; + int2 coord = (int2)(mb_x, mb_y) << 3; + int2 bmv = convert_int2_sat( in_mvs[mb_xy] ); + + /* Make mvp and bmv QPEL MV */ + mvp <<= 2; bmv <<= 2; + +#define HPEL_QPEL( ARR, FUNC )\ + {\ + int2 trymv = bmv + ARR[mb_i];\ + int2 qpos = (coord << 2) + trymv;\ + int cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * mv_cost( abs_diff( trymv, mvp ) );\ + cost_local[mb_i] = (cost<<2) + mb_i;\ + cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\ + if( (cost>>2) < bcost )\ + {\ + bmv += ARR[cost&3];\ + bcost = cost>>2;\ + }\ + } + + HPEL_QPEL( hpoffs, sad_8x8_ii_hpel ); + HPEL_QPEL( dia_offs, sad_8x8_ii_qpel ); + fenc_lowres_mv[mb_xy] = convert_short2_sat( bmv ); + + /* remeasure cost of bmv using SATD */ + int2 qpos = (coord << 2) + bmv; + cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i ); + bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3]; + bcost += lambda * mv_cost( abs_diff( bmv, mvp ) ); + + fenc_lowres_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ); +}
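As the comment block above subpel_refine explains, the F, H, V and C lowres planes are packed one byte each into every 32-bit sample of fref_planes, and the half-pel bits of the qpel motion vector select which byte to read. The following standalone C sketch, using a hypothetical packed sample, reproduces the same plane-selection arithmetic as sad_8x8_ii_hpel.

/* Plane selection for a packed F/H/V/C sample; the sample value is made up. */
#include <stdint.h>
#include <stdio.h>

static uint8_t read_hpel( uint32_t packed_sample, int qpel_x, int qpel_y )
{
    int plane = ((qpel_x & 2) >> 1) + (qpel_y & 2);   /* 0=F 1=H 2=V 3=C */
    return (packed_sample >> (8 * plane)) & 0xFF;
}

int main( void )
{
    uint32_t sample = 0x64503C28;   /* C=0x64 V=0x50 H=0x3C F=0x28 */
    printf( "qpel (6,4) selects the H plane: byte %u\n",
            (unsigned)read_hpel( sample, 6, 4 ) );
    return 0;
}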
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/weightp.cl
Added
@@ -0,0 +1,48 @@ +/* Weightp filter a downscaled image into a temporary output buffer. + * This kernel is launched once for each scale. + * + * Launch dimensions: width x height (in pixels) + */ +kernel void weightp_scaled_images( read_only image2d_t in_plane, + write_only image2d_t out_plane, + uint offset, + uint scale, + uint denom ) +{ + int gx = get_global_id( 0 ); + int gy = get_global_id( 1 ); + uint4 input_val; + uint4 output_val; + + input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)); + output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) ); + write_imageui( out_plane, (int2)(gx, gy), output_val ); +} + +/* Weightp filter for the half-pel interpolated image + * + * Launch dimensions: width x height (in pixels) + */ +kernel void weightp_hpel( read_only image2d_t in_plane, + write_only image2d_t out_plane, + uint offset, + uint scale, + uint denom ) +{ + int gx = get_global_id( 0 ); + int gy = get_global_id( 1 ); + uint input_val; + uint output_val; + + input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0; + //Unpack + uint4 temp; + temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff; + temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff; + + temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) ); + + //Pack + output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24); + write_imageui( out_plane, (int2)(gx, gy), output_val ); +}
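Both kernels above apply the same per-sample weighting, out = offset + ((scale * in) >> denom); the hpel variant merely packs four 8-bit samples per 32-bit word. A scalar C sketch of that formula, assuming plain 8-bit buffers (clipping to the pixel range is omitted here):

#include <stddef.h>
#include <stdint.h>

static void weightp_scalar( uint8_t *dst, const uint8_t *src, size_t n,
                            unsigned offset, unsigned scale, unsigned denom )
{
    for( size_t i = 0; i < n; i++ )
        dst[i] = (uint8_t)( offset + ((scale * src[i]) >> denom) );
}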
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/x264-cl.h
Added
@@ -0,0 +1,132 @@ +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable + +constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; + +/* 7.18.1.1 Exact-width integer types */ +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef short int16_t; +typedef unsigned short uint16_t; +typedef int int32_t; +typedef unsigned uint32_t; + +typedef uint8_t pixel; +typedef uint16_t sum_t; +typedef uint32_t sum2_t; + +#define LOWRES_COST_MASK ((1<<14)-1) +#define LOWRES_COST_SHIFT 14 +#define COST_MAX (1<<28) + +#define PIXEL_MAX 255 +#define BITS_PER_SUM (8 * sizeof(sum_t)) + +/* Constants for offsets into frame statistics buffer */ +#define COST_EST 0 +#define COST_EST_AQ 1 +#define INTRA_MBS 2 + +#define COPY2_IF_LT( x, y, a, b )\ + if((y)<(x))\ + {\ + (x) = (y);\ + (a) = (b);\ + } + +constant int2 dia_offs[4] = +{ + {0, -1}, {-1, 0}, {1, 0}, {0, 1}, +}; + +inline pixel x264_clip_pixel( int x ) +{ + return (pixel) clamp( x, (int) 0, (int) PIXEL_MAX ); +} + +inline int2 x264_median_mv( short2 a, short2 b, short2 c ) +{ + short2 t1 = min(a, b); + short2 t2 = min(max(a, b), c); + return convert_int2(max(t1, t2)); +} + +inline sum2_t abs2( sum2_t a ) +{ + sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1); + return (a + s) ^ s; +} + +#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\ + sum2_t t0 = s0 + s1;\ + sum2_t t1 = s0 - s1;\ + sum2_t t2 = s2 + s3;\ + sum2_t t3 = s2 - s3;\ + d0 = t0 + t2;\ + d2 = t0 - t2;\ + d1 = t1 + t3;\ + d3 = t1 - t3;\ +} + +#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\ + int2 t0 = s0 + s1;\ + int2 t1 = s0 - s1;\ + int2 t2 = s2 + s3;\ + int2 t3 = s2 - s3;\ + d0 = t0 + t2;\ + d2 = t0 - t2;\ + d1 = t1 + t3;\ + d3 = t1 - t3;\ +} + +#define SATD_C_8x4_Q( name, q1, q2 )\ + int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\ + {\ + sum2_t tmp[4][4];\ + sum2_t a0, a1, a2, a3;\ + sum2_t sum = 0;\ + for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\ + {\ + a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\ + a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\ + a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\ + a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\ + HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\ + }\ + for( int i = 0; i < 4; i++ )\ + {\ + HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\ + sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\ + }\ + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\ + } + +/* + * Utility function to perform a parallel sum reduction of an array of integers + */ +int parallel_sum( int value, int x, volatile local int *array ) +{ + array[x] = value; + barrier( CLK_LOCAL_MEM_FENCE ); + + int dim = get_local_size( 0 ); + + while( dim > 1 ) + { + dim >>= 1; + + if( x < dim ) + array[x] += array[x + dim]; + + if( dim > 32 ) + barrier( CLK_LOCAL_MEM_FENCE ); + } + + return array[0]; +} + +int mv_cost( uint2 mvd ) +{ + float2 mvdf = (float2)(mvd.x, mvd.y) + 1.0f; + float2 cost = round( log2(mvdf) * 2.0f + 0.718f + (float2)(!!mvd.x, !!mvd.y) ); + return (int) (cost.x + cost.y); +}
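The mv_cost() helper at the end approximates the Exp-Golomb bit cost of a motion-vector difference as round(2*log2(|mvd|+1) + 0.718), plus one bit for the sign of each nonzero component. A scalar C equivalent of the same approximation (a sketch for reference, not the encoder's exact bit count):

#include <math.h>
#include <stdio.h>

static int mv_cost_scalar( unsigned mvdx, unsigned mvdy )
{
    float cx = roundf( log2f( (float)mvdx + 1.0f ) * 2.0f + 0.718f + (mvdx ? 1.0f : 0.0f) );
    float cy = roundf( log2f( (float)mvdy + 1.0f ) * 2.0f + 0.718f + (mvdy ? 1.0f : 0.0f) );
    return (int)( cx + cy );
}

int main( void )
{
    printf( "approx cost of mvd (3,0): %d bits\n", mv_cost_scalar( 3, 0 ) );
    return 0;
}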
View file
x264-snapshot-20130224-2245.tar.bz2/common/osdep.h -> x264-snapshot-20130723-2245.tar.bz2/common/osdep.h
Changed
@@ -79,6 +79,7 @@ #else #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n))) #endif +#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 ) #define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 ) #define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 ) #define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 ) @@ -110,9 +111,26 @@ #define EXPAND(x) x +#if HAVE_32B_STACK_ALIGNMENT +#define ALIGNED_ARRAY_32( type, name, sub1, ... )\ + ALIGNED_32( type name sub1 __VA_ARGS__ ) +#else #define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) ) +#endif + #define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) ) +/* For AVX2 */ +#if ARCH_X86 || ARCH_X86_64 +#define NATIVE_ALIGN 32 +#define ALIGNED_N ALIGNED_32 +#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32 +#else +#define NATIVE_ALIGN 16 +#define ALIGNED_N ALIGNED_16 +#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16 +#endif + #define UNINIT(x) x=x #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0) @@ -204,6 +222,25 @@ #define x264_threading_init() 0 #endif +static ALWAYS_INLINE int x264_pthread_fetch_and_add( int *val, int add, x264_pthread_mutex_t *mutex ) +{ +#if HAVE_THREAD +#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ > 0) && ARCH_X86 + return __sync_fetch_and_add( val, add ); +#else + x264_pthread_mutex_lock( mutex ); + int res = *val; + *val += add; + x264_pthread_mutex_unlock( mutex ); + return res; +#endif +#else + int res = *val; + *val += add; + return res; +#endif +} + #define WORD_SIZE sizeof(void*) #define asm __asm__ @@ -254,6 +291,13 @@ } #endif +/* For values with 4 bits or less. */ +static int ALWAYS_INLINE x264_ctz_4bit( uint32_t x ) +{ + static uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0}; + return lut[x]; +} + #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3) #define x264_clz(x) __builtin_clz(x) #define x264_ctz(x) __builtin_ctz(x)
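The new x264_ctz_4bit() counts trailing zeros of values that fit in 4 bits via a 16-entry table, with 0 mapping to 4. A quick standalone check of that table against a naive loop:

#include <stdio.h>
#include <stdint.h>

static const uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};

static int naive_ctz4( uint32_t x )
{
    if( !x )
        return 4;
    int n = 0;
    while( !(x & 1) )
    {
        x >>= 1;
        n++;
    }
    return n;
}

int main( void )
{
    for( uint32_t x = 0; x < 16; x++ )
        if( lut[x] != naive_ctz4( x ) )
            printf( "mismatch at %u\n", x );
    return 0;
}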
View file
x264-snapshot-20130224-2245.tar.bz2/common/pixel.c -> x264-snapshot-20130723-2245.tar.bz2/common/pixel.c
Changed
@@ -370,7 +370,6 @@ return (sum+2)>>2; } - static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride ) { sum2_t tmp[32]; @@ -501,6 +500,7 @@ #if !HIGH_BIT_DEPTH SATD_X_DECL6( _sse2 ) SATD_X_DECL7( _ssse3 ) +SATD_X_DECL6( _ssse3_atom ) SATD_X_DECL7( _sse4 ) SATD_X_DECL7( _avx ) SATD_X_DECL7( _xop ) @@ -528,6 +528,7 @@ INTRA_MBCMP_8x8( sad,, _c ) INTRA_MBCMP_8x8(sa8d,, _c ) #if HIGH_BIT_DEPTH && HAVE_MMX +#define x264_predict_8x8_v_sse2 x264_predict_8x8_v_sse INTRA_MBCMP_8x8( sad, _mmx2, _c ) INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 ) #endif @@ -554,6 +555,9 @@ #if HAVE_MMX #if HIGH_BIT_DEPTH +#define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse +#define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse +#define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 ) @@ -841,6 +845,7 @@ if( cpu&X264_CPU_MMX2 ) { INIT7( sad, _mmx2 ); + INIT7_NAME( sad_aligned, sad, _mmx2 ); INIT7( sad_x3, _mmx2 ); INIT7( sad_x4, _mmx2 ); INIT8( satd, _mmx2 ); @@ -870,11 +875,14 @@ { INIT4_NAME( sad_aligned, sad, _sse2_aligned ); INIT5( ssd, _sse2 ); + INIT6( satd, _sse2 ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2; #endif pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; @@ -916,10 +924,14 @@ if( cpu&X264_CPU_SSSE3 ) { INIT4_NAME( sad_aligned, sad, _ssse3_aligned ); + pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_ssse3; + pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_ssse3; INIT7( sad, _ssse3 ); INIT7( sad_x3, _ssse3 ); INIT7( sad_x4, _ssse3 ); INIT_ADS( _ssse3 ); + INIT6( satd, _ssse3 ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3; if( !(cpu&X264_CPU_STACK_MOD4) ) { @@ -930,6 +942,9 @@ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3; +#endif pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3; @@ -937,16 +952,24 @@ } if( cpu&X264_CPU_SSE4 ) { + INIT6( satd, _sse4 ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse4; if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _sse4 ); } pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4; +#endif } if( cpu&X264_CPU_AVX ) { + INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs doesn't benefit from an aligned version */ INIT_ADS( _avx ); + INIT6( satd, _avx ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _avx ); @@ -959,12 +982,26 @@ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; pixf->ssim_end4 = x264_pixel_ssim_end4_avx; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx; +#endif } if( cpu&X264_CPU_XOP ) { pixf->vsad = x264_pixel_vsad_xop; pixf->asd8 = x264_pixel_asd8_xop; } + if( cpu&X264_CPU_AVX2 ) 
+ { + INIT2( ssd, _avx2 ); + INIT2( sad, _avx2 ); + INIT2_NAME( sad_aligned, sad, _avx2 ); + INIT2( sad_x3, _avx2 ); + INIT2( sad_x4, _avx2 ); + pixf->vsad = x264_pixel_vsad_avx2; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX @@ -1003,14 +1040,14 @@ INIT4( sad_x3, _cache32_mmx2 ); INIT4( sad_x4, _cache32_mmx2 ); } - else if( cpu&X264_CPU_CACHELINE_64 ) + else if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) ) { INIT5( sad, _cache64_mmx2 ); INIT4( sad_x3, _cache64_mmx2 ); INIT4( sad_x4, _cache64_mmx2 ); } #else - if( cpu&X264_CPU_CACHELINE_64 ) + if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) ) { pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2; pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmx2; @@ -1044,6 +1081,7 @@ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2; #endif pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2; @@ -1060,10 +1098,7 @@ pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; INIT6( satd_x3, _sse2 ); INIT6( satd_x4, _sse2 ); - if( !(cpu&X264_CPU_STACK_MOD4) ) - { - INIT4( hadamard_ac, _sse2 ); - } + INIT4( hadamard_ac, _sse2 ); INIT_ADS( _sse2 ); pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2; @@ -1113,9 +1148,9 @@ if( cpu&X264_CPU_SSSE3 ) { + INIT4( hadamard_ac, _ssse3 ); if( !(cpu&X264_CPU_STACK_MOD4) ) { - INIT4( hadamard_ac, _ssse3 ); pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_ssse3; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3; pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_ssse3; @@ -1124,7 +1159,20 @@ #endif } INIT_ADS( _ssse3 ); - if( !(cpu&X264_CPU_SLOW_ATOM) ) + if( cpu&X264_CPU_SLOW_ATOM ) + { + pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3_atom; + INIT6( satd, _ssse3_atom ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3_atom; + INIT6( satd_x3, _ssse3_atom ); + INIT6( satd_x4, _ssse3_atom ); + INIT4( hadamard_ac, _ssse3_atom ); +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom; +#endif + } + else { INIT8( ssd, _ssse3 ); pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; @@ -1132,9 +1180,13 @@ INIT8( satd, _ssse3 ); INIT7( satd_x3, _ssse3 ); INIT7( satd_x4, _ssse3 ); +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3; +#endif } pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3; - pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3; + if( !(cpu&X264_CPU_SLOW_PSHUFB) ) + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3; pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3; @@ -1147,7 +1199,13 @@ INIT2( sad_x3, _cache64_ssse3 ); INIT2( sad_x4, _cache64_ssse3 ); } - if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) ) + else + { + pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_ssse3; + pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_ssse3; + pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_ssse3; + } + if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) ) { INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */ } @@ -1158,9 +1216,9 @@ INIT8( 
satd, _sse4 ); INIT7( satd_x3, _sse4 ); INIT7( satd_x4, _sse4 ); + INIT4( hadamard_ac, _sse4 ); if( !(cpu&X264_CPU_STACK_MOD4) ) { - INIT4( hadamard_ac, _sse4 ); pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_sse4; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_sse4; pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_sse4; @@ -1171,17 +1229,21 @@ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4; +#endif } if( cpu&X264_CPU_AVX ) { + INIT2_NAME( sad_aligned, sad, _sse2 ); /* AVX-capable CPUs doesn't benefit from an aligned version */ INIT8( satd, _avx ); INIT7( satd_x3, _avx ); INIT7( satd_x4, _avx ); INIT_ADS( _avx ); + INIT4( hadamard_ac, _avx ); if( !(cpu&X264_CPU_STACK_MOD4) ) { - INIT4( hadamard_ac, _avx ); pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_avx; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx; pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx; @@ -1199,6 +1261,9 @@ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; pixf->ssim_end4 = x264_pixel_ssim_end4_avx; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx; +#endif } if( cpu&X264_CPU_XOP ) @@ -1206,9 +1271,9 @@ INIT7( satd, _xop ); INIT7( satd_x3, _xop ); INIT7( satd_x4, _xop ); + INIT4( hadamard_ac, _xop ); if( !(cpu&X264_CPU_STACK_MOD4) ) { - INIT4( hadamard_ac, _xop ); pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop; } INIT5( ssd, _xop ); @@ -1220,6 +1285,30 @@ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop; +#endif + } + + if( cpu&X264_CPU_AVX2 ) + { + INIT2( ssd, _avx2 ); + INIT2( sad_x3, _avx2 ); + INIT2( sad_x4, _avx2 ); + INIT4( satd, _avx2 ); + INIT2( hadamard_ac, _avx2 ); + INIT_ADS( _avx2 ); + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_avx2; + pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx2; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_avx2; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2; +#endif } #endif //HAVE_MMX
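All of the hunks above follow one pattern: a table of function pointers is first filled with portable C fallbacks, then overwritten with progressively better implementations as stronger CPU flags (SSE2, SSSE3, SSE4, AVX, AVX2, ...) are detected, so the last applicable assignment wins. A minimal sketch of that dispatch idea with placeholder names (none of these symbols are the real x264 ones):

#include <stdint.h>

typedef int (*satd_fn)( const uint8_t *a, intptr_t sa, const uint8_t *b, intptr_t sb );

enum { CPU_SSE2 = 1<<0, CPU_AVX2 = 1<<1 };

/* baseline: a trivial 8x8 SAD stands in for the real metric */
static int satd_c( const uint8_t *a, intptr_t sa, const uint8_t *b, intptr_t sb )
{
    int sum = 0;
    for( int y = 0; y < 8; y++, a += sa, b += sb )
        for( int x = 0; x < 8; x++ )
            sum += a[x] > b[x] ? a[x] - b[x] : b[x] - a[x];
    return sum;
}

/* stand-ins for the asm versions; in x264 these are separate optimized symbols */
static int satd_sse2( const uint8_t *a, intptr_t sa, const uint8_t *b, intptr_t sb ) { return satd_c( a, sa, b, sb ); }
static int satd_avx2( const uint8_t *a, intptr_t sa, const uint8_t *b, intptr_t sb ) { return satd_c( a, sa, b, sb ); }

static void pixel_init_sketch( uint32_t cpu, satd_fn *tab )
{
    *tab = satd_c;                          /* always-correct fallback */
    if( cpu & CPU_SSE2 ) *tab = satd_sse2;
    if( cpu & CPU_AVX2 ) *tab = satd_avx2;  /* later, stronger flags override */
}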
View file
x264-snapshot-20130224-2245.tar.bz2/common/pixel.h -> x264-snapshot-20130723-2245.tar.bz2/common/pixel.h
Changed
@@ -90,6 +90,7 @@ x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */ int (*vsad)( pixel *, intptr_t, int ); int (*asd8)( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); + uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); uint64_t (*var[4])( pixel *pix, intptr_t stride ); int (*var2[4])( pixel *pix1, intptr_t stride1,
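The new sa8d_satd entry returns two 32-bit scores packed into a single uint64_t so one asm pass can produce both metrics. A sketch of packing and unpacking such a pair (which half holds which score is an assumption here, not stated in the header):

#include <stdint.h>

static inline uint64_t pack_scores( uint32_t sa8d, uint32_t satd )
{
    return ((uint64_t)satd << 32) | sa8d;
}

static inline void unpack_scores( uint64_t packed, uint32_t *sa8d, uint32_t *satd )
{
    *sa8d = (uint32_t)packed;
    *satd = (uint32_t)(packed >> 32);
}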
View file
x264-snapshot-20130224-2245.tar.bz2/common/quant.c -> x264-snapshot-20130723-2245.tar.bz2/common/quant.c
Changed
@@ -63,6 +63,19 @@ return !!nz; } +static int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ) +{ + int nza = 0; + for( int j = 0; j < 4; j++ ) + { + int nz = 0; + for( int i = 0; i < 16; i++ ) + QUANT_ONE( dct[j][i], mf[i], bias[i] ); + nza |= (!!nz)<<j; + } + return nza; +} + static int quant_4x4_dc( dctcoef dct[16], int mf, int bias ) { int nz = 0; @@ -405,6 +418,7 @@ { pf->quant_8x8 = quant_8x8; pf->quant_4x4 = quant_4x4; + pf->quant_4x4x4 = quant_4x4x4; pf->quant_4x4_dc = quant_4x4_dc; pf->quant_2x2_dc = quant_2x2_dc; @@ -442,11 +456,6 @@ pf->denoise_dct = x264_denoise_dct_mmx; pf->decimate_score15 = x264_decimate_score15_mmx2; pf->decimate_score16 = x264_decimate_score16_mmx2; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz; - pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz; - } pf->decimate_score64 = x264_decimate_score64_mmx2; pf->coeff_last8 = x264_coeff_last8_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; @@ -464,6 +473,7 @@ if( cpu&X264_CPU_SSE2 ) { pf->quant_4x4 = x264_quant_4x4_sse2; + pf->quant_4x4x4 = x264_quant_4x4x4_sse2; pf->quant_8x8 = x264_quant_8x8_sse2; pf->quant_2x2_dc = x264_quant_2x2_dc_sse2; pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; @@ -474,11 +484,6 @@ pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; pf->decimate_score64 = x264_decimate_score64_sse2; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_sse2_slowctz; - pf->decimate_score16 = x264_decimate_score16_sse2_slowctz; - } pf->coeff_last8 = x264_coeff_last8_sse2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2; @@ -501,17 +506,13 @@ if( cpu&X264_CPU_SSSE3 ) { pf->quant_4x4 = x264_quant_4x4_ssse3; + pf->quant_4x4x4 = x264_quant_4x4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3; pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; pf->decimate_score15 = x264_decimate_score15_ssse3; pf->decimate_score16 = x264_decimate_score16_ssse3; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz; - pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz; - } pf->decimate_score64 = x264_decimate_score64_ssse3; INIT_TRELLIS( ssse3 ); } @@ -520,6 +521,7 @@ pf->quant_2x2_dc = x264_quant_2x2_dc_sse4; pf->quant_4x4_dc = x264_quant_4x4_dc_sse4; pf->quant_4x4 = x264_quant_4x4_sse4; + pf->quant_4x4x4 = x264_quant_4x4x4_sse4; pf->quant_8x8 = x264_quant_8x8_sse4; } if( cpu&X264_CPU_AVX ) @@ -535,6 +537,17 @@ pf->dequant_8x8 = x264_dequant_8x8_xop; } } + if( cpu&X264_CPU_AVX2 ) + { + pf->quant_4x4 = x264_quant_4x4_avx2; + pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; + pf->quant_8x8 = x264_quant_8x8_avx2; + pf->quant_4x4x4 = x264_quant_4x4x4_avx2; + pf->dequant_4x4 = x264_dequant_4x4_avx2; + pf->dequant_8x8 = x264_dequant_8x8_avx2; + pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; + pf->denoise_dct = x264_denoise_dct_avx2; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX @@ -543,6 +556,7 @@ { #if ARCH_X86 pf->quant_4x4 = x264_quant_4x4_mmx; + pf->quant_4x4x4 = x264_quant_4x4x4_mmx; pf->quant_8x8 = x264_quant_8x8_mmx; pf->dequant_4x4 = x264_dequant_4x4_mmx; pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2; @@ -563,11 +577,6 @@ pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2; pf->decimate_score15 = x264_decimate_score15_mmx2; pf->decimate_score16 = x264_decimate_score16_mmx2; - if( 
cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz; - pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz; - } pf->decimate_score64 = x264_decimate_score64_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2; @@ -592,6 +601,7 @@ { pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; pf->quant_4x4 = x264_quant_4x4_sse2; + pf->quant_4x4x4 = x264_quant_4x4x4_sse2; pf->quant_8x8 = x264_quant_8x8_sse2; pf->dequant_4x4 = x264_dequant_4x4_sse2; pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2; @@ -606,11 +616,6 @@ pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; pf->decimate_score64 = x264_decimate_score64_sse2; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_sse2_slowctz; - pf->decimate_score16 = x264_decimate_score16_sse2_slowctz; - } pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2; @@ -631,18 +636,25 @@ pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3; pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->quant_4x4 = x264_quant_4x4_ssse3; + pf->quant_4x4x4 = x264_quant_4x4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; pf->decimate_score15 = x264_decimate_score15_ssse3; pf->decimate_score16 = x264_decimate_score16_ssse3; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz; - pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz; - } pf->decimate_score64 = x264_decimate_score64_ssse3; INIT_TRELLIS( ssse3 ); + pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; + pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3; + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; + pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt; + } } if( cpu&X264_CPU_SSE4 ) @@ -673,6 +685,30 @@ pf->dequant_8x8 = x264_dequant_8x8_xop; } } + + if( cpu&X264_CPU_AVX2 ) + { + pf->quant_4x4 = x264_quant_4x4_avx2; + pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; + pf->quant_8x8 = x264_quant_8x8_avx2; + pf->quant_4x4x4 = x264_quant_4x4x4_avx2; + pf->dequant_4x4 = x264_dequant_4x4_avx2; + pf->dequant_8x8 = x264_dequant_8x8_avx2; + pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; + if( h->param.i_cqm_preset == X264_CQM_FLAT ) + { + pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2; + pf->dequant_8x8 = x264_dequant_8x8_flat16_avx2; + } + pf->decimate_score64 = x264_decimate_score64_avx2; + pf->denoise_dct = x264_denoise_dct_avx2; + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt; + } + } #endif // HAVE_MMX #if HAVE_ALTIVEC @@ -696,6 +732,7 @@ pf->quant_2x2_dc = x264_quant_2x2_dc_neon; pf->quant_4x4 = x264_quant_4x4_neon; pf->quant_4x4_dc = x264_quant_4x4_dc_neon; + pf->quant_4x4x4 = x264_quant_4x4x4_neon; pf->quant_8x8 = x264_quant_8x8_neon; pf->dequant_4x4 = x264_dequant_4x4_neon; pf->dequant_4x4_dc = 
x264_dequant_4x4_dc_neon;
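The new quant_4x4x4 hook quantizes four 4x4 blocks in one call and returns a 4-bit mask, with bit j set when block j keeps at least one nonzero coefficient. A toy standalone illustration of that return convention (the divide stands in for the real multiply-and-shift quantizer):

#include <stdio.h>

static int toy_quant_4x4x4( int dct[4][16], int qstep )
{
    int nza = 0;
    for( int j = 0; j < 4; j++ )
    {
        int nz = 0;
        for( int i = 0; i < 16; i++ )
        {
            dct[j][i] /= qstep;
            nz |= dct[j][i];
        }
        nza |= (!!nz) << j;
    }
    return nza;
}

int main( void )
{
    int dct[4][16] = { { 9 }, { 0 }, { 25 }, { 3 } };         /* remaining entries are zero */
    printf( "nz mask = 0x%x\n", toy_quant_4x4x4( dct, 8 ) );  /* blocks 0 and 2 survive -> 0x5 */
    return 0;
}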
View file
x264-snapshot-20130224-2245.tar.bz2/common/quant.h -> x264-snapshot-20130723-2245.tar.bz2/common/quant.h
Changed
@@ -29,8 +29,9 @@ typedef struct { - int (*quant_8x8)( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); - int (*quant_4x4)( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); + int (*quant_8x8) ( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); + int (*quant_4x4) ( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); + int (*quant_4x4x4)( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int (*quant_4x4_dc)( dctcoef dct[16], int mf, int bias ); int (*quant_2x2_dc)( dctcoef dct[4], int mf, int bias );
View file
x264-snapshot-20130224-2245.tar.bz2/common/set.c -> x264-snapshot-20130723-2245.tar.bz2/common/set.c
Changed
@@ -85,44 +85,49 @@ int max_qp_err = -1; int max_chroma_qp_err = -1; int min_qp_err = QP_MAX+1; - int num_8x8_lists = h->sps->i_chroma_format_idc == CHROMA_444 ? 4 : 2; /* Checkasm may segfault if optimized out by --chroma-format */ + int num_8x8_lists = h->sps->i_chroma_format_idc == CHROMA_444 ? 4 + : h->param.analyse.b_transform_8x8 ? 2 : 0; /* Checkasm may segfault if optimized out by --chroma-format */ - for( int i = 0; i < 4 + num_8x8_lists; i++ ) - { - int size = i<4 ? 16 : 64; - int j; - for( j = (i<4 ? 0 : 4); j < i; j++ ) - if( !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) ) - break; - if( j < i ) - { - h-> quant4_mf[i] = h-> quant4_mf[j]; - h->dequant4_mf[i] = h->dequant4_mf[j]; - h->unquant4_mf[i] = h->unquant4_mf[j]; - } - else - { - CHECKED_MALLOC( h-> quant4_mf[i], (QP_MAX+1)*size*sizeof(udctcoef) ); - CHECKED_MALLOC( h->dequant4_mf[i], 6*size*sizeof(int) ); - CHECKED_MALLOC( h->unquant4_mf[i], (QP_MAX+1)*size*sizeof(int) ); - } - - for( j = (i<4 ? 0 : 4); j < i; j++ ) - if( deadzone[j&3] == deadzone[i&3] && - !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) ) - break; - if( j < i ) - { - h->quant4_bias[i] = h->quant4_bias[j]; - h->quant4_bias0[i] = h->quant4_bias0[j]; - } - else - { - CHECKED_MALLOC( h->quant4_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) ); - CHECKED_MALLOC( h->quant4_bias0[i], (QP_MAX+1)*size*sizeof(udctcoef) ); - } +#define CQM_ALLOC( w, count )\ + for( int i = 0; i < count; i++ )\ + {\ + int size = w*w;\ + int start = w == 8 ? 4 : 0;\ + int j;\ + for( j = 0; j < i; j++ )\ + if( !memcmp( h->pps->scaling_list[i+start], h->pps->scaling_list[j+start], size*sizeof(uint8_t) ) )\ + break;\ + if( j < i )\ + {\ + h-> quant##w##_mf[i] = h-> quant##w##_mf[j];\ + h->dequant##w##_mf[i] = h->dequant##w##_mf[j];\ + h->unquant##w##_mf[i] = h->unquant##w##_mf[j];\ + }\ + else\ + {\ + CHECKED_MALLOC( h-> quant##w##_mf[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + CHECKED_MALLOC( h->dequant##w##_mf[i], 6*size*sizeof(int) );\ + CHECKED_MALLOC( h->unquant##w##_mf[i], (QP_MAX+1)*size*sizeof(int) );\ + }\ + for( j = 0; j < i; j++ )\ + if( deadzone[j] == deadzone[i] &&\ + !memcmp( h->pps->scaling_list[i+start], h->pps->scaling_list[j+start], size*sizeof(uint8_t) ) )\ + break;\ + if( j < i )\ + {\ + h->quant##w##_bias[i] = h->quant##w##_bias[j];\ + h->quant##w##_bias0[i] = h->quant##w##_bias0[j];\ + }\ + else\ + {\ + CHECKED_MALLOC( h->quant##w##_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + CHECKED_MALLOC( h->quant##w##_bias0[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + }\ } + CQM_ALLOC( 4, 4 ) + CQM_ALLOC( 8, num_8x8_lists ) + for( int q = 0; q < 6; q++ ) { for( int i = 0; i < 16; i++ ) @@ -204,6 +209,9 @@ for( int cat = 0; cat < 3 + CHROMA444; cat++ ) { int dct8x8 = cat&1; + if( !h->param.analyse.b_transform_8x8 && dct8x8 ) + continue; + int size = dct8x8 ? 64 : 16; udctcoef *nr_offset = h->nr_offset_emergency[q][cat]; /* Denoise chroma first (due to h264's chroma QP offset), then luma, then DC. */
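The CQM_ALLOC macro above shares tables between scaling lists: before allocating buffers for list i it memcmp()s the list against every earlier one and, on a match, reuses the existing pointers instead of allocating duplicates. A compact C sketch of that share-or-allocate step (names and the fixed 4x4 list size are illustrative):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static int *alloc_or_share( uint8_t lists[][16], int *tables[], int i, int table_size )
{
    for( int j = 0; j < i; j++ )
        if( !memcmp( lists[i], lists[j], 16 ) )
            return tables[j];                   /* identical scaling list: share */
    return calloc( table_size, sizeof(int) );   /* otherwise allocate a new table */
}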
View file
x264-snapshot-20130224-2245.tar.bz2/common/win32thread.c -> x264-snapshot-20130723-2245.tar.bz2/common/win32thread.c
Changed
@@ -279,7 +279,7 @@ memset( &thread_control, 0, sizeof(x264_win32thread_control_t) ); } -int x264_pthread_num_processors_np() +int x264_pthread_num_processors_np( void ) { DWORD_PTR system_cpus, process_cpus = 0; int cpus = 0;
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/bitstream-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/bitstream-a.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2010-2013 x264 project ;* ;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -32,100 +32,105 @@ ;----------------------------------------------------------------------------- ; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end ) ;----------------------------------------------------------------------------- - %macro NAL_LOOP 2 -%1_escape: +%%escape: ; Detect false positive to avoid unneccessary escape loop xor r3d, r3d cmp byte [r0+r1-1], 0 setnz r3b - xor r3d, r4d + xor k3, k4 jnz .escape - jmp %1_continue + jmp %%continue ALIGN 16 %1: - pcmpeqb m3, m1, m4 - pcmpeqb m2, m0, m4 - pmovmskb r3d, m3 - %2 [r0+r1], m0 + mova [r0+r1+mmsize], m1 + pcmpeqb m1, m0 + mova [r0+r1], m2 + pcmpeqb m2, m0 + pmovmskb r3d, m1 + %2 m1, [r1+r2+3*mmsize] pmovmskb r4d, m2 - shl r3d, mmsize - mova m0, [r1+r2+2*mmsize] - or r4d, r3d - %2 [r0+r1+mmsize], m1 - lea r3d, [r4+r4+1] - mova m1, [r1+r2+3*mmsize] - and r4d, r3d - jnz %1_escape -%1_continue: + %2 m2, [r1+r2+2*mmsize] + shl k3, mmsize + or k3, k4 + lea k4, [2*r3+1] + and k4, k3 + jnz %%escape +%%continue: add r1, 2*mmsize jl %1 %endmacro %macro NAL_ESCAPE 0 +%if mmsize == 32 + %xdefine k3 r3 + %xdefine k4 r4 +%else + %xdefine k3 r3d + %xdefine k4 r4d +%endif cglobal nal_escape, 3,5 - mov r3w, [r1] + movzx r3d, byte [r1] sub r1, r2 ; r1 = offset of current src pointer from end of src - pxor m4, m4 + pxor m0, m0 + mov [r0], r3b sub r0, r1 ; r0 = projected end of dst, assuming no more escapes - mov [r0+r1], r3w - add r1, 2 - jge .ret + or r3d, 0xffffff00 ; ignore data before src - ; Start off by jumping into the escape loop in - ; case there's an escape at the start. - ; And do a few more in scalar until src is aligned again. - jmp .first_escape + ; Start off by jumping into the escape loop in case there's an escape at the start. + ; And do a few more in scalar until dst is aligned. + jmp .escape_loop +%if mmsize == 16 NAL_LOOP .loop_aligned, mova -%if mmsize==16 jmp .ret - NAL_LOOP .loop_unaligned, movu %endif + NAL_LOOP .loop_unaligned, movu .ret: movifnidn rax, r0 RET -ALIGN 16 .escape: ; Skip bytes that are known to be valid - and r4d, r3d - tzcnt r3d, r4d - add r1, r3 + and k4, k3 + tzcnt k4, k4 + xor r3d, r3d ; the last two bytes are known to be zero + add r1, r4 .escape_loop: inc r1 jge .ret -.first_escape: - movzx r3d, byte [r1+r2] - lea r4, [r1+r2] - cmp r3d, 3 - jna .escape_check -.no_escape: + movzx r4d, byte [r1+r2] + shl r3d, 8 + or r3d, r4d + test r3d, 0xfffffc ; if the last two bytes are 0 and the current byte is <=3 + jz .add_escape_byte +.escaped: + lea r4d, [r0+r1] mov [r0+r1], r3b - test r4d, mmsize-1 ; Do SIMD when src is aligned + test r4d, mmsize-1 ; Do SIMD when dst is aligned jnz .escape_loop - mova m0, [r4] - mova m1, [r4+mmsize] -%if mmsize==16 - lea r4d, [r0+r1] + movu m1, [r1+r2+mmsize] + movu m2, [r1+r2] +%if mmsize == 16 + lea r4d, [r1+r2] test r4d, mmsize-1 - jnz .loop_unaligned + jz .loop_aligned %endif - jmp .loop_aligned + jmp .loop_unaligned -ALIGN 16 -.escape_check: - cmp word [r0+r1-2], 0 - jnz .no_escape +.add_escape_byte: mov byte [r0+r1], 3 - inc r0 - jmp .no_escape + inc r0 + or r3d, 0x0300 + jmp .escaped %endmacro INIT_MMX mmx2 NAL_ESCAPE INIT_XMM sse2 NAL_ESCAPE -INIT_XMM avx +%if ARCH_X86_64 +INIT_YMM avx2 NAL_ESCAPE +%endif
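The rewritten nal_escape above still implements standard H.264 emulation prevention: whenever two consecutive zero bytes are followed by a byte in 0x00..0x03, an extra 0x03 byte is inserted so the payload can never mimic a start code. A plain scalar reference of that rule (a sketch, not the function the asm is derived from):

#include <stdint.h>

static uint8_t *nal_escape_ref( uint8_t *dst, const uint8_t *src, const uint8_t *end )
{
    while( src < end )
    {
        if( src + 2 < end && !src[0] && !src[1] && src[2] <= 3 )
        {
            *dst++ = 0;
            *dst++ = 0;
            *dst++ = 3;   /* emulation prevention byte */
            src += 2;     /* the 0x00..0x03 byte is copied on the next pass */
        }
        else
            *dst++ = *src++;
    }
    return dst;
}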
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/cabac-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/cabac-a.asm
Changed
@@ -26,22 +26,69 @@ ;***************************************************************************** %include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +coeff_abs_level1_ctx: db 1, 2, 3, 4, 0, 0, 0, 0 +coeff_abs_levelgt1_ctx: db 5, 5, 5, 5, 6, 7, 8, 9 +coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7 + db 4, 4, 4, 4, 5, 6, 7, 7 + +%if ARCH_X86_64 +%macro COEFF_LAST_TABLE 17 + %define funccpu1 %1 + %define funccpu2 %2 + %define funccpu3 %3 + %rep 14 + %ifidn %4, 4 + dq mangle(x264_coeff_last%4_ %+ funccpu1) + %elifidn %4, 64 + dq mangle(x264_coeff_last%4_ %+ funccpu2) + %else + dq mangle(x264_coeff_last%4_ %+ funccpu3) + %endif + %rotate 1 + %endrep +%endmacro + +cextern coeff_last4_mmx2 +cextern coeff_last4_mmx2_lzcnt +cextern coeff_last15_sse2 +cextern coeff_last15_sse2_lzcnt +cextern coeff_last16_sse2 +cextern coeff_last16_sse2_lzcnt +cextern coeff_last64_sse2 +cextern coeff_last64_sse2_lzcnt +cextern coeff_last64_avx2_lzcnt + +%ifdef PIC +SECTION .data +%endif +coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +%endif SECTION .text cextern cabac_range_lps cextern cabac_transition cextern cabac_renorm_shift +cextern cabac_entropy +cextern cabac_size_unary +cextern cabac_transition_unary +cextern significant_coeff_flag_offset +cextern significant_coeff_flag_offset_8x8 +cextern last_coeff_flag_offset +cextern last_coeff_flag_offset_8x8 +cextern coeff_abs_level_m1_offset +cextern count_cat_m1 +cextern cabac_encode_ue_bypass -; t3 must be ecx, since it's used for shift. -%if WIN64 - DECLARE_REG_TMP 3,1,2,0,6,5,4,2 - %define pointer resq -%elif ARCH_X86_64 - DECLARE_REG_TMP 0,1,2,3,4,5,6,6 +%if ARCH_X86_64 %define pointer resq %else - DECLARE_REG_TMP 0,4,2,1,3,5,6,2 %define pointer resd %endif @@ -58,24 +105,34 @@ .state: resb 1024 endstruc -%macro LOAD_GLOBAL 4 +%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp %ifdef PIC - ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea - lea r7, [%2] - %ifnidn %3, 0 - add r7, %3 + %ifidn %4, 0 + movzx %1, byte [%2+%3+r7-$$] + %else + lea %5, [r7+%4] + movzx %1, byte [%2+%3+%5-$$] %endif - movzx %1, byte [r7+%4] %else movzx %1, byte [%2+%3+%4] %endif %endmacro -cglobal cabac_encode_decision_asm, 0,7 - movifnidn t0, r0mp +%macro CABAC 1 +; t3 must be ecx, since it's used for shift. 
+%if WIN64 + DECLARE_REG_TMP 3,1,2,0,5,6,4,4 +%elif ARCH_X86_64 + DECLARE_REG_TMP 0,1,2,3,4,5,6,6 +%else + DECLARE_REG_TMP 0,4,2,1,3,5,6,2 +%endif + +cglobal cabac_encode_decision_%1, 1,7 movifnidn t1d, r1m - mov t5d, [t0+cb.range] - movzx t6d, byte [t0+cb.state+t1] + mov t5d, [r0+cb.range] + movzx t6d, byte [r0+cb.state+t1] + movifnidn t0, r0 ; WIN64 mov t4d, ~1 mov t3d, t5d and t4d, t6d @@ -84,8 +141,11 @@ %if WIN64 PUSH r7 %endif - LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2 - LOAD_GLOBAL t4d, cabac_transition, t2, t6*2 +%ifdef PIC + lea r7, [$$] +%endif + LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4 + LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4 and t6d, 1 sub t3d, t5d cmp t6d, t2d @@ -96,66 +156,82 @@ mov [t0+cb.state+t1], t4b ;cabac_encode_renorm mov t4d, t3d +%ifidn %1, bmi2 + lzcnt t3d, t3d + sub t3d, 23 + shlx t4d, t4d, t3d + shlx t6d, t6d, t3d +%else shr t3d, 3 - LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3 + LOAD_GLOBAL t3d, cabac_renorm_shift, t3 + shl t4d, t3b + shl t6d, t3b +%endif %if WIN64 POP r7 %endif - shl t4d, t3b - shl t6d, t3b mov [t0+cb.range], t4d add t3d, [t0+cb.queue] - jge cabac_putbyte + jge cabac_putbyte_%1 .update_queue_low: mov [t0+cb.low], t6d mov [t0+cb.queue], t3d RET -cglobal cabac_encode_bypass_asm, 0,3 - movifnidn t0, r0mp - movifnidn t3d, r1m - mov t7d, [t0+cb.low] - and t3d, [t0+cb.range] - lea t7d, [t7*2+t3] - mov t3d, [t0+cb.queue] +cglobal cabac_encode_bypass_%1, 2,3 + mov t7d, [r0+cb.low] + and r1d, [r0+cb.range] + lea t7d, [t7*2+r1] + movifnidn t0, r0 ; WIN64 + mov t3d, [r0+cb.queue] inc t3d -%if UNIX64 ; .putbyte compiles to nothing but a jmp - jge cabac_putbyte +%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp + jge cabac_putbyte_%1 %else jge .putbyte %endif mov [t0+cb.low], t7d mov [t0+cb.queue], t3d RET +%if ARCH_X86_64 == 0 .putbyte: PROLOGUE 0,7 movifnidn t6d, t7d - jmp cabac_putbyte + jmp cabac_putbyte_%1 +%endif -cglobal cabac_encode_terminal_asm, 0,3 - movifnidn t0, r0mp - sub dword [t0+cb.range], 2 +%ifnidn %1,bmi2 +cglobal cabac_encode_terminal_%1, 1,3 + sub dword [r0+cb.range], 2 ; shortcut: the renormalization shift in terminal ; can only be 0 or 1 and is zero over 99% of the time. - test dword [t0+cb.range], 0x100 + test dword [r0+cb.range], 0x100 je .renorm RET .renorm: - shl dword [t0+cb.low], 1 - shl dword [t0+cb.range], 1 - inc dword [t0+cb.queue] + shl dword [r0+cb.low], 1 + shl dword [r0+cb.range], 1 + inc dword [r0+cb.queue] jge .putbyte RET .putbyte: PROLOGUE 0,7 - mov t3d, [t0+cb.queue] + movifnidn t0, r0 ; WIN64 + mov t3d, [r0+cb.queue] mov t6d, [t0+cb.low] +%endif -cabac_putbyte: +cabac_putbyte_%1: ; alive: t0=cb t3=queue t6=low %if WIN64 DECLARE_REG_TMP 3,6,1,0,2,5,4 %endif +%ifidn %1, bmi2 + add t3d, 10 + shrx t2d, t6d, t3d + bzhi t6d, t6d, t3d + sub t3d, 18 +%else mov t1d, -1 add t3d, 10 mov t2d, t6d @@ -164,6 +240,7 @@ not t1d sub t3d, 18 and t6d, t1d +%endif mov t5d, [t0+cb.bytes_outstanding] cmp t2b, 0xff ; FIXME is a 32bit op faster? jz .postpone @@ -180,4 +257,500 @@ .postpone: inc t5d mov [t0+cb.bytes_outstanding], t5d - jmp mangle(x264_cabac_encode_decision_asm.update_queue_low) + jmp mangle(x264_cabac_encode_decision_%1.update_queue_low) +%endmacro + +CABAC asm +CABAC bmi2 + +; %1 = label name +; %2 = node_ctx init? 
+%macro COEFF_ABS_LEVEL_GT1 2 +%if %2 + %define ctx 1 +%else + movzx r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL] + %define ctx r11 +%endif + movzx r9d, byte [r8+ctx] +; if( coeff_abs > 1 ) + cmp r1d, 1 + jg .%1_gt1 +; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 ) + movzx r10d, byte [cabac_transition+r9*2 GLOBAL] + movzx r9d, word [cabac_entropy+r9*2 GLOBAL] + lea r0d, [r0+r9+256] + mov [r8+ctx], r10b +%if %2 + mov r2d, 1 +%else + movzx r2d, byte [coeff_abs_level_transition+r2 GLOBAL] +%endif + jmp .%1_end + +.%1_gt1: +; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 ) + movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL] + xor r9d, 1 + movzx r9d, word [cabac_entropy+r9*2 GLOBAL] + mov [r8+ctx], r10b + add r0d, r9d +%if %2 + %define ctx 5 +%else + movzx r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL] + %define ctx r11 +%endif +; if( coeff_abs < 15 ) + cmp r1d, 15 + jge .%1_escape + shl r1d, 7 +; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]] + movzx r9d, byte [r8+ctx] + add r9d, r1d + movzx r10d, byte [cabac_transition_unary-128+r9 GLOBAL] +; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]] + movzx r9d, word [cabac_size_unary-256+r9*2 GLOBAL] + mov [r8+ctx], r10b + add r0d, r9d + jmp .%1_gt1_end + +.%1_escape: +; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]] + movzx r9d, byte [r8+ctx] + movzx r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL] +; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]] + movzx r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL] + add r0d, r9d + mov [r8+ctx], r10b + sub r1d, 14 +%if cpuflag(lzcnt) + lzcnt r9d, r1d + xor r9d, 0x1f +%else + bsr r9d, r1d +%endif +; bs_size_ue_big(coeff_abs-15)<<8 + shl r9d, 9 +; (ilog2(coeff_abs-14)+1) << 8 + lea r0d, [r0+r9+256] +.%1_gt1_end: +%if %2 + mov r2d, 4 +%else + movzx r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL] +%endif +.%1_end: +%endmacro + +%macro LOAD_DCTCOEF 1 +%if HIGH_BIT_DEPTH + mov %1, [dct+r6*4] +%else + movzx %1, word [dct+r6*2] +%endif +%endmacro + +%macro ABS_DCTCOEFS 2 +%assign i 0 +%rep %2/16 +%if HIGH_BIT_DEPTH + ABSD m0, [%1+ 0+i*64], m4 + ABSD m1, [%1+16+i*64], m5 + ABSD m2, [%1+32+i*64], m4 + ABSD m3, [%1+48+i*64], m5 + mova [rsp+ 0+i*64], m0 + mova [rsp+16+i*64], m1 + mova [rsp+32+i*64], m2 + mova [rsp+48+i*64], m3 +%else + ABSW m0, [%1+ 0+i*32], m2 + ABSW m1, [%1+16+i*32], m3 + mova [rsp+ 0+i*32], m0 + mova [rsp+16+i*32], m1 +%endif +%assign i i+1 +%endrep +%endmacro + +%macro SIG_OFFSET 1 +%if %1 + movzx r11d, byte [r4+r6] +%endif +%endmacro + +%macro LAST_OFFSET 1 +%if %1 + movzx r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL] +%endif +%endmacro + +;----------------------------------------------------------------------------- +; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, +; int ctx_block_cat, x264_cabac_t *cb ); +;----------------------------------------------------------------------------- + +;%1 = 8x8 mode +%macro CABAC_RESIDUAL_RD 2 +%if %1 + %define func cabac_block_residual_8x8_rd_internal + %define maxcoeffs 64 + %define dct rsp +%else + %define func cabac_block_residual_rd_internal + %define maxcoeffs 16 + %define dct r4 +%endif + +%ifdef PIC + cglobal func, 4,13 + lea r12, [$$] + %define GLOBAL +r12-$$ +%else + cglobal func, 4,12 + %define GLOBAL +%endif + +%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15) + SUB rsp, pad + shl r1d, 4 ; MB_INTERLACED*16 +%if %1 + lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8 +%endif + add r1d, r2d + movzx r5d, word 
[significant_coeff_flag_offset+r1*2 GLOBAL] ; r5 = ctx_sig + movzx r7d, word [last_coeff_flag_offset+r1*2 GLOBAL] ; r7 = ctx_last + movzx r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL] ; r8 = ctx_level + +; abs() all the coefficients; copy them to the stack to avoid +; changing the originals. +; overreading is okay; it's all valid aligned data anyways. +%if %1 + ABS_DCTCOEFS r0, 64 +%else + mov r4, r0 ; r4 = dct + mov r6, ~SIZEOF_DCTCOEF + and r6, r4 ; handle AC coefficient case + ABS_DCTCOEFS r6, 16 + sub r4, r6 ; calculate our new dct pointer + add r4, rsp ; restore AC coefficient offset +%endif + mov r1, [%2+gprsize*r2 GLOBAL] +; for improved OOE performance, run coeff_last on the original coefficients. + call r1 ; coeff_last[ctx_block_cat]( dct ) +; we know on 64-bit that the SSE2 versions of this function only +; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we +; don't need r2 in 8x8 mode. + mov r0d, [r3+cb.bits_encoded] ; r0 = cabac.f8_bits_encoded +; pre-add some values to simplify addressing + add r3, cb.state + add r5, r3 + add r7, r3 + add r8, r3 ; precalculate cabac state pointers + +; if( last != count_cat_m1[ctx_block_cat] ) +%if %1 + cmp r6b, 63 +%else + cmp r6b, [count_cat_m1+r2 GLOBAL] +%endif + je .skip_last_sigmap + +; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last, +; so we'll use r11 for this. +%if %1 + %define siglast_ctx r11 +%else + %define siglast_ctx r6 +%endif + +; x264_cabac_encode_decision( cb, ctx_sig + last, 1 ) +; x264_cabac_encode_decision( cb, ctx_last + last, 1 ) + SIG_OFFSET %1 + movzx r1d, byte [r5+siglast_ctx] + movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL] + xor r1d, 1 + movzx r1d, word [cabac_entropy+r1*2 GLOBAL] + mov [r5+siglast_ctx], r9b + add r0d, r1d + + LAST_OFFSET %1 + movzx r1d, byte [r7+siglast_ctx] + movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL] + xor r1d, 1 + movzx r1d, word [cabac_entropy+r1*2 GLOBAL] + mov [r7+siglast_ctx], r9b + add r0d, r1d +.skip_last_sigmap: + LOAD_DCTCOEF r1d + COEFF_ABS_LEVEL_GT1 last, 1 +; for( int i = last-1 ; i >= 0; i-- ) + dec r6d + jl .end +.coeff_loop: + LOAD_DCTCOEF r1d +; if( l[i] ) + SIG_OFFSET %1 + movzx r9d, byte [r5+siglast_ctx] + test r1d, r1d + jnz .coeff_nonzero +; x264_cabac_encode_decision( cb, ctx_sig + i, 0 ) + movzx r10d, byte [cabac_transition+r9*2 GLOBAL] + movzx r9d, word [cabac_entropy+r9*2 GLOBAL] + mov [r5+siglast_ctx], r10b + add r0d, r9d + dec r6d + jge .coeff_loop + jmp .end +.coeff_nonzero: +; x264_cabac_encode_decision( cb, ctx_sig + i, 1 ) + movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL] + xor r9d, 1 + movzx r9d, word [cabac_entropy+r9*2 GLOBAL] + mov [r5+siglast_ctx], r10b + add r0d, r9d +; x264_cabac_encode_decision( cb, ctx_last + i, 0 ); + LAST_OFFSET %1 + movzx r9d, byte [r7+siglast_ctx] + movzx r10d, byte [cabac_transition+r9*2 GLOBAL] + movzx r9d, word [cabac_entropy+r9*2 GLOBAL] + mov [r7+siglast_ctx], r10b + add r0d, r9d + COEFF_ABS_LEVEL_GT1 coeff, 0 + dec r6d + jge .coeff_loop +.end: + mov [r3+cb.bits_encoded-cb.state], r0d + ADD rsp, pad + RET +%endmacro + +%if ARCH_X86_64 +INIT_XMM sse2 +CABAC_RESIDUAL_RD 0, coeff_last_sse2 +CABAC_RESIDUAL_RD 1, coeff_last_sse2 +INIT_XMM sse2,lzcnt +CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt +CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt +INIT_XMM ssse3 +CABAC_RESIDUAL_RD 0, coeff_last_sse2 +CABAC_RESIDUAL_RD 1, coeff_last_sse2 +INIT_XMM ssse3,lzcnt +CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt +CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt +%endif + 
+;----------------------------------------------------------------------------- +; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, +; int ctx_block_cat, x264_cabac_t *cb ); +;----------------------------------------------------------------------------- + +%macro CALL_CABAC 0 +%if cpuflag(bmi2) + call cabac_encode_decision_bmi2 +%else + call cabac_encode_decision_asm +%endif +%if WIN64 ; move cabac back + mov r0, r3 +%endif +%endmacro + +; %1 = 8x8 mode +; %2 = dct register +; %3 = countcat +; %4 = name +%macro SIGMAP_LOOP 3-4 +.sigmap_%4loop: +%if HIGH_BIT_DEPTH + mov %2, [dct+r10*4] +%else + movsx %2, word [dct+r10*2] +%endif +%if %1 + movzx r1d, byte [sigoff_8x8 + r10] + add r1d, sigoffd +%else + lea r1d, [sigoffd + r10d] +%endif + test %2, %2 + jz .sigmap_%4zero ; if( l[i] ) + inc coeffidxd + mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i]; + mov r2d, 1 + CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 ); +%if %1 + movzx r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL] + add r1d, lastoffd +%else + lea r1d, [lastoffd + r10d] +%endif + cmp r10d, lastm ; if( i == last ) + je .sigmap_%4last + xor r2d, r2d + CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 ); + jmp .sigmap_%4loop_endcheck +.sigmap_%4zero: + xor r2d, r2d + CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 ); +.sigmap_%4loop_endcheck: + inc r10d + cmp r10d, %3 + jne .sigmap_%4loop ; if( ++i == count_m1 ) +%if HIGH_BIT_DEPTH + mov %2, [dct+r10*4] +%else + movsx %2, word [dct+r10*2] +%endif + inc coeffidxd + mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i] + jmp .sigmap_%4end +.sigmap_%4last: ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 ); + mov r2d, 1 + CALL_CABAC +.sigmap_%4end: +%if %1==0 + jmp .level_loop_start +%endif +%endmacro + +%macro CABAC_RESIDUAL 1 +cglobal cabac_block_residual_internal, 4,15 +%ifdef PIC +; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register. + lea r7, [$$] + %define lastm [rsp+4*1] + %define GLOBAL +r7-$$ +%else + %define lastm r7d + %define GLOBAL +%endif +%assign pad gprsize+4*2+4*64-(stack_offset&15) + SUB rsp, pad + shl r1d, 4 + + %define sigoffq r8 + %define sigoffd r8d + %define lastoffq r9 + %define lastoffd r9d + %define leveloffq r10 + %define leveloffd r10d + %define leveloffm [rsp+4*0] + %define countcatd r11d + %define sigoff_8x8 r12 + %define coeffidxq r13 + %define coeffidxd r13d + %define dct r14 + %define coeffs rsp+4*2 + + lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] + add r1d, r2d + movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL] + movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL] + movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL] + movzx countcatd, byte [count_cat_m1+r2 GLOBAL] + mov coeffidxd, -1 + mov dct, r0 + mov leveloffm, leveloffd + + mov r1, [%1+gprsize*r2 GLOBAL] + call r1 + mov lastm, eax +; put cabac in r0; needed for cabac_encode_decision + mov r0, r3 + + xor r10d, r10d + cmp countcatd, 63 + je .sigmap_8x8 + SIGMAP_LOOP 0, r12d, countcatd, +.sigmap_8x8: + SIGMAP_LOOP 1, r11d, 63, _8x8 +.level_loop_start: +; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop. 
+ %define nodectxq r8 + %define nodectxd r8d + mov leveloffd, leveloffm + xor nodectxd, nodectxd +.level_loop: + mov r9d, [coeffs+coeffidxq*4] + mov r11d, r9d + sar r11d, 31 + add r9d, r11d + movzx r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL] + xor r9d, r11d + add r1d, leveloffd + cmp r9d, 1 + jg .level_gt1 + xor r2d, r2d + CALL_CABAC + movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL] + jmp .level_sign +.level_gt1: + mov r2d, 1 + CALL_CABAC + movzx r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL] + add r14d, leveloffd + cmp r9d, 15 + mov r12d, 15 + cmovl r12d, r9d + sub r12d, 2 + jz .level_eq2 +.level_gt1_loop: + mov r1d, r14d + mov r2d, 1 + CALL_CABAC + dec r12d + jg .level_gt1_loop + cmp r9d, 15 + jge .level_bypass +.level_eq2: + mov r1d, r14d + xor r2d, r2d + CALL_CABAC + jmp .level_gt1_end +.level_bypass: + lea r2d, [r9d-15] + xor r1d, r1d + push r0 +; we could avoid this if we implemented it in asm, but I don't feel like that +; right now. +%if UNIX64 + push r7 + push r8 +%else + sub rsp, 32 ; shadow space +%endif + call cabac_encode_ue_bypass +%if UNIX64 + pop r8 + pop r7 +%else + add rsp, 32 +%endif + pop r0 +.level_gt1_end: + movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL] +.level_sign: + mov r1d, r11d +%if cpuflag(bmi2) + call cabac_encode_bypass_bmi2 +%else + call cabac_encode_bypass_asm +%endif +%if WIN64 + mov r0, r3 +%endif + dec coeffidxd + jge .level_loop + ADD rsp, pad + RET +%endmacro + +%if ARCH_X86_64 +INIT_XMM sse2 +CABAC_RESIDUAL coeff_last_sse2 +INIT_XMM sse2,lzcnt +CABAC_RESIDUAL coeff_last_sse2_lzcnt +INIT_XMM avx2,bmi2 +CABAC_RESIDUAL coeff_last_avx2_lzcnt +%endif
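One detail worth calling out in the bmi2 path above: the renormalization shift is no longer a table lookup but lzcnt(range) - 23, which works because the CABAC range only needs to be shifted back up into [0x100, 0x200). A standalone check of that identity over all 9-bit range values (the loop-based lzcnt is just a portable stand-in for the instruction):

#include <stdio.h>

static int naive_shift( unsigned r )   /* shifts needed to reach >= 0x100 */
{
    int s = 0;
    while( r < 0x100 )
    {
        r <<= 1;
        s++;
    }
    return s;
}

static int lzcnt32( unsigned r )
{
    int n = 0;
    for( unsigned mask = 1u << 31; !(r & mask); mask >>= 1 )
        n++;
    return n;
}

int main( void )
{
    for( unsigned r = 1; r < 512; r++ )
        if( naive_shift( r ) != lzcnt32( r ) - 23 )
            printf( "mismatch at %u\n", r );
    return 0;
}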
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/const-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/const-a.asm
Changed
@@ -26,39 +26,53 @@ %include "x86inc.asm" -SECTION_RODATA +SECTION_RODATA 32 + +const pb_1, times 32 db 1 +const hsub_mul, times 16 db 1, -1 +const pw_1, times 16 dw 1 +const pw_16, times 16 dw 16 +const pw_32, times 16 dw 32 +const pw_512, times 16 dw 512 +const pw_00ff, times 16 dw 0x00ff +const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1) +const pd_1, times 8 dd 1 +const deinterleave_shufd, dd 0,4,1,5,2,6,3,7 +const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 +const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 const pb_01, times 8 db 0,1 const pb_0, times 16 db 0 const pb_a1, times 16 db 0xa1 -const pb_1, times 16 db 1 const pb_3, times 16 db 3 -const hsub_mul, times 8 db 1, -1 const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 -const pw_1, times 8 dw 1 const pw_2, times 8 dw 2 const pw_m2, times 8 dw -2 const pw_4, times 8 dw 4 const pw_8, times 8 dw 8 -const pw_16, times 8 dw 16 -const pw_32, times 8 dw 32 const pw_64, times 8 dw 64 +const pw_256, times 8 dw 256 const pw_32_0, times 4 dw 32, times 4 dw 0 const pw_8000, times 8 dw 0x8000 const pw_3fff, times 8 dw 0x3fff -const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1) const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1 const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1 const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0 -const pd_1, times 4 dd 1 const pd_32, times 4 dd 32 const pd_1024, times 4 dd 1024 const pd_ffff, times 4 dd 0xffff -const pw_00ff, times 8 dw 0x00ff const pw_ff00, times 8 dw 0xff00 +const popcnt_table +%assign x 0 +%rep 256 +; population count +db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1) +%assign x x+1 +%endrep + const sw_64, dd 64
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/cpu-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/cpu-a.asm
Changed
@@ -66,7 +66,27 @@ mov [r4], edx RET -%if ARCH_X86_64 == 0 +%if ARCH_X86_64 + +;----------------------------------------------------------------------------- +; void stack_align( void (*func)(void*), void *arg ); +;----------------------------------------------------------------------------- +cglobal stack_align + push rbp + mov rbp, rsp +%if WIN64 + sub rsp, 32 ; shadow space +%endif + and rsp, ~31 + mov rax, r0 + mov r0, r1 + mov r1, r2 + mov r2, r3 + call rax + leave + ret + +%else ;----------------------------------------------------------------------------- ; int cpu_cpuid_test( void ) @@ -94,14 +114,11 @@ popfd ret -;----------------------------------------------------------------------------- -; void stack_align( void (*func)(void*), void *arg ); -;----------------------------------------------------------------------------- cglobal stack_align push ebp mov ebp, esp sub esp, 12 - and esp, ~15 + and esp, ~31 mov ecx, [ebp+8] mov edx, [ebp+12] mov [esp], edx @@ -165,7 +182,10 @@ %endif push rbp mov rbp, rsp - and rsp, ~15 +%if WIN64 + sub rsp, 32 ; shadow space +%endif + and rsp, ~31 call intel_cpu_indicator_init leave %if ARCH_X86_64
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/dct-64.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/dct-64.asm
Changed
@@ -311,6 +311,42 @@ INIT_XMM xop DCT_SUB8 +INIT_YMM avx2 +cglobal sub16x16_dct8, 3,3,10 + add r0, 128 + add r2, 4*FDEC_STRIDE + call .sub16x8_dct8 + add r0, 256 + add r1, FENC_STRIDE*8 + add r2, FDEC_STRIDE*8 + call .sub16x8_dct8 + RET +.sub16x8_dct8: + LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1 + LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3 + LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5 + LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7 + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 + TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 + mova [r0-0x80+0x00], xm0 + vextracti128 [r0+0x00], m0, 1 + mova [r0-0x80+0x10], xm1 + vextracti128 [r0+0x10], m1, 1 + mova [r0-0x80+0x20], xm2 + vextracti128 [r0+0x20], m2, 1 + mova [r0-0x80+0x30], xm3 + vextracti128 [r0+0x30], m3, 1 + mova [r0-0x80+0x40], xm4 + vextracti128 [r0+0x40], m4, 1 + mova [r0-0x80+0x50], xm5 + vextracti128 [r0+0x50], m5, 1 + mova [r0-0x80+0x60], xm6 + vextracti128 [r0+0x60], m6, 1 + mova [r0-0x80+0x70], xm7 + vextracti128 [r0+0x70], m7, 1 + ret + ;----------------------------------------------------------------------------- ; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- @@ -390,4 +426,5 @@ ADD8x8 INIT_XMM avx ADD8x8 + %endif ; !HIGH_BIT_DEPTH
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/dct-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/dct-a.asm
Changed
@@ -30,7 +30,7 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15 @@ -39,8 +39,6 @@ pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7 pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9 pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15 -pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 -pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 pb_scan8framet1: SHUFFLE_MASK_W 0, 1, 6, 7, 8, 9, 13, 14 pb_scan8framet2: SHUFFLE_MASK_W 2 , 3, 4, 7, 9, 15, 10, 14 @@ -74,6 +72,7 @@ cextern pw_32_0 cextern pw_32 +cextern pw_512 cextern pw_8000 cextern pw_pixel_max cextern hsub_mul @@ -83,6 +82,9 @@ cextern pd_32 cextern pw_ppppmmmm cextern pw_pmpmpmpm +cextern deinterleave_shufd +cextern pb_unpackbd1 +cextern pb_unpackbd2 %macro WALSH4_1D 6 SUMSUB_BADC %1, %5, %4, %3, %2, %6 @@ -377,6 +379,135 @@ ADD4x4 INIT_XMM avx ADD4x4 + +%macro STOREx2_AVX2 9 + movq xm%3, [r0+%5*FDEC_STRIDE] + vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1 + movq xm%4, [r0+%7*FDEC_STRIDE] + vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1 + punpcklbw m%3, m%9 + punpcklbw m%4, m%9 + psraw m%1, 6 + psraw m%2, 6 + paddsw m%1, m%3 + paddsw m%2, m%4 + packuswb m%1, m%2 + vextracti128 xm%2, m%1, 1 + movq [r0+%5*FDEC_STRIDE], xm%1 + movq [r0+%6*FDEC_STRIDE], xm%2 + movhps [r0+%7*FDEC_STRIDE], xm%1 + movhps [r0+%8*FDEC_STRIDE], xm%2 +%endmacro + +INIT_YMM avx2 +cglobal add8x8_idct, 2,3,8 + add r0, 4*FDEC_STRIDE + pxor m7, m7 + TAIL_CALL .skip_prologue, 0 +global current_function %+ .skip_prologue +.skip_prologue: + ; TRANSPOSE4x4Q + mova xm0, [r1+ 0] + mova xm1, [r1+32] + mova xm2, [r1+16] + mova xm3, [r1+48] + vinserti128 m0, m0, [r1+ 64], 1 + vinserti128 m1, m1, [r1+ 96], 1 + vinserti128 m2, m2, [r1+ 80], 1 + vinserti128 m3, m3, [r1+112], 1 + SBUTTERFLY qdq, 0, 1, 4 + SBUTTERFLY qdq, 2, 3, 4 + IDCT4_1D w,0,1,2,3,4,5 + TRANSPOSE2x4x4W 0,1,2,3,4 + paddw m0, [pw_32] + IDCT4_1D w,0,1,2,3,4,5 + STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7 + STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7 + ret + +; 2xdst, 2xtmp, 4xsrcrow, 1xzero +%macro LOAD_DIFF8x2_AVX2 9 + movq xm%1, [r1+%5*FENC_STRIDE] + movq xm%2, [r1+%6*FENC_STRIDE] + vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1 + vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1 + punpcklbw m%1, m%9 + punpcklbw m%2, m%9 + movq xm%3, [r2+(%5-4)*FDEC_STRIDE] + movq xm%4, [r2+(%6-4)*FDEC_STRIDE] + vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1 + vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1 + punpcklbw m%3, m%9 + punpcklbw m%4, m%9 + psubw m%1, m%3 + psubw m%2, m%4 +%endmacro + +; 4x src, 1x tmp +%macro STORE8_DCT_AVX2 5 + SBUTTERFLY qdq, %1, %2, %5 + SBUTTERFLY qdq, %3, %4, %5 + mova [r0+ 0], xm%1 + mova [r0+ 16], xm%3 + mova [r0+ 32], xm%2 + mova [r0+ 48], xm%4 + vextracti128 [r0+ 64], m%1, 1 + vextracti128 [r0+ 80], m%3, 1 + vextracti128 [r0+ 96], m%2, 1 + vextracti128 [r0+112], m%4, 1 +%endmacro + +%macro STORE16_DCT_AVX2 5 + SBUTTERFLY qdq, %1, %2, %5 + SBUTTERFLY qdq, %3, %4, %5 + mova [r0+ 0-128], xm%1 + mova [r0+16-128], xm%3 + mova [r0+32-128], xm%2 + mova [r0+48-128], xm%4 + vextracti128 [r0+ 0], m%1, 1 + vextracti128 [r0+16], m%3, 1 + vextracti128 [r0+32], m%2, 1 + vextracti128 [r0+48], m%4, 1 +%endmacro + +INIT_YMM avx2 +cglobal sub8x8_dct, 3,3,7 + pxor m6, m6 + add r2, 4*FDEC_STRIDE + LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6 + LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6 + DCT4_1D 0, 1, 2, 3, 4 + TRANSPOSE2x4x4W 0, 1, 2, 3, 4 + 
DCT4_1D 0, 1, 2, 3, 4 + STORE8_DCT_AVX2 0, 1, 2, 3, 4 + RET + +INIT_YMM avx2 +cglobal sub16x16_dct, 3,3,6 + add r0, 128 + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + add r0, 64 + add r1, 4*FENC_STRIDE + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + add r0, 256-64 + add r1, 4*FENC_STRIDE + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + add r0, 64 + add r1, 4*FENC_STRIDE + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + RET +.sub16x4_dct: + LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1 + LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3 + DCT4_1D 0, 1, 2, 3, 4 + TRANSPOSE2x4x4W 0, 1, 2, 3, 4 + DCT4_1D 0, 1, 2, 3, 4 + STORE16_DCT_AVX2 0, 1, 2, 3, 4 + ret %endif ; HIGH_BIT_DEPTH INIT_MMX @@ -422,7 +553,7 @@ cglobal %1, 2,2,11 pxor m7, m7 %endif -%if mmsize==16 && %3!=256 +%if mmsize>=16 && %3!=256 add r0, 4*FDEC_STRIDE %endif .skip_prologue: @@ -497,6 +628,9 @@ SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11 SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11 SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11 + +INIT_YMM +ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0 %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH @@ -607,10 +741,9 @@ movh m0, [r1] pxor m1, m1 add r0, FDEC_STRIDE*4 - paddw m0, [pw_32] - psraw m0, 6 + pmulhrsw m0, [pw_512] psubw m1, m0 - mova m5, [pb_idctdc_unpack] + mova m5, [pb_unpackbd1] packuswb m0, m0 packuswb m1, m1 pshufb m0, m5 @@ -705,11 +838,10 @@ mova m0, [r1] add r1, 16 pxor m1, m1 - paddw m0, [pw_32] - psraw m0, 6 + pmulhrsw m0, [pw_512] psubw m1, m0 - mova m5, [ pb_idctdc_unpack] - mova m6, [pb_idctdc_unpack2] + mova m5, [pb_unpackbd1] + mova m6, [pb_unpackbd2] packuswb m0, m0 packuswb m1, m1 pshufb m2, m0, m6 @@ -726,6 +858,43 @@ INIT_XMM avx ADD16x16 +%macro ADD_DC_AVX2 3 + mova xm4, [r0+FDEC_STRIDE*0+%3] + mova xm5, [r0+FDEC_STRIDE*1+%3] + vinserti128 m4, m4, [r2+FDEC_STRIDE*0+%3], 1 + vinserti128 m5, m5, [r2+FDEC_STRIDE*1+%3], 1 + paddusb m4, %1 + paddusb m5, %1 + psubusb m4, %2 + psubusb m5, %2 + mova [r0+FDEC_STRIDE*0+%3], xm4 + mova [r0+FDEC_STRIDE*1+%3], xm5 + vextracti128 [r2+FDEC_STRIDE*0+%3], m4, 1 + vextracti128 [r2+FDEC_STRIDE*1+%3], m5, 1 +%endmacro + +INIT_YMM avx2 +cglobal add16x16_idct_dc, 2,3,6 + add r0, FDEC_STRIDE*4 + mova m0, [r1] + pxor m1, m1 + pmulhrsw m0, [pw_512] + psubw m1, m0 + mova m4, [pb_unpackbd1] + mova m5, [pb_unpackbd2] + packuswb m0, m0 + packuswb m1, m1 + pshufb m2, m0, m4 ; row0, row2 + pshufb m3, m1, m4 ; row0, row2 + pshufb m0, m5 ; row1, row3 + pshufb m1, m5 ; row1, row3 + lea r2, [r0+FDEC_STRIDE*8] + ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-4 + ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-2 + ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 0 + ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 2 + RET + %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- @@ -1608,4 +1777,42 @@ ZIGZAG_8x8_CAVLC INIT_XMM avx ZIGZAG_8x8_CAVLC + +INIT_YMM avx2 +cglobal zigzag_interleave_8x8_cavlc, 3,3,6 + mova m0, [r1+ 0] + mova m1, [r1+32] + mova m2, [r1+64] + mova m3, [r1+96] + mova m5, [deinterleave_shufd] + SBUTTERFLY wd, 0, 1, 4 + SBUTTERFLY wd, 2, 3, 4 + SBUTTERFLY wd, 0, 1, 4 + SBUTTERFLY wd, 2, 3, 4 + vpermd m0, m5, m0 + vpermd m1, m5, m1 + vpermd m2, m5, m2 + vpermd m3, m5, m3 + mova [r0+ 0], xm0 + mova [r0+ 16], xm2 + vextracti128 [r0+ 32], m0, 1 + vextracti128 [r0+ 48], m2, 1 + mova [r0+ 64], xm1 + mova [r0+ 80], xm3 + vextracti128 [r0+ 96], m1, 1 + vextracti128 [r0+112], m3, 1 + + packsswb m0, m2 ; nnz0, nnz1 + packsswb m1, m3 ; nnz2, nnz3 + packsswb m0, m1 ; {nnz0,nnz2}, {nnz1,nnz3} + vpermq m0, m0, 
q3120 ; {nnz0,nnz1}, {nnz2,nnz3} + pxor m5, m5 + pcmpeqq m0, m5 + pmovmskb r0d, m0 + not r0d + and r0d, 0x01010101 + mov [r2+0], r0w + shr r0d, 16 + mov [r2+8], r0w + RET %endif ; !HIGH_BIT_DEPTH
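The add*_idct_dc and ADD_DC kernels above replace the explicit round-and-shift (paddw with pw_32, then psraw by 6) with a single pmulhrsw against pw_512. The two forms are bit-identical for 16-bit DC values, since pmulhrsw computes (a*b + 0x4000) >> 15 and with b = 512 that collapses to (a + 32) >> 6. A minimal scalar sketch of the equivalence (plain C, helper names are mine and not from the x264 tree; an arithmetic right shift of negatives is assumed, as the asm relies on):

#include <assert.h>
#include <stdint.h>

/* Scalar model of PMULHRSW: (a*b + 0x4000) >> 15 on signed 16-bit inputs. */
static int16_t pmulhrsw( int16_t a, int16_t b )
{
    return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
}

int main(void)
{
    /* With b = 512 the rounded high multiply equals the old (dc + 32) >> 6
     * for every 16-bit dc value. */
    for( int32_t dc = -32768; dc <= 32767; dc++ )
        assert( pmulhrsw( (int16_t)dc, 512 ) == ((dc + 32) >> 6) );
    return 0;
}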
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/dct.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/dct.h
Changed
@@ -40,6 +40,8 @@ void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); @@ -56,14 +58,17 @@ void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] ); void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] ); void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] ); +void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] ); void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] ); void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] ); +void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] ); void x264_add8x8_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [ 4] ); void x264_add16x16_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [16] ); void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] ); void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] ); void x264_add8x8_idct_dc_avx ( pixel *p_dst, dctcoef dct [ 4] ); void x264_add16x16_idct_dc_avx ( pixel *p_dst, dctcoef dct [16] ); +void x264_add16x16_idct_dc_avx2 ( uint8_t *p_dst, int16_t dct [16] ); void x264_dct4x4dc_mmx ( int16_t d[16] ); void x264_dct4x4dc_sse2 ( int32_t d[16] ); @@ -82,6 +87,7 @@ void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 ); void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 ); void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); +void x264_sub16x16_dct8_avx2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] ); @@ -118,5 +124,6 @@ void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz ); void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz ); void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz ); #endif
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/deblock-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/deblock-a.asm
Changed
@@ -28,8 +28,10 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 +load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15 +insert_top_shuf: dd 0,1,4,5,7,2,3,6 transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15 SECTION .text @@ -42,6 +44,7 @@ cextern pw_4 cextern pw_00ff cextern pw_pixel_max +cextern pb_unpackbd1 %if HIGH_BIT_DEPTH ; out: %4 = |%1-%2|-%3 @@ -162,14 +165,12 @@ ;----------------------------------------------------------------------------- ; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_luma, 5,5,8 - %assign pad 5*mmsize+12-(stack_offset&15) +cglobal deblock_v_luma, 5,5,8,0-5*mmsize %define tcm [rsp] %define ms1 [rsp+mmsize] %define ms2 [rsp+mmsize*2] %define am [rsp+mmsize*3] %define bm [rsp+mmsize*4] - SUB rsp, pad add r1, r1 LOAD_AB m4, m5, r2d, r3d mov r3, 32/mmsize @@ -213,11 +214,9 @@ add r4, mmsize/8 dec r3 jg .loop - ADD rsp, pad RET -cglobal deblock_h_luma, 5,6,8 - %assign pad 7*mmsize+12-(stack_offset&15) +cglobal deblock_h_luma, 5,6,8,0-7*mmsize %define tcm [rsp] %define ms1 [rsp+mmsize] %define ms2 [rsp+mmsize*2] @@ -225,7 +224,6 @@ %define p2m [rsp+mmsize*4] %define am [rsp+mmsize*5] %define bm [rsp+mmsize*6] - SUB rsp, pad add r1, r1 LOAD_AB m4, m5, r2d, r3d mov r3, r1 @@ -302,7 +300,6 @@ lea r2, [r2+r1*(mmsize/2)] dec r5 jg .loop - ADD rsp, pad RET %endmacro @@ -485,7 +482,6 @@ %endmacro %macro LUMA_INTRA_INIT 1 - %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) %define t0 m4 %define t1 m5 %define t2 m6 @@ -495,7 +491,6 @@ CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] %assign i i+1 %endrep - SUB rsp, pad add r1, r1 %endmacro @@ -724,7 +719,7 @@ ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_v_luma_intra, 4,7,8 +cglobal deblock_v_luma_intra, 4,7,8,0-3*mmsize LUMA_INTRA_INIT 3 lea r4, [r1*4] lea r5, [r1*3] @@ -744,13 +739,12 @@ add r4, mmsize dec r6 jg .loop - ADD rsp, pad RET ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra, 4,7,8 +cglobal deblock_h_luma_intra, 4,7,8,0-8*mmsize LUMA_INTRA_INIT 8 %if mmsize == 8 lea r4, [r1*3] @@ -785,7 +779,6 @@ dec r6 %endif jg .loop - ADD rsp, pad RET %endmacro @@ -871,6 +864,19 @@ movh %8, m4 %endmacro +; in: 8 rows of 4 bytes in %9..%10 +; out: 8 rows of 4 bytes in %1..%8 +%macro STORE_8x4B 10 + movd %1, %9 + pextrd %2, %9, 1 + pextrd %3, %9, 2 + pextrd %4, %9, 3 + movd %5, %10 + pextrd %6, %10, 1 + pextrd %7, %10, 2 + pextrd %8, %10, 3 +%endmacro + %macro TRANSPOSE4x8B_LOAD 8 TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 %endmacro @@ -925,6 +931,45 @@ ; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] %macro TRANSPOSE6x8_MEM 9 RESET_MM_PERMUTATION +%if cpuflag(avx) + ; input: + ; _ABCDEF_ + ; _GHIJKL_ + ; _MNOPQR_ + ; _STUVWX_ + ; _YZabcd_ + ; _efghij_ + ; _klmnop_ + ; _qrstuv_ + + movh m0, %1 + movh m2, %2 + movh m1, %3 + movh m3, %4 + punpcklbw m0, m2 ; __ AG BH CI DJ EK FL __ + punpcklbw m1, m3 ; __ MS NT OU PV QW RX __ + movh m2, %5 + movh m3, %6 + punpcklbw m2, m3 ; __ Ye Zf ag bh ci dj __ + movh m3, %7 + movh m4, %8 + punpcklbw m3, m4 ; __ kq lr ms nt ou pv __ + + SBUTTERFLY wd, 0, 1, 4 ; __ __ AG MS BH NT CI OU + ; DJ PV EK QW FL RX __ __ + SBUTTERFLY wd, 2, 3, 4 ; __ __ Ye kq Zf lr ag ms + ; bh nt ci ou dj pv __ __ + SBUTTERFLY dq, 0, 2, 4 ; __ __ __ __ AG MS Ye kq + ; BH NT Zf lr CI FL OU RX + SBUTTERFLY dq, 1, 3, 4 ; DJ PV bh nt EK QW Zf lr + ; FL RX dj pv __ __ __ __ + movhps [%9+0x00], m0 + movh [%9+0x10], m2 + movhps [%9+0x20], m2 + movh [%9+0x30], m1 + movhps [%9+0x40], m1 + movh [%9+0x50], m3 +%else movq m0, %1 movq m1, %2 movq m2, %3 @@ -951,13 +996,41 @@ movq [%9+0x30], m1 movq [%9+0x40], m5 movq [%9+0x50], m3 +%endif RESET_MM_PERMUTATION %endmacro + ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 RESET_MM_PERMUTATION +%if cpuflag(avx) + movh m0, %1 + movh m4, %2 + movh m1, %3 + movh m5, %4 + movh m2, %5 + movh m3, %7 + punpcklbw m0, m4 + punpcklbw m1, m5 + movh m4, %6 + movh m5, %8 + punpcklbw m2, m4 + punpcklbw m3, m5 + SBUTTERFLY wd, 0, 1, 4 + SBUTTERFLY wd, 2, 3, 4 + SBUTTERFLY dq, 0, 2, 4 + SBUTTERFLY dq, 1, 3, 4 + movh %9, m0 + movhps %10, m0 + movh %11, m2 + movhps %12, m2 + movh %13, m1 + movhps %14, m1 + movh %15, m3 + movhps %16, m3 +%else movq m0, %1 movq m1, %2 movq m2, %3 @@ -988,6 +1061,7 @@ movq %12, m0 movq %15, m3 movq %16, m7 +%endif RESET_MM_PERMUTATION %endmacro @@ -1009,31 +1083,42 @@ ; out: %4 = |%1-%2|>%3 ; clobbers: %5 -%macro DIFF_GT2 5 -%if ARCH_X86_64 - psubusb %5, %2, %1 +%macro DIFF_GT2 5-6 +%if %0<6 psubusb %4, %1, %2 + psubusb %5, %2, %1 %else - mova %5, %2 mova %4, %1 - psubusb %5, %1 + mova %5, %2 psubusb %4, %2 + psubusb %5, %1 %endif psubusb %5, %3 psubusb %4, %3 pcmpeqb %4, %5 %endmacro -; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 +; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha %2=beta ; out: m5=beta-1, m7=mask, %3=alpha-1 ; clobbers: m4,m6 %macro LOAD_MASK 2-3 +%if cpuflag(ssse3) movd m4, %1 movd m5, %2 + pxor m6, m6 + pshufb m4, m6 + pshufb m5, m6 +%else + movd m4, %1 + movd m5, %2 + punpcklbw m4, m4 + punpcklbw m5, m5 SPLATW m4, m4 SPLATW m5, m5 - packuswb m4, m4 ; 16x alpha-1 - packuswb m5, m5 ; 16x beta-1 +%endif + mova m6, [pb_1] + psubusb m4, m6 ; alpha - 1 + psubusb m5, m6 ; alpha - 2 %if %0>2 mova %3, m4 %endif @@ -1096,9 +1181,7 @@ cglobal deblock_v_luma, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] - dec r2d ; alpha-1 neg r4 - dec r3d ; beta-1 add r4, r0 ; pix-3*stride mova m0, [r4+r1] ; p1 @@ -1107,21 +1190,26 @@ mova m3, [r0+r1] ; q1 LOAD_MASK r2d, r3d +%if cpuflag(avx) + pshufb m8, [pb_unpackbd1] + pblendvb m9, m7, m6, m8 +%else punpcklbw m8, m8 punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] pcmpeqb m9, m9 pcmpeqb m9, m8 pandn m9, m7 +%endif pand m8, m9 - movdqa m3, [r4] ; p2 + mova m3, [r4] ; p2 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m9 - psubb m7, m8, m6 + psubb m7, m8, m6 ; tc++ pand m6, m8 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 - movdqa m4, [r0+2*r1] ; q2 + mova m4, [r0+2*r1] ; q2 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 pand m6, m9 pand m8, m6 @@ -1137,16 +1225,19 @@ 
;----------------------------------------------------------------------------- ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- + +%if cpuflag(avx) +INIT_XMM cpuname +%else INIT_MMX cpuname -cglobal deblock_h_luma, 5,9 +%endif +cglobal deblock_h_luma, 5,9,0,0x60+16*WIN64 lea r8, [r1*3] lea r6, [r0-4] lea r5, [r0-4+r8] %if WIN64 - sub rsp, 0x98 - %define pix_tmp rsp+0x30 + %define pix_tmp rsp+0x30 ; shadow space + r4 %else - sub rsp, 0x68 %define pix_tmp rsp %endif @@ -1170,6 +1261,22 @@ ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) add r6, 2 add r5, 2 +%if cpuflag(sse4) + mova m0, [pix_tmp+0x10] + mova m1, [pix_tmp+0x20] + mova m2, [pix_tmp+0x30] + mova m3, [pix_tmp+0x40] + SBUTTERFLY bw, 0, 1, 4 + SBUTTERFLY bw, 2, 3, 4 + SBUTTERFLY wd, 0, 2, 4 + SBUTTERFLY wd, 1, 3, 4 + STORE_8x4B PASS8ROWS(r6, r5, r7, r8), m1, m3 + shl r7, 3 + sub r6, r7 + sub r5, r7 + shr r7, 3 + STORE_8x4B PASS8ROWS(r6, r5, r7, r8), m0, m2 +%else movq m0, [pix_tmp+0x18] movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] @@ -1185,12 +1292,8 @@ movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) - -%if WIN64 - add rsp, 0x98 -%else - add rsp, 0x68 %endif + RET %endmacro @@ -1207,9 +1310,7 @@ ;----------------------------------------------------------------------------- cglobal deblock_%1_luma, 5,5,8,2*%2 lea r4, [r1*3] - dec r2 ; alpha-1 neg r4 - dec r3 ; beta-1 add r4, r0 ; pix-3*stride mova m0, [r4+r1] ; p1 @@ -1220,12 +1321,18 @@ mov r3, r4mp movd m4, [r3] ; tc0 +%if cpuflag(avx) + pshufb m4, [pb_unpackbd1] + mova [esp+%2], m4 ; tc + pblendvb m4, m7, m6, m4 +%else punpcklbw m4, m4 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] mova [esp+%2], m4 ; tc pcmpeqb m3, m3 pcmpgtb m4, m3 pand m4, m7 +%endif mova [esp], m4 ; mask mova m3, [r4] ; p2 @@ -1254,7 +1361,12 @@ ;----------------------------------------------------------------------------- ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- + +%if cpuflag(avx) +INIT_XMM cpuname +%else INIT_MMX cpuname +%endif cglobal deblock_h_luma, 0,5,8,0x60+HAVE_ALIGNED_STACK*12 mov r0, r0mp mov r3, r1m @@ -1289,6 +1401,20 @@ sub r0, 2 lea r1, [r0+r4] +%if cpuflag(avx) + mova m0, [pix_tmp+0x10] + mova m1, [pix_tmp+0x20] + mova m2, [pix_tmp+0x30] + mova m3, [pix_tmp+0x40] + SBUTTERFLY bw, 0, 1, 4 + SBUTTERFLY bw, 2, 3, 4 + SBUTTERFLY wd, 0, 2, 4 + SBUTTERFLY wd, 1, 3, 4 + STORE_8x4B PASS8ROWS(r0, r1, r3, r4), m0, m2 + lea r0, [r0+r3*8] + lea r1, [r1+r3*8] + STORE_8x4B PASS8ROWS(r0, r1, r3, r4), m1, m3 +%else movq m0, [pix_tmp+0x10] movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] @@ -1302,6 +1428,7 @@ movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) +%endif RET %endmacro ; DEBLOCK_LUMA @@ -1429,7 +1556,11 @@ %define t5 m11 %define mask0 m12 %define mask1p m13 +%if WIN64 + %define mask1q [rsp] +%else %define mask1q [rsp-24] +%endif %define mpb_0 m14 %define mpb_1 m15 %else @@ -1448,14 +1579,10 @@ ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_%1_luma_intra, 4,6,16,ARCH_X86_64*0x50-0x50 +cglobal 
deblock_%1_luma_intra, 4,6,16,0-(1-ARCH_X86_64)*0x50-WIN64*0x10 lea r4, [r1*4] lea r5, [r1*3] ; 3*stride - dec r2d ; alpha-1 - jl .end neg r4 - dec r3d ; beta-1 - jl .end add r4, r0 ; pix-4*stride mova p1, [r4+2*r1] mova p0, [r4+r5] @@ -1470,9 +1597,9 @@ pavgb t5, mpb_1 ; alpha/4+1 movdqa p2, [r4+r1] movdqa q2, [r0+2*r1] - DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 - DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 - DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 + DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 + DIFF_GT2 p0, p2, m5, t2, t5, 1 ; mask1 = |p2-p0| > beta-1 + DIFF_GT2 q0, q2, m5, t4, t5, 1 ; t4 = |q2-q0| > beta-1 pand t0, mask0 pand t4, t0 pand t2, t0 @@ -1484,12 +1611,12 @@ mova mask0, m7 pavgb m4, [pb_0] pavgb m4, [pb_1] ; alpha/4+1 - DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 + DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 pand m6, mask0 - DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 + DIFF_GT2 p0, p2, m5, m4, m7, 1 ; m4 = |p2-p0| > beta-1 pand m4, m6 mova mask1p, m4 - DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 + DIFF_GT2 q0, q2, m5, m4, m7, 1 ; m4 = |q2-q0| > beta-1 pand m4, m6 mova mask1q, m4 %endif @@ -1499,17 +1626,24 @@ .end: REP_RET +%if cpuflag(avx) +INIT_XMM cpuname +%else INIT_MMX cpuname +%endif %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra, 4,9 +cglobal deblock_h_luma_intra, 4,9,0,0x80 lea r8, [r1*3] lea r6, [r0-4] lea r5, [r0-4+r8] - sub rsp, 0x88 +%if WIN64 + %define pix_tmp rsp+0x20 ; shadow space +%else %define pix_tmp rsp +%endif ; transpose 8x16 -> tmp space TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) @@ -1530,7 +1664,6 @@ sub r5, r7 shr r7, 3 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) - add rsp, 0x88 RET %else cglobal deblock_h_luma_intra, 2,4,8,0x80 @@ -1867,8 +2000,6 @@ %if HIGH_BIT_DEPTH == 0 %macro CHROMA_V_START 0 - dec r2d ; alpha-1 - dec r3d ; beta-1 mov t5, r0 sub t5, r1 sub t5, r1 @@ -1879,8 +2010,6 @@ %endmacro %macro CHROMA_H_START 0 - dec r2d - dec r3d sub r0, 4 lea t6, [r1*3] mov t5, r0 @@ -1969,8 +2098,6 @@ ;----------------------------------------------------------------------------- %macro DEBLOCK_H_CHROMA_420_MBAFF 0 cglobal deblock_h_chroma_mbaff, 5,7,8 - dec r2d - dec r3d sub r0, 4 lea t6, [r1*3] mov t5, r0 @@ -2368,3 +2495,70 @@ DEBLOCK_STRENGTH_XMM INIT_XMM avx DEBLOCK_STRENGTH_XMM + +%macro LOAD_BYTES_YMM 1 + movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX + pshufb m0, [load_bytes_shuf] ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX + mova m2, [insert_top_shuf] + vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2 + vpermd m0, m2, m0 ; EFGH JKLM OPQR TUVW ____ FGHI KLMN PQRS + vpbroadcastd m2, [%1-8] ; ABCD .... 
+ vpblendd m0, m0, m2, 00010000b ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS +%endmacro + +INIT_YMM avx2 +cglobal deblock_strength, 6,6,7 + ; Prepare mv comparison register + shl r4d, 8 + add r4d, 3 - (1<<8) + movd xm6, r4d + vpbroadcastw m6, xm6 + pxor m5, m5 ; bs0,bs1 + +.lists: + ; Check refs + LOAD_BYTES_YMM ref + pxor m0, m1 + por m5, m0 + + ; Check mvs + movu xm0, [mv-4+4*8*0] + vinserti128 m0, m0, [mv+4*8*-1], 1 + vbroadcasti128 m2, [mv+4*8* 0] + vinserti128 m1, m2, [mv-4+4*8*1], 0 + vbroadcasti128 m3, [mv+4*8* 1] + psubw m0, m2 + psubw m1, m3 + + vinserti128 m2, m3, [mv-4+4*8*2], 0 + vbroadcasti128 m4, [mv+4*8* 2] + vinserti128 m3, m4, [mv-4+4*8*3], 0 + psubw m2, m4 + vbroadcasti128 m4, [mv+4*8* 3] + psubw m3, m4 + packsswb m0, m1 + packsswb m2, m3 + pabsb m0, m0 + pabsb m2, m2 + psubusb m0, m6 + psubusb m2, m6 + packsswb m0, m2 + por m5, m0 + + add r1, 40 + add r2, 4*8*5 + dec r5d + jge .lists + + ; Check nnz + LOAD_BYTES_YMM nnz + por m0, m1 + mova m6, [pb_1] + pminub m0, m6 + pminub m5, m6 ; mv ? 1 : 0 + paddb m0, m0 ; nnz ? 2 : 0 + pmaxub m5, m0 + vextracti128 [bs1], m5, 1 + pshufb xm5, [transpose_shuf] + mova [bs0], xm5 + RET
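The new AVX2 deblock_strength packs its motion-vector thresholds as bytes, ((mvy_limit - 1) << 8) | 3 broadcast across the register, so one saturating psubusb leaves a non-zero byte exactly when |mvd| reaches 4 quarter-pels horizontally or mvy_limit vertically; non-zero coefficient flags then promote the strength to 2 via paddb/pmaxub. A simplified per-edge model of that decision (my own C sketch with hypothetical names, not the reference function in x264; the per-list loop and MBAFF handling are left out):

#include <stdint.h>
#include <stdlib.h>

/* bs = 2 if either block has coded coefficients, else 1 on a reference or
 * motion-vector mismatch, else 0. */
static int edge_strength( int nnz_a, int nnz_b, int ref_a, int ref_b,
                          const int16_t mv_a[2], const int16_t mv_b[2], int mvy_limit )
{
    if( nnz_a | nnz_b )
        return 2;                               /* "nnz ? 2 : 0" via paddb m0, m0 */
    if( ref_a != ref_b ||
        abs( mv_a[0] - mv_b[0] ) >= 4 ||        /* low threshold byte is 3 */
        abs( mv_a[1] - mv_b[1] ) >= mvy_limit ) /* high threshold byte is mvy_limit-1 */
        return 1;                               /* "mv ? 1 : 0", merged with pmaxub */
    return 0;
}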
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/mc-a.asm
Changed
@@ -34,7 +34,7 @@ SECTION_RODATA 32 -ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 ch_shuf_adj: times 8 db 0 times 8 db 2 times 8 db 4 @@ -49,10 +49,12 @@ cextern pw_8 cextern pw_32 cextern pw_64 +cextern pw_512 cextern pw_00ff cextern pw_pixel_max cextern sw_64 cextern pd_32 +cextern deinterleave_shufd ;============================================================================= ; implicit weighted biprediction @@ -141,8 +143,7 @@ movh m1, %2 punpcklbw m0, m1 pmaddubsw m0, m3 - paddw m0, m4 - psraw m0, 6 + pmulhrsw m0, m4 %endmacro %macro BIWEIGHT_START_SSSE3 0 @@ -151,9 +152,13 @@ sub t7d, t6d shl t7d, 8 add t6d, t7d - movd m3, t6d - mova m4, [pw_32] + mova m4, [pw_512] + movd xm3, t6d +%if cpuflag(avx2) + vpbroadcastw m3, xm3 +%else SPLATW m3, m3 ; weight_dst,src +%endif %endmacro %if HIGH_BIT_DEPTH @@ -244,6 +249,25 @@ INIT_XMM ssse3 AVG_WEIGHT 8, 7 AVG_WEIGHT 16, 7 + +INIT_YMM avx2 +cglobal pixel_avg_weight_w16 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu xm0, [t2] + movu xm1, [t4] + vinserti128 m0, m0, [t2+t3], 1 + vinserti128 m1, m1, [t4+t5], 1 + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], xm0 + vextracti128 [t0+t1], m0, 1 + AVG_END %endif ;HIGH_BIT_DEPTH ;============================================================================= @@ -274,7 +298,7 @@ %endmacro ; src, dst, width -%macro WEIGHT_TWO_ROW 3 +%macro WEIGHT_TWO_ROW 4 %assign x 0 %rep (%3+mmsize/2-1)/(mmsize/2) %if %3-x/2 <= 4 && mmsize == 16 @@ -298,16 +322,21 @@ %else ; !HIGH_BIT_DEPTH %macro WEIGHT_START 1 +%if cpuflag(avx2) + vbroadcasti128 m3, [r4] + vbroadcasti128 m4, [r4+16] +%else mova m3, [r4] mova m4, [r4+16] %if notcpuflag(ssse3) movd m5, [r4+32] %endif +%endif pxor m2, m2 %endmacro -; src1, src2, dst1, dst2 -%macro WEIGHT_ROWx2 4 +; src1, src2, dst1, dst2, fast +%macro WEIGHT_ROWx2 5 movh m0, [%1 ] movh m1, [%1+mmsize/2] movh m6, [%2 ] @@ -317,10 +346,12 @@ punpcklbw m6, m2 punpcklbw m7, m2 %if cpuflag(ssse3) +%if %5==0 psllw m0, 7 psllw m1, 7 psllw m6, 7 psllw m7, 7 +%endif pmulhrsw m0, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 @@ -349,15 +380,54 @@ mova [%4], m6 %endmacro -; src1, src2, dst1, dst2, width -%macro WEIGHT_COL 5 +; src1, src2, dst1, dst2, width, fast +%macro WEIGHT_COL 6 +%if cpuflag(avx2) +%if %5==16 + movu xm0, [%1] + vinserti128 m0, m0, [%2], 1 + punpckhbw m1, m0, m2 + punpcklbw m0, m0, m2 +%if %6==0 + psllw m0, 7 + psllw m1, 7 +%endif + pmulhrsw m0, m3 + pmulhrsw m1, m3 + paddw m0, m4 + paddw m1, m4 + packuswb m0, m1 + mova [%3], xm0 + vextracti128 [%4], m0, 1 +%else + movq xm0, [%1] + vinserti128 m0, m0, [%2], 1 + punpcklbw m0, m2 +%if %6==0 + psllw m0, 7 +%endif + pmulhrsw m0, m3 + paddw m0, m4 + packuswb m0, m0 + vextracti128 xm1, m0, 1 +%if %5 == 8 + movq [%3], xm0 + movq [%4], xm1 +%else + movd [%3], xm0 + movd [%4], xm1 +%endif +%endif +%else movh m0, [%1] movh m1, [%2] punpcklbw m0, m2 punpcklbw m1, m2 %if cpuflag(ssse3) +%if %6==0 psllw m0, 7 psllw m1, 7 +%endif pmulhrsw m0, m3 pmulhrsw m1, m3 paddw m0, m4 @@ -380,18 +450,22 @@ movd [%3], m0 ; width 2 can write garbage for the last 2 bytes movd [%4], m1 %endif +%endif %endmacro - ; src, dst, width -%macro WEIGHT_TWO_ROW 3 +%macro WEIGHT_TWO_ROW 4 %assign x 0 %rep %3 %if (%3-x) >= mmsize - WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x + WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4 %assign x (x+mmsize) %else - WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, %3-x - %exitrep + %assign w %3-x +%if w == 20 + 
%assign w 16 +%endif + WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4 + %assign x (x+w) %endif %if x >= %3 %exitrep @@ -409,13 +483,30 @@ cglobal mc_weight_w%1, 6,6,8 FIX_STRIDES r1, r3 WEIGHT_START %1 +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 + ; we can merge the shift step into the scale factor + ; if (m3<<7) doesn't overflow an int16_t + cmp byte [r4+1], 0 + jz .fast +%endif .loop: - WEIGHT_TWO_ROW r2, r0, %1 + WEIGHT_TWO_ROW r2, r0, %1, 0 lea r0, [r0+r1*2] lea r2, [r2+r3*2] sub r5d, 2 jg .loop RET +%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 +.fast: + psllw m3, 7 +.fastloop: + WEIGHT_TWO_ROW r2, r0, %1, 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + sub r5d, 2 + jg .fastloop + RET +%endif %endmacro INIT_MMX mmx2 @@ -437,6 +528,10 @@ WEIGHTER 8 WEIGHTER 16 WEIGHTER 20 +INIT_YMM avx2 +WEIGHTER 8 +WEIGHTER 16 +WEIGHTER 20 %endif %macro OFFSET_OP 7 @@ -531,11 +626,15 @@ mov eax, %2 cmp dword r6m, 32 jne pixel_avg_weight_w%1 %+ SUFFIX +%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads + jmp pixel_avg_w%1_avx2 +%else %if mmsize == 16 && %1 == 16 test dword r4m, 15 jz pixel_avg_w%1_sse2 %endif jmp pixel_avg_w%1_mmx2 +%endif %endmacro ;----------------------------------------------------------------------------- @@ -635,6 +734,10 @@ AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 +INIT_XMM avx2 +AVG_FUNC 16, movdqu, movdqa +AVGH 16, 16 +AVGH 16, 8 %endif ;HIGH_BIT_DEPTH @@ -657,7 +760,7 @@ .height_loop: movu m0, [r2] movu m1, [r2+r3*2] -%if mmsize == 8 +%if cpuflag(avx) || mmsize == 8 pavgw m0, [r2+r4] pavgw m1, [r2+r6] %else @@ -717,6 +820,8 @@ AVG2_W_ONE 8 AVG2_W_TWO 10, movd, movd AVG2_W_TWO 16, movu, mova +INIT_YMM avx2 +AVG2_W_ONE 16 INIT_MMX cglobal pixel_avg2_w10_mmx2, 6,7 @@ -805,27 +910,40 @@ jg .height_loop RET -INIT_XMM -cglobal pixel_avg2_w18_sse2, 6,7,6 +%macro PIXEL_AVG_W18 0 +cglobal pixel_avg2_w18, 6,7 sub r4, r2 .height_loop: movu m0, [r2+ 0] + movd xm2, [r2+32] +%if mmsize == 32 + pavgw m0, [r2+r4+ 0] + movd xm1, [r2+r4+32] + pavgw xm2, xm1 +%else movu m1, [r2+16] - movh m2, [r2+32] movu m3, [r2+r4+ 0] movu m4, [r2+r4+16] - movh m5, [r2+r4+32] + movd m5, [r2+r4+32] pavgw m0, m3 pavgw m1, m4 pavgw m2, m5 - mova [r0+ 0], m0 mova [r0+16], m1 - movh [r0+32], m2 +%endif + mova [r0+ 0], m0 + movd [r0+32], xm2 lea r2, [r2+r3*2] lea r0, [r0+r1*2] dec r5d jg .height_loop RET +%endmacro + +INIT_XMM sse2 +PIXEL_AVG_W18 +INIT_YMM avx2 +PIXEL_AVG_W18 + %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 @@ -965,6 +1083,23 @@ AVG2_W20 sse2 AVG2_W20 sse2_misalign +INIT_YMM avx2 +cglobal pixel_avg2_w20, 6,7 + sub r2, r4 + lea r6, [r2+r3] +.height_loop: + movu m0, [r4] + movu m1, [r4+r3] + pavgb m0, [r4+r2] + pavgb m1, [r4+r6] + lea r4, [r4+r3*2] + mova [r0], m0 + mova [r0+r1], m1 + lea r0, [r0+r1*2] + sub r5d, 2 + jg .height_loop + RET + ; Cacheline split code for processors with high latencies for loads ; split over cache lines. See sad-a.asm for a more detailed explanation. 
; This particular instance is complicated by the fact that src1 and src2 @@ -1172,18 +1307,18 @@ movu m1, [r2+%4*mmsize] movu m2, [r2+r3+%3*mmsize] movu m3, [r2+r3+%4*mmsize] - movu m4, [r2+r3*2+%3*mmsize] - movu m5, [r2+r3*2+%4*mmsize] - movu m6, [r2+%2+%3*mmsize] - movu m7, [r2+%2+%4*mmsize] mova [r0+%3*mmsize], m0 mova [r0+%4*mmsize], m1 mova [r0+r1+%3*mmsize], m2 mova [r0+r1+%4*mmsize], m3 - mova [r0+r1*2+%3*mmsize], m4 - mova [r0+r1*2+%4*mmsize], m5 - mova [r0+%1+%3*mmsize], m6 - mova [r0+%1+%4*mmsize], m7 + movu m0, [r2+r3*2+%3*mmsize] + movu m1, [r2+r3*2+%4*mmsize] + movu m2, [r2+%2+%3*mmsize] + movu m3, [r2+%2+%4*mmsize] + mova [r0+r1*2+%3*mmsize], m0 + mova [r0+r1*2+%4*mmsize], m1 + mova [r0+%1+%3*mmsize], m2 + mova [r0+%1+%4*mmsize], m3 %endmacro %macro COPY4 2 @@ -1216,7 +1351,7 @@ %macro MC_COPY 1 %assign %%w %1*SIZEOF_PIXEL/mmsize %if %%w > 0 -cglobal mc_copy_w%1, 5,7,8*(%%w/2) +cglobal mc_copy_w%1, 5,7 FIX_STRIDES r1, r3 lea r6, [r3*3] lea r5, [r1*3] @@ -1233,13 +1368,17 @@ INIT_MMX mmx MC_COPY 8 MC_COPY 16 -INIT_XMM sse2 +INIT_XMM sse MC_COPY 8 MC_COPY 16 -INIT_XMM aligned, sse2 +INIT_XMM aligned, sse MC_COPY 16 - - +%if HIGH_BIT_DEPTH +INIT_YMM avx +MC_COPY 16 +INIT_YMM aligned, avx +MC_COPY 16 +%endif ;============================================================================= ; prefetch @@ -1514,7 +1653,11 @@ mov t0, r0 mov t1, r1 mov t2, r3 +%if WIN64 + %define multy0 r4m +%else %define multy0 [rsp-8] +%endif mova multy0, m5 %else mov r3m, r3 @@ -1651,10 +1794,9 @@ %if ARCH_X86_64 ; too many regs for x86_32 RESET_MM_PERMUTATION %if WIN64 -%if xmm_regs_used > 6 - %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16 - %assign xmm_regs_used 6 -%endif + %assign stack_offset stack_offset - stack_size_padded + %assign stack_size_padded 0 + %assign xmm_regs_used 0 %endif .mc1dy: and t2d, 7 @@ -1781,7 +1923,11 @@ %macro MC_CHROMA_SSSE3 0 cglobal mc_chroma +%if cpuflag(avx2) MC_CHROMA_START 9 +%else + MC_CHROMA_START 10 +%endif and r5d, 7 and t2d, 7 mov t0d, r5d @@ -1792,18 +1938,18 @@ sub r5d, t2d imul t2d, t0d ; (x*255+8)*y imul r5d, t0d ; (x*255+8)*(8-y) - movd m6, t2d - movd m7, r5d + movd xm6, t2d + movd xm7, r5d %if cpuflag(cache64) mov t0d, r3d and t0d, 7 %ifdef PIC lea t1, [ch_shuf_adj] - movddup m5, [t1 + t0*4] + movddup xm5, [t1 + t0*4] %else - movddup m5, [ch_shuf_adj + t0*4] + movddup xm5, [ch_shuf_adj + t0*4] %endif - paddb m5, [ch_shuf] + paddb xm5, [ch_shuf] and r3, ~7 %else mova m5, [ch_shuf] @@ -1812,12 +1958,80 @@ movifnidn r1, r1mp movifnidn r2d, r2m movifnidn r5d, r8m +%if cpuflag(avx2) + vpbroadcastw m6, xm6 + vpbroadcastw m7, xm7 +%else SPLATW m6, m6 SPLATW m7, m7 +%endif +%if ARCH_X86_64 + %define shiftround m8 + mova m8, [pw_512] +%else + %define shiftround [pw_512] +%endif cmp dword r7m, 4 jg .width8 - movu m0, [r3] + +%if cpuflag(avx2) +.loop4: + movu xm0, [r3] + movu xm1, [r3+r4] + vinserti128 m0, m0, [r3+r4], 1 + vinserti128 m1, m1, [r3+r4*2], 1 + pshufb m0, m5 + pshufb m1, m5 + pmaddubsw m0, m7 + pmaddubsw m1, m6 + paddw m0, m1 + pmulhrsw m0, shiftround + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [r0], xm0 + movd [r0+r2], xm1 + psrldq xm0, 4 + psrldq xm1, 4 + movd [r1], xm0 + movd [r1+r2], xm1 + lea r3, [r3+r4*2] + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + sub r5d, 2 + jg .loop4 + RET +.width8: + movu xm0, [r3] + vinserti128 m0, m0, [r3+8], 1 pshufb m0, m5 +.loop8: + movu xm3, [r3+r4] + vinserti128 m3, m3, [r3+r4+8], 1 + pshufb m3, m5 + pmaddubsw m1, m0, m7 + pmaddubsw m2, m3, m6 + pmaddubsw m3, m3, m7 + + movu xm0, [r3+r4*2] + vinserti128 
m0, m0, [r3+r4*2+8], 1 + pshufb m0, m5 + pmaddubsw m4, m0, m6 + + paddw m1, m2 + paddw m3, m4 + pmulhrsw m1, shiftround + pmulhrsw m3, shiftround + packuswb m1, m3 + mova m2, [deinterleave_shufd] + vpermd m1, m2, m1 + vextracti128 xm2, m1, 1 + movq [r0], xm1 + movhps [r1], xm1 + movq [r0+r2], xm2 + movhps [r1+r2], xm2 +%else + movu m0, [r3] + pshufb m0, xm5 .loop4: movu m1, [r3+r4] pshufb m1, m5 @@ -1828,16 +2042,14 @@ pmaddubsw m2, m1, m7 pmaddubsw m1, m6 pmaddubsw m3, m6 - paddw m0, [pw_32] - paddw m2, [pw_32] paddw m1, m0 paddw m3, m2 + pmulhrsw m1, shiftround + pmulhrsw m3, shiftround mova m0, m4 - psrlw m1, 6 - psrlw m3, 6 packuswb m1, m3 movhlps m3, m1 - movd [r0], m1 + movd [r0], xm1 movd [r0+r2], m3 psrldq m1, 4 psrldq m3, 4 @@ -1849,15 +2061,14 @@ sub r5d, 2 jg .loop4 RET - .width8: movu m0, [r3] pshufb m0, m5 movu m1, [r3+8] pshufb m1, m5 %if ARCH_X86_64 - SWAP 8, 6 - %define mult1 m8 + SWAP 9, 6 + %define mult1 m9 %else mova r0m, m6 %define mult1 r0m @@ -1873,12 +2084,10 @@ pmaddubsw m1, m7 pmaddubsw m2, mult1 pmaddubsw m3, mult1 - paddw m0, [pw_32] - paddw m1, [pw_32] paddw m0, m2 paddw m1, m3 - psrlw m0, 6 - psrlw m1, 6 + pmulhrsw m0, shiftround ; x + 32 >> 6 + pmulhrsw m1, shiftround packuswb m0, m1 pshufd m0, m0, q3120 movq [r0], m0 @@ -1894,16 +2103,15 @@ pmaddubsw m6, m7 pmaddubsw m2, mult1 pmaddubsw m3, mult1 - paddw m4, [pw_32] - paddw m6, [pw_32] paddw m2, m4 paddw m3, m6 - psrlw m2, 6 - psrlw m3, 6 + pmulhrsw m2, shiftround + pmulhrsw m3, shiftround packuswb m2, m3 pshufd m2, m2, q3120 movq [r0+r2], m2 movhps [r1+r2], m2 +%endif lea r3, [r3+r4*2] lea r0, [r0+r2*2] lea r1, [r1+r2*2] @@ -1932,4 +2140,6 @@ MC_CHROMA_SSSE3 INIT_XMM avx MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64 +INIT_YMM avx2 +MC_CHROMA_SSSE3 %endif ; HIGH_BIT_DEPTH
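The comment in the new mc_weight fast path states the trick: the per-row psllw by 7 and the pmulhrsw against the weight scale commute, so when the 16-bit scale is below 256 (the cmp byte [r4+1], 0 test) the shift can be applied once to the scale (psllw m3, 7) instead of to every pixel row. A scalar check of that identity, reusing the same pmulhrsw model as in the sketch after the dct diff (hypothetical names, not x264 code):

#include <assert.h>
#include <stdint.h>

/* Scalar model of PMULHRSW, as above. */
static int16_t pmulhrsw( int16_t a, int16_t b )
{
    return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
}

int main(void)
{
    /* For 0 <= scale < 256, scale << 7 still fits in an int16_t, so shifting
     * the scale once gives the same product as shifting each pixel. */
    for( int scale = 0; scale < 256; scale++ )
        for( int pix = 0; pix < 256; pix++ )
            assert( pmulhrsw( pix << 7, scale ) == pmulhrsw( pix, scale << 7 ) );
    return 0;
}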
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-a2.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/mc-a2.asm
Changed
@@ -30,13 +30,14 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 + +filt_mul20: times 32 db 20 +filt_mul15: times 16 db 1, -5 +filt_mul51: times 16 db -5, 1 +hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 +deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 -filt_mul20: times 16 db 20 -filt_mul15: times 8 db 1, -5 -filt_mul51: times 8 db -5, 1 -hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 -deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 %if HIGH_BIT_DEPTH deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 @@ -44,6 +45,7 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 %endif +pw_1024: times 16 dw 1024 pd_16: times 4 dd 16 pd_0f: times 4 dd 0xffff @@ -64,6 +66,7 @@ cextern pw_1 cextern pw_16 cextern pw_32 +cextern pw_512 cextern pw_00ff cextern pw_3fff cextern pw_pixel_max @@ -127,19 +130,24 @@ paddw %4, %6 %endmacro -%macro FILT_PACK 4-6 b - paddw %1, %4 - paddw %2, %4 -%if %0 == 6 - psubusw %1, %6 - psubusw %2, %6 - psrlw %1, %3 - psrlw %2, %3 +%macro FILT_PACK 3-5 +%if cpuflag(ssse3) + pmulhrsw %1, %3 + pmulhrsw %2, %3 +%else + paddw %1, %3 + paddw %2, %3 +%if %0 == 5 + psubusw %1, %5 + psubusw %2, %5 + psrlw %1, %4 + psrlw %2, %4 %else - psraw %1, %3 - psraw %2, %3 + psraw %1, %4 + psraw %2, %4 %endif -%ifnidn w, %5 +%endif +%if HIGH_BIT_DEPTH == 0 packuswb %1, %2 %endif %endmacro @@ -203,7 +211,7 @@ mova [r2+r4+mmsize], m4 paddw m1, s30 paddw m4, s30 - FILT_PACK m1, m4, 5, m6, w, s10 + FILT_PACK m1, m4, m6, 5, s10 CLIPW m1, m0, m7 CLIPW m4, m0, m7 mova [r0+r4], m1 @@ -295,7 +303,7 @@ FILT_H2 m1, m2, m3, m4, m5, m6 mova m7, [pw_1] pxor m2, m2 - FILT_PACK m1, m4, 1, m7, w + FILT_PACK m1, m4, m7, 1 CLIPW m1, m2, m0 CLIPW m4, m2, m0 mova [r0+r2], m1 @@ -349,17 +357,25 @@ paddw m4, m5 paddw m1, m3 paddw m4, m6 + mova m7, [pw_1024] %else LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1 LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1 LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0 LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1 FILT_V2 m1, m2, m3, m4, m5, m6 + mova m7, [pw_16] %endif - mova m7, [pw_16] +%if mmsize==32 + mova [r2+r4*2], xm1 + mova [r2+r4*2+mmsize/2], xm4 + vextracti128 [r2+r4*2+mmsize], m1, 1 + vextracti128 [r2+r4*2+mmsize*3/2], m4, 1 +%else mova [r2+r4*2], m1 mova [r2+r4*2+mmsize], m4 - FILT_PACK m1, m4, 5, m7 +%endif + FILT_PACK m1, m4, m7, 5 movnta [r0+r4], m1 add r1, mmsize add r5, mmsize @@ -371,8 +387,8 @@ ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- -INIT_MMX -cglobal hpel_filter_c_mmx2, 3,3 +INIT_MMX mmx2 +cglobal hpel_filter_c, 3,3 add r0, r2 lea r1, [r1+r2*2] neg r2 @@ -392,7 +408,7 @@ paddw m5, [src+12] ; b1 paddw m6, [src+10] ; c1 FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 6, m7 + FILT_PACK m1, m4, m7, 6 movntq [r0+r2], m1 add r2, 8 jl .loop @@ -401,7 +417,8 @@ ;----------------------------------------------------------------------------- ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- -cglobal hpel_filter_h_mmx2, 3,3 +INIT_MMX mmx2 +cglobal hpel_filter_h, 3,3 add r0, r2 add r1, r2 neg r2 @@ -436,14 +453,12 @@ paddw m6, m7 ; a1 movq m7, 
[pw_1] FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 1, m7 + FILT_PACK m1, m4, m7, 1 movntq [r0+r2], m1 add r2, 8 jl .loop RET -INIT_XMM - %macro HPEL_C 0 ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); @@ -454,29 +469,33 @@ neg r2 %define src r1+r2*2 %ifnidn cpuname, sse2 +%if cpuflag(ssse3) + mova m7, [pw_512] +%else mova m7, [pw_32] - %define tpw_32 m7 +%endif + %define pw_rnd m7 %elif ARCH_X86_64 mova m8, [pw_32] - %define tpw_32 m8 + %define pw_rnd m8 %else - %define tpw_32 [pw_32] + %define pw_rnd [pw_32] %endif ; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer... -%if cpuflag(misalign) +%if cpuflag(misalign) || mmsize==32 .loop: movu m4, [src-4] movu m5, [src-2] - mova m6, [src] - movu m3, [src+12] - movu m2, [src+14] - mova m1, [src+16] + mova m6, [src+0] + movu m3, [src-4+mmsize] + movu m2, [src-2+mmsize] + mova m1, [src+0+mmsize] paddw m4, [src+6] paddw m5, [src+4] paddw m6, [src+2] - paddw m3, [src+22] - paddw m2, [src+20] - paddw m1, [src+18] + paddw m3, [src+6+mmsize] + paddw m2, [src+4+mmsize] + paddw m1, [src+2+mmsize] FILT_H2 m4, m5, m6, m3, m2, m1 %else mova m0, [src-16] @@ -506,9 +525,12 @@ paddw m6, m0 FILT_H m3, m5, m6 %endif - FILT_PACK m4, m3, 6, tpw_32 - movntps [r0+r2], m4 - add r2, 16 + FILT_PACK m4, m3, pw_rnd, 6 +%if mmsize==32 + vpermq m4, m4, q3120 +%endif + movnta [r0+r2], m4 + add r2, mmsize jl .loop RET %endmacro @@ -516,7 +538,8 @@ ;----------------------------------------------------------------------------- ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- -cglobal hpel_filter_h_sse2, 3,3,8 +INIT_XMM sse2 +cglobal hpel_filter_h, 3,3,8 add r0, r2 add r1, r2 neg r2 @@ -555,7 +578,7 @@ paddw m6, m7 ; c1 mova m7, [pw_1] ; FIXME xmm8 FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 1, m7 + FILT_PACK m1, m4, m7, 1 movntps [r0+r2], m1 add r2, 16 jl .loop @@ -572,7 +595,7 @@ %define src r1+r2 mova m0, [src-16] mova m1, [src] - mova m7, [pw_16] + mova m7, [pw_1024] .loop: mova m2, [src+16] ; Using unaligned loads instead of palignr is marginally slower on SB and significantly @@ -594,7 +617,7 @@ paddw m3, m1 paddw m4, m5 paddw m4, m6 - FILT_PACK m3, m4, 5, m7 + FILT_PACK m3, m4, m7, 5 pshufb m3, [hpel_shuf] mova m1, m2 movntps [r0+r2], m3 @@ -620,6 +643,45 @@ HPEL_C HPEL_V 0 HPEL_H +INIT_YMM avx2 +HPEL_V 8 +HPEL_C + +INIT_YMM avx2 +cglobal hpel_filter_h, 3,3,8 + add r0, r2 + add r1, r2 + neg r2 + %define src r1+r2 + mova m5, [filt_mul15] + mova m6, [filt_mul20] + mova m7, [filt_mul51] +.loop: + movu m0, [src-2] + movu m1, [src-1] + movu m2, [src+2] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m7 + paddw m0, m2 + + mova m2, [src+0] + movu m3, [src+1] + movu m4, [src+3] + pmaddubsw m2, m6 + pmaddubsw m3, m6 + pmaddubsw m4, m7 + paddw m0, m2 + paddw m1, m3 + paddw m1, m4 + + mova m2, [pw_1024] + FILT_PACK m0, m1, m2, 5 + pshufb m0, [hpel_shuf] + movnta [r0+r2], m0 + add r2, mmsize + jl .loop + RET %endif %if ARCH_X86_64 @@ -627,9 +689,9 @@ ;The optimum prefetch distance is difficult to determine in checkasm: ;any prefetch seems slower than not prefetching. ;In real use, the prefetch seems to be a slight win. - ;+16 is picked somewhat arbitrarily here based on the fact that even one + ;+mmsize is picked somewhat arbitrarily here based on the fact that even one ;loop iteration is going to take longer than the prefetch. 
- prefetcht0 [r1+r2*2+16] + prefetcht0 [r1+r2*2+mmsize] %if cpuflag(ssse3) mova m1, [r3] mova m2, [r3+r2] @@ -662,31 +724,48 @@ packuswb %3, %4 FILT_V2 m1, m2, m3, m4, m5, m6 %endif - add r3, 16 - add r1, 16 + add r3, mmsize + add r1, mmsize +%if mmsize==32 + vinserti128 %1, m1, xm4, 1 + vperm2i128 %2, m1, m4, q0301 +%else mova %1, m1 mova %2, m4 - FILT_PACK m1, m4, 5, m15 +%endif + FILT_PACK m1, m4, m15, 5 movntps [r8+r4+%5], m1 %endmacro -%macro FILT_C 4 - PALIGNR m1, %2, %1, 12, m2 - PALIGNR m2, %2, %1, 14, %1 +%macro FILT_C 3 +%if mmsize==32 + vperm2i128 m3, %2, %1, q0003 +%endif + PALIGNR m1, %2, %1, (mmsize-4), m3 + PALIGNR m2, %2, %1, (mmsize-2), m3 +%if mmsize==32 + vperm2i128 %1, %3, %2, q0003 +%endif PALIGNR m3, %3, %2, 4, %1 PALIGNR m4, %3, %2, 2, %1 paddw m3, m2 +%if mmsize==32 + mova m2, %1 +%endif mova %1, %3 - PALIGNR %3, %2, 6, m2 + PALIGNR %3, %3, %2, 6, m2 paddw m4, %2 paddw %3, m1 FILT_H %3, m3, m4 %endmacro %macro DO_FILT_C 4 - FILT_C %1, %2, %3, 6 - FILT_C %2, %1, %4, 6 - FILT_PACK %3, %4, 6, m15 + FILT_C %1, %2, %3 + FILT_C %2, %1, %4 + FILT_PACK %3, %4, m15, 6 +%if mmsize==32 + vpermq %3, %3, q3120 +%endif movntps [r5+r4], %3 %endmacro @@ -700,8 +779,14 @@ %endmacro %macro DO_FILT_H 3 - PALIGNR m1, %2, %1, 14, m3 - PALIGNR m2, %2, %1, 15, m3 +%if mmsize==32 + vperm2i128 m3, %2, %1, q0003 +%endif + PALIGNR m1, %2, %1, (mmsize-2), m3 + PALIGNR m2, %2, %1, (mmsize-1), m3 +%if mmsize==32 + vperm2i128 m3, %3, %2, q0003 +%endif PALIGNR m4, %3, %2, 1 , m3 PALIGNR m5, %3, %2, 2 , m3 PALIGNR m6, %3, %2, 3 , m3 @@ -717,14 +802,14 @@ paddw m2, m4 paddw m1, m5 paddw m2, m6 - FILT_PACK m1, m2, 5, m15 + FILT_PACK m1, m2, m15, 5 pshufb m1, [hpel_shuf] %else ; ssse3, avx ADD8TO16 m1, m6, m12, m3, m0 ; a ADD8TO16 m2, m5, m12, m3, m0 ; b ADD8TO16 %2, m4, m12, m3, m0 ; c FILT_V2 m1, m2, %2, m6, m5, m4 - FILT_PACK m1, m6, 5, m15 + FILT_PACK m1, m6, m15, 5 %endif movntps [r0+r4], m1 mova %2, %3 @@ -737,9 +822,9 @@ ;----------------------------------------------------------------------------- cglobal hpel_filter, 7,9,16 mov r7, r3 - sub r5d, 16 + sub r5d, mmsize mov r8, r1 - and r7, 15 + and r7, mmsize-1 sub r3, r7 add r0, r5 add r8, r5 @@ -751,13 +836,14 @@ sub r3, r2 sub r3, r2 mov r4, r7 - mova m15, [pw_16] %if cpuflag(ssse3) mova m0, [filt_mul51] mova m12, [filt_mul15] mova m14, [filt_mul20] + mova m15, [pw_1024] %else pxor m0, m0 + mova m15, [pw_16] %endif ;ALIGN 16 .loopy: @@ -765,16 +851,24 @@ DO_FILT_V m8, m7, m13, m12, 0 ;ALIGN 16 .loopx: - DO_FILT_V m6, m5, m11, m12, 16 + DO_FILT_V m6, m5, m11, m12, mmsize .lastx: +%if cpuflag(ssse3) + psrlw m15, 1 ; pw_512 +%else paddw m15, m15 ; pw_32 +%endif DO_FILT_C m9, m8, m7, m6 - psrlw m15, 1 ; pw_16 - movdqa m7, m5 +%if cpuflag(ssse3) + paddw m15, m15 ; pw_1024 +%else + psrlw m15, 1 ; pw_16 +%endif + mova m7, m5 DO_FILT_H m10, m13, m11 - add r4, 16 + add r4, mmsize jl .loopx - cmp r4, 16 + cmp r4, mmsize jl .lastx ; setup regs for next y sub r4, r7 @@ -797,6 +891,8 @@ HPEL INIT_XMM avx HPEL +INIT_YMM avx2 +HPEL %endif ; ARCH_X86_64 %undef movntq @@ -1131,115 +1227,109 @@ ;----------------------------------------------------------------------------- ; void *memcpy_aligned( void *dst, const void *src, size_t n ); ;----------------------------------------------------------------------------- -INIT_MMX -cglobal memcpy_aligned_mmx, 3,3 - test r2d, 16 - jz .copy32start - movq mm0, [r1 + r2 - 16] - movq mm1, [r1 + r2 - 8] - movq [r0 + r2 - 16], mm0 - movq [r0 + r2 - 8], mm1 - sub r2d, 16 -.copy32start - test r2d, r2d - jz .ret -.copy32: - movq 
mm0, [r1 + r2 - 32] - movq mm1, [r1 + r2 - 24] - movq mm2, [r1 + r2 - 16] - movq mm3, [r1 + r2 - 8] - movq [r0 + r2 - 32], mm0 - movq [r0 + r2 - 24], mm1 - movq [r0 + r2 - 16], mm2 - movq [r0 + r2 - 8], mm3 - sub r2d, 32 - jg .copy32 -.ret - RET - -;----------------------------------------------------------------------------- -; void *memcpy_aligned( void *dst, const void *src, size_t n ); -;----------------------------------------------------------------------------- -cglobal memcpy_aligned_sse2, 3,3 +%macro MEMCPY 0 +cglobal memcpy_aligned, 3,3 +%if mmsize == 16 test r2d, 16 - jz .copy32 - movdqa xmm0, [r1 + r2 - 16] - movdqa [r0 + r2 - 16], xmm0 + jz .copy2 + mova m0, [r1+r2-16] + mova [r0+r2-16], m0 sub r2d, 16 -.copy32: - test r2d, 32 - jz .copy64start - movdqa xmm0, [r1 + r2 - 32] - movdqa [r0 + r2 - 32], xmm0 - movdqa xmm1, [r1 + r2 - 16] - movdqa [r0 + r2 - 16], xmm1 - sub r2d, 32 -.copy64start +.copy2: +%endif + test r2d, 2*mmsize + jz .copy4start + mova m0, [r1+r2-1*mmsize] + mova m1, [r1+r2-2*mmsize] + mova [r0+r2-1*mmsize], m0 + mova [r0+r2-2*mmsize], m1 + sub r2d, 2*mmsize +.copy4start: test r2d, r2d jz .ret -.copy64: - movdqa xmm0, [r1 + r2 - 64] - movdqa [r0 + r2 - 64], xmm0 - movdqa xmm1, [r1 + r2 - 48] - movdqa [r0 + r2 - 48], xmm1 - movdqa xmm2, [r1 + r2 - 32] - movdqa [r0 + r2 - 32], xmm2 - movdqa xmm3, [r1 + r2 - 16] - movdqa [r0 + r2 - 16], xmm3 - sub r2d, 64 - jg .copy64 +.copy4: + mova m0, [r1+r2-1*mmsize] + mova m1, [r1+r2-2*mmsize] + mova m2, [r1+r2-3*mmsize] + mova m3, [r1+r2-4*mmsize] + mova [r0+r2-1*mmsize], m0 + mova [r0+r2-2*mmsize], m1 + mova [r0+r2-3*mmsize], m2 + mova [r0+r2-4*mmsize], m3 + sub r2d, 4*mmsize + jg .copy4 .ret: REP_RET +%endmacro + +INIT_MMX mmx +MEMCPY +INIT_XMM sse +MEMCPY ;----------------------------------------------------------------------------- ; void *memzero_aligned( void *dst, size_t n ); ;----------------------------------------------------------------------------- -%macro MEMZERO 0 +%macro MEMZERO 1 cglobal memzero_aligned, 2,2 add r0, r1 neg r1 +%if mmsize == 8 pxor m0, m0 +%else + xorps m0, m0 +%endif .loop: %assign i 0 -%rep 8 +%rep %1 mova [r0 + r1 + i], m0 %assign i i+mmsize %endrep - add r1, mmsize*8 + add r1, mmsize*%1 jl .loop RET %endmacro INIT_MMX mmx -MEMZERO -INIT_XMM sse2 -MEMZERO - - +MEMZERO 8 +INIT_XMM sse +MEMZERO 8 +INIT_YMM avx +MEMZERO 4 %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- ; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride ) ;----------------------------------------------------------------------------- -INIT_XMM -cglobal integral_init4h_sse4, 3,4 +%macro INTEGRAL_INIT4H 0 +cglobal integral_init4h, 3,4 lea r3, [r0+r2*2] add r1, r2 neg r2 pxor m4, m4 .loop: - movdqa m0, [r1+r2] - movdqa m1, [r1+r2+16] + mova m0, [r1+r2] +%if mmsize==32 + movu m1, [r1+r2+8] +%else + mova m1, [r1+r2+16] palignr m1, m0, 8 +%endif mpsadbw m0, m4, 0 mpsadbw m1, m4, 0 paddw m0, [r0+r2*2] - paddw m1, [r0+r2*2+16] - movdqa [r3+r2*2 ], m0 - movdqa [r3+r2*2+16], m1 - add r2, 16 + paddw m1, [r0+r2*2+mmsize] + mova [r3+r2*2 ], m0 + mova [r3+r2*2+mmsize], m1 + add r2, mmsize jl .loop RET +%endmacro + +INIT_XMM sse4 +INTEGRAL_INIT4H +INIT_YMM avx2 +INTEGRAL_INIT4H %macro INTEGRAL_INIT8H 0 cglobal integral_init8h, 3,4 @@ -1248,20 +1338,26 @@ neg r2 pxor m4, m4 .loop: - movdqa m0, [r1+r2] - movdqa m1, [r1+r2+16] + mova m0, [r1+r2] +%if mmsize==32 + movu m1, [r1+r2+8] + mpsadbw m2, m0, m4, 100100b + mpsadbw m3, m1, m4, 100100b +%else + mova m1, [r1+r2+16] 
palignr m1, m0, 8 - mpsadbw m2, m0, m4, 4 - mpsadbw m3, m1, m4, 4 + mpsadbw m2, m0, m4, 100b + mpsadbw m3, m1, m4, 100b +%endif mpsadbw m0, m4, 0 mpsadbw m1, m4, 0 paddw m0, [r0+r2*2] - paddw m1, [r0+r2*2+16] + paddw m1, [r0+r2*2+mmsize] paddw m0, m2 paddw m1, m3 - movdqa [r3+r2*2 ], m0 - movdqa [r3+r2*2+16], m1 - add r2, 16 + mova [r3+r2*2 ], m0 + mova [r3+r2*2+mmsize], m1 + add r2, mmsize jl .loop RET %endmacro @@ -1270,6 +1366,8 @@ INTEGRAL_INIT8H INIT_XMM avx INTEGRAL_INIT8H +INIT_YMM avx2 +INTEGRAL_INIT8H %endif ; !HIGH_BIT_DEPTH %macro INTEGRAL_INIT_8V 0 @@ -1277,7 +1375,7 @@ ; void integral_init8v( uint16_t *sum8, intptr_t stride ) ;----------------------------------------------------------------------------- cglobal integral_init8v, 3,3 - shl r1, 1 + add r1, r1 add r0, r1 lea r2, [r0+r1*8] neg r1 @@ -1297,12 +1395,14 @@ INTEGRAL_INIT_8V INIT_XMM sse2 INTEGRAL_INIT_8V +INIT_YMM avx2 +INTEGRAL_INIT_8V ;----------------------------------------------------------------------------- ; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride ) ;----------------------------------------------------------------------------- -INIT_MMX -cglobal integral_init4v_mmx, 3,5 +INIT_MMX mmx +cglobal integral_init4v, 3,5 shl r2, 1 lea r3, [r0+r2*4] lea r4, [r0+r2*8] @@ -1323,8 +1423,8 @@ jge .loop RET -INIT_XMM -cglobal integral_init4v_sse2, 3,5 +INIT_XMM sse2 +cglobal integral_init4v, 3,5 shl r2, 1 add r0, r2 add r1, r2 @@ -1349,7 +1449,8 @@ jl .loop RET -cglobal integral_init4v_ssse3, 3,5 +INIT_XMM ssse3 +cglobal integral_init4v, 3,5 shl r2, 1 add r0, r2 add r1, r2 @@ -1374,6 +1475,28 @@ jl .loop RET +INIT_YMM avx2 +cglobal integral_init4v, 3,5 + add r2, r2 + add r0, r2 + add r1, r2 + lea r3, [r0+r2*4] + lea r4, [r0+r2*8] + neg r2 +.loop: + mova m2, [r0+r2] + movu m1, [r4+r2+8] + paddw m0, m2, [r0+r2+8] + paddw m1, [r4+r2] + mova m3, [r3+r2] + psubw m1, m0 + psubw m3, m2 + mova [r0+r2], m1 + mova [r1+r2], m3 + add r2, 32 + jl .loop + RET + %macro FILT8x4 7 mova %3, [r0+%7] mova %4, [r0+r5+%7] @@ -1394,6 +1517,43 @@ %endif %endmacro +%macro FILT32x4U 4 + mova m1, [r0+r5] + pavgb m0, m1, [r0] + movu m3, [r0+r5+1] + pavgb m2, m3, [r0+1] + pavgb m1, [r0+r5*2] + pavgb m3, [r0+r5*2+1] + pavgb m0, m2 + pavgb m1, m3 + + mova m3, [r0+r5+mmsize] + pavgb m2, m3, [r0+mmsize] + movu m5, [r0+r5+1+mmsize] + pavgb m4, m5, [r0+1+mmsize] + pavgb m3, [r0+r5*2+mmsize] + pavgb m5, [r0+r5*2+1+mmsize] + pavgb m2, m4 + pavgb m3, m5 + + pshufb m0, m7 + pshufb m1, m7 + pshufb m2, m7 + pshufb m3, m7 + punpckhqdq m4, m0, m2 + punpcklqdq m0, m0, m2 + punpckhqdq m5, m1, m3 + punpcklqdq m2, m1, m3 + vpermq m0, m0, q3120 + vpermq m1, m4, q3120 + vpermq m2, m2, q3120 + vpermq m3, m5, q3120 + mova [%1], m0 + mova [%2], m1 + mova [%3], m2 + mova [%4], m3 +%endmacro + %macro FILT16x2 4 mova m3, [r0+%4+mmsize] mova m2, [r0+%4] @@ -1497,6 +1657,10 @@ FIX_STRIDES r5 shl dword r7m, 1 %endif +%if mmsize >= 16 + add dword r7m, mmsize-1 + and dword r7m, ~(mmsize-1) +%endif ; src += 2*(height-1)*stride + 2*width mov r6d, r8m dec r6d @@ -1554,17 +1718,9 @@ sub r6d, mmsize jg .hloop %else ; !HIGH_BIT_DEPTH -%if mmsize == 16 - ; adjust for the odd end case - mov r6d, r7m - and r6d, 8 - sub r1, r6 - sub r2, r6 - sub r3, r6 - sub r4, r6 - add dst_gap, r6d -%endif ; mmsize -%if cpuflag(xop) +%if cpuflag(avx2) + mova m7, [deinterleave_shuf] +%elif cpuflag(xop) mova m6, [deinterleave_shuf32a] mova m7, [deinterleave_shuf32b] %else @@ -1574,44 +1730,22 @@ .vloop: mov r6d, r7m %ifnidn cpuname, mmx2 +%if mmsize <= 16 mova m0, [r0] mova m1, 
[r0+r5] pavgb m0, m1 pavgb m1, [r0+r5*2] %endif -%if mmsize == 16 - test r6d, 8 - jz .hloop - sub r0, 16 - FILT8x4 m0, m1, m2, m3, m4, m5, 0 -%if cpuflag(xop) - mova m4, m0 - vpperm m0, m4, m1, m6 - vpperm m1, m4, m1, m7 - movq [r1], m0 - movq [r2], m1 - movhps [r3], m0 - movhps [r4], m1 -%else - packuswb m0, m4 - packuswb m1, m5 - movq [r1], m0 - movhps [r2], m0 - movq [r3], m1 - movhps [r4], m1 %endif - mova m0, m2 - mova m1, m3 - sub r6d, 8 - jz .skip -%endif ; mmsize .hloop: sub r0, mmsize*2 sub r1, mmsize sub r2, mmsize sub r3, mmsize sub r4, mmsize -%ifdef m8 +%if mmsize==32 + FILT32x4U r1, r2, r3, r4 +%elifdef m8 FILT8x4 m0, m1, m2, m3, m10, m11, mmsize mova m8, m0 mova m9, m1 @@ -1669,6 +1803,10 @@ FRAME_INIT_LOWRES INIT_XMM xop FRAME_INIT_LOWRES +%if HIGH_BIT_DEPTH==0 +INIT_YMM avx2 +FRAME_INIT_LOWRES +%endif ;----------------------------------------------------------------------------- ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, @@ -1741,68 +1879,76 @@ INIT_XMM fma4 MBTREE -%macro INT16_TO_FLOAT 1 -%if cpuflag(avx2) - vpmovzxwd ymm%1, xmm%1 -%else - vpunpckhwd xmm4, xmm%1, xmm7 - vpunpcklwd xmm%1, xmm7 - vinsertf128 ymm%1, ymm%1, xmm4, 1 -%endif - vcvtdq2ps ymm%1, ymm%1 +%macro INT16_UNPACK 1 + vpunpckhwd xm4, xm%1, xm7 + vpunpcklwd xm%1, xm7 + vinsertf128 m%1, m%1, xm4, 1 %endmacro ; FIXME: align loads/stores to 16 bytes %macro MBTREE_AVX 0 cglobal mbtree_propagate_cost, 7,7,8 - add r6d, r6d - lea r0, [r0+r6*2] - add r1, r6 - add r2, r6 - add r3, r6 - add r4, r6 - neg r6 - vmovdqa xmm5, [pw_3fff] - vbroadcastss ymm6, [r5] - vmulps ymm6, ymm6, [pf_inv256] + add r6d, r6d + lea r0, [r0+r6*2] + add r1, r6 + add r2, r6 + add r3, r6 + add r4, r6 + neg r6 + mova xm5, [pw_3fff] + vbroadcastss m6, [r5] + mulps m6, [pf_inv256] %if notcpuflag(avx2) - vpxor xmm7, xmm7 + pxor xm7, xm7 %endif .loop: - vmovdqu xmm0, [r2+r6] ; intra - vmovdqu xmm1, [r4+r6] ; invq - vmovdqu xmm2, [r1+r6] ; prop - vpand xmm3, xmm5, [r3+r6] ; inter - INT16_TO_FLOAT 0 - INT16_TO_FLOAT 1 - INT16_TO_FLOAT 2 - INT16_TO_FLOAT 3 -%if cpuflag(fma3) - vmulps ymm1, ymm1, ymm0 - vsubps ymm4, ymm0, ymm3 - fmaddps ymm1, ymm1, ymm6, ymm2 - vrcpps ymm3, ymm0 - vmulps ymm2, ymm0, ymm3 - vmulps ymm1, ymm1, ymm4 - vaddps ymm4, ymm3, ymm3 - fnmaddps ymm4, ymm2, ymm3, ymm4 - vmulps ymm1, ymm1, ymm4 +%if cpuflag(avx2) + pmovzxwd m0, [r2+r6] ; intra + pmovzxwd m1, [r4+r6] ; invq + pmovzxwd m2, [r1+r6] ; prop + pand xm3, xm5, [r3+r6] ; inter + pmovzxwd m3, xm3 + pmaddwd m1, m0 + psubd m4, m0, m3 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m4, m4 + fmaddps m1, m1, m6, m2 + rcpps m3, m0 + mulps m2, m0, m3 + mulps m1, m4 + addps m4, m3, m3 + fnmaddps m4, m2, m3, m4 + mulps m1, m4 %else - vmulps ymm1, ymm1, ymm0 - vsubps ymm4, ymm0, ymm3 - vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8 - vaddps ymm1, ymm1, ymm2 ; prop + (intra*invq*fps_factor>>8) - vrcpps ymm3, ymm0 ; 1 / intra 1st approximation - vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx) - vmulps ymm2, ymm2, ymm3 ; intra * (1/intra 1st approx)^2 - vmulps ymm1, ymm1, ymm4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) - vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx) - vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra - vmulps ymm1, ymm1, ymm3 ; / intra + movu xm0, [r2+r6] + movu xm1, [r4+r6] + movu xm2, [r1+r6] + pand xm3, xm5, [r3+r6] + INT16_UNPACK 0 + INT16_UNPACK 1 + INT16_UNPACK 2 + INT16_UNPACK 3 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + mulps m1, m0 + 
subps m4, m0, m3 + mulps m1, m6 ; intra*invq*fps_factor>>8 + addps m1, m2 ; prop + (intra*invq*fps_factor>>8) + rcpps m3, m0 ; 1 / intra 1st approximation + mulps m2, m0, m3 ; intra * (1/intra 1st approx) + mulps m2, m3 ; intra * (1/intra 1st approx)^2 + mulps m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) + addps m3, m3 ; 2 * (1/intra 1st approx) + subps m3, m2 ; 2nd approximation for 1/intra + mulps m1, m3 ; / intra %endif - vcvtps2dq ymm1, ymm1 - vmovdqu [r0+r6*2], ymm1 - add r6, 16 + vcvtps2dq m1, m1 + movu [r0+r6*2], m1 + add r6, 16 jl .loop RET %endmacro
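The mbtree_propagate_cost kernels compute (prop + intra*invq*fps_factor/256) * (intra - inter) / intra per block, with inter masked to 14 bits (pw_3fff), and avoid a real division by refining rcpps' roughly 12-bit reciprocal estimate with one Newton-Raphson step, x1 = x0*(2 - intra*x0), which is what the fnmaddps sequence implements. A scalar model of the loop (my restatement for illustration, not the C reference in x264; cvtps2dq rounds to nearest, approximated here with +0.5f):

#include <stdint.h>

/* Illustrative scalar version of the propagate-cost loop above. */
static void propagate_cost_model( int *dst, const uint16_t *prop, const uint16_t *intra,
                                  const uint16_t *inter, const uint16_t *invq,
                                  float fps_factor, int len )
{
    float fps = fps_factor * (1.0f / 256.0f);      /* the pf_inv256 multiply */
    for( int i = 0; i < len; i++ )
    {
        float ic  = intra[i];
        float amt = prop[i] + ic * invq[i] * fps;  /* prop + intra*invq*fps_factor>>8 */
        float x0  = 1.0f / ic;                     /* stands in for rcpps' estimate */
        float x1  = x0 * (2.0f - ic * x0);         /* one Newton-Raphson refinement */
        dst[i] = (int)( amt * (ic - (inter[i] & 0x3fff)) * x1 + 0.5f );
    }
}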
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-c.c -> x264-snapshot-20130723-2245.tar.bz2/common/x86/mc-c.c
Changed
@@ -35,7 +35,8 @@ #define DECL_SUF( func, args )\ void func##_mmx2 args;\ void func##_sse2 args;\ - void func##_ssse3 args; + void func##_ssse3 args;\ + void func##_avx2 args; DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) @@ -72,15 +73,20 @@ MC_WEIGHT( 12, ssse3 ) MC_WEIGHT( 16, ssse3 ) MC_WEIGHT( 20, ssse3 ) +MC_WEIGHT( 8, avx2 ) +MC_WEIGHT( 16, avx2 ) +MC_WEIGHT( 20, avx2 ) #undef MC_OFFSET #undef MC_WEIGHT -void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w16_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w16_sse2( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w16_aligned_sse2( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w8_sse ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_mmx( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_sse( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_aligned_sse( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int ); +void x264_mc_copy_w16_aligned_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int ); void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); @@ -121,18 +127,23 @@ void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); -void *x264_memcpy_aligned_mmx ( void *dst, const void *src, size_t n ); -void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n ); -void x264_memzero_aligned_mmx ( void *dst, size_t n ); -void x264_memzero_aligned_sse2( void *dst, size_t n ); +void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n ); +void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n ); +void x264_memzero_aligned_mmx( void *dst, size_t n ); +void x264_memzero_aligned_sse( void *dst, size_t n ); +void x264_memzero_aligned_avx( void *dst, size_t n ); void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init8h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init4v_mmx ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); +void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride ); void 
x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride ); +void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride ); void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, @@ -151,7 +162,7 @@ MC_CHROMA(ssse3) MC_CHROMA(ssse3_cache64) MC_CHROMA(avx) -MC_CHROMA(avx_cache64) +MC_CHROMA(avx2) #define LOWRES(cpu)\ void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\ @@ -162,6 +173,7 @@ LOWRES(ssse3) LOWRES(avx) LOWRES(xop) +LOWRES(avx2) #define PIXEL_AVG_W(width,cpu)\ void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ); @@ -176,6 +188,7 @@ PIXEL_AVG_WALL(sse2) PIXEL_AVG_WALL(sse2_misalign) PIXEL_AVG_WALL(cache64_ssse3) +PIXEL_AVG_WALL(avx2) #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\ static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\ @@ -194,6 +207,8 @@ #define x264_pixel_avg2_w20_mmx2 x264_pixel_avg2_w18_mmx2 #define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w10_sse2 #define x264_pixel_avg2_w20_sse2 x264_pixel_avg2_w18_sse2 +#define x264_pixel_avg2_w12_avx2 x264_pixel_avg2_w16_avx2 +#define x264_pixel_avg2_w20_avx2 x264_pixel_avg2_w18_avx2 #else /* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */ #define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3 @@ -205,6 +220,7 @@ PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2) #if HIGH_BIT_DEPTH PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2) +PIXEL_AVG_WTAB(avx2, mmx2, sse2, avx2, avx2, avx2) #else // !HIGH_BIT_DEPTH #if ARCH_X86 PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2) @@ -214,6 +230,8 @@ PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign) PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2) PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2) +PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2) +PIXEL_AVG_WTAB(avx2, mmx2, mmx2, sse2, sse2, avx2) #endif // HIGH_BIT_DEPTH #define MC_COPY_WTAB(instr, name1, name2, name3)\ @@ -228,9 +246,10 @@ MC_COPY_WTAB(mmx,mmx,mmx,mmx) #if HIGH_BIT_DEPTH -MC_COPY_WTAB(sse2,mmx,sse2,sse2) +MC_COPY_WTAB(sse,mmx,sse,sse) +MC_COPY_WTAB(avx,mmx,sse,avx) #else -MC_COPY_WTAB(sse2,mmx,mmx,sse2) +MC_COPY_WTAB(sse,mmx,mmx,sse) #endif #define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\ @@ -282,6 +301,7 @@ MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16) MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16) MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16) +MC_WEIGHT_WTAB(weight,avx2,ssse3,avx2,16) static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w ) { @@ -357,14 +377,17 @@ } MC_LUMA(mmx2,mmx2,mmx) -MC_LUMA(sse2,sse2,sse2) -#if !HIGH_BIT_DEPTH +MC_LUMA(sse2,sse2,sse) +#if HIGH_BIT_DEPTH +MC_LUMA(avx2,avx2,avx) +#else #if ARCH_X86 MC_LUMA(cache32_mmx2,cache32_mmx2,mmx) MC_LUMA(cache64_mmx2,cache64_mmx2,mmx) #endif -MC_LUMA(cache64_sse2,cache64_sse2,sse2) -MC_LUMA(cache64_ssse3,cache64_ssse3,sse2) +MC_LUMA(cache64_sse2,cache64_sse2,sse) +MC_LUMA(cache64_ssse3,cache64_ssse3,sse) +MC_LUMA(cache64_ssse3_atom,cache64_ssse3_atom,sse) #endif // !HIGH_BIT_DEPTH #define GET_REF(name)\ @@ -400,6 +423,7 @@ 
GET_REF(mmx2) GET_REF(sse2) +GET_REF(avx2) #if !HIGH_BIT_DEPTH #if ARCH_X86 GET_REF(cache32_mmx2) @@ -408,6 +432,7 @@ GET_REF(sse2_misalign) GET_REF(cache64_sse2) GET_REF(cache64_ssse3) +GET_REF(cache64_ssse3_atom) #endif // !HIGH_BIT_DEPTH #define HPEL(align, cpu, cpuv, cpuc, cpuh)\ @@ -425,8 +450,8 @@ width += realign;\ while( height-- )\ {\ - x264_hpel_filter_v_##cpuv( dstv, src, buf+8, stride, width );\ - x264_hpel_filter_c_##cpuc( dstc, buf+8, width );\ + x264_hpel_filter_v_##cpuv( dstv, src, buf+16, stride, width );\ + x264_hpel_filter_c_##cpuc( dstc, buf+16, width );\ x264_hpel_filter_h_##cpuh( dsth, src, width );\ dsth += stride;\ dstv += stride;\ @@ -445,10 +470,12 @@ void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); void x264_hpel_filter_avx ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); +void x264_hpel_filter_avx2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); #else HPEL(16, sse2, sse2, sse2, sse2) HPEL(16, ssse3, ssse3, ssse3, ssse3) HPEL(16, avx, avx, avx, avx) +HPEL(32, avx2, avx2, avx2, avx2) #endif HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2) #endif // HIGH_BIT_DEPTH @@ -545,6 +572,12 @@ pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmx2; + if( cpu&X264_CPU_SSE ) + { + pf->memcpy_aligned = x264_memcpy_aligned_sse; + pf->memzero_aligned = x264_memzero_aligned_sse; + } + #if HIGH_BIT_DEPTH #if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) ) @@ -569,8 +602,6 @@ pf->hpel_filter = x264_hpel_filter_sse2; } - pf->memcpy_aligned = x264_memcpy_aligned_sse2; - pf->memzero_aligned = x264_memzero_aligned_sse2; pf->integral_init4v = x264_integral_init4v_sse2; pf->integral_init8v = x264_integral_init8v_sse2; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2; @@ -591,7 +622,7 @@ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_sse2; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_sse2; - pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2; + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse; pf->weight = x264_mc_weight_wtab_sse2; if( !(cpu&X264_CPU_STACK_MOD4) ) @@ -602,7 +633,7 @@ pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; - if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) ) + if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) ) pf->integral_init4v = x264_integral_init4v_ssse3; if( !(cpu&X264_CPU_AVX) ) @@ -614,12 +645,16 @@ pf->plane_copy_interleave = x264_plane_copy_interleave_avx; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx; pf->store_interleave_chroma = x264_store_interleave_chroma_avx; + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_avx; if( cpu&X264_CPU_XOP ) pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop; + + if( cpu&X264_CPU_AVX2 ) + pf->mc_luma = mc_luma_avx2; #else // !HIGH_BIT_DEPTH #if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead @@ -640,55 +675,53 @@ if( !(cpu&X264_CPU_SSE2) ) return; - pf->memcpy_aligned = x264_memcpy_aligned_sse2; - pf->memzero_aligned = x264_memzero_aligned_sse2; pf->integral_init4v = 
x264_integral_init4v_sse2; pf->integral_init8v = x264_integral_init8v_sse2; pf->hpel_filter = x264_hpel_filter_sse2_amd; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2; - if( cpu&X264_CPU_SSE2_IS_SLOW ) - return; - - pf->weight = x264_mc_weight_wtab_sse2; - if( !(cpu&X264_CPU_SLOW_ATOM) ) + if( !(cpu&X264_CPU_SSE2_IS_SLOW) ) { - pf->offsetadd = x264_mc_offsetadd_wtab_sse2; - pf->offsetsub = x264_mc_offsetsub_wtab_sse2; - } - - pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2; - pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2; - pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2; - pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2; - pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2; - pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2; - pf->hpel_filter = x264_hpel_filter_sse2; - if( cpu&X264_CPU_SSE_MISALIGN ) - pf->hpel_filter = x264_hpel_filter_sse2_misalign; - pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; - if( !(cpu&X264_CPU_STACK_MOD4) ) - pf->mc_chroma = x264_mc_chroma_sse2; - - if( cpu&X264_CPU_SSE2_IS_FAST ) - { - pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium? - pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2; - pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2; - pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; - pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; - pf->mc_luma = mc_luma_sse2; - pf->get_ref = get_ref_sse2; - if( cpu&X264_CPU_CACHELINE_64 ) + pf->weight = x264_mc_weight_wtab_sse2; + if( !(cpu&X264_CPU_SLOW_ATOM) ) { - pf->mc_luma = mc_luma_cache64_sse2; - pf->get_ref = get_ref_cache64_sse2; + pf->offsetadd = x264_mc_offsetadd_wtab_sse2; + pf->offsetsub = x264_mc_offsetsub_wtab_sse2; } + + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse; + pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2; + pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2; + pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2; + pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2; + pf->hpel_filter = x264_hpel_filter_sse2; if( cpu&X264_CPU_SSE_MISALIGN ) + pf->hpel_filter = x264_hpel_filter_sse2_misalign; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_sse2; + + if( cpu&X264_CPU_SSE2_IS_FAST ) { - pf->get_ref = get_ref_sse2_misalign; - if( !(cpu&X264_CPU_STACK_MOD4) ) - pf->mc_chroma = x264_mc_chroma_sse2_misalign; + pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium? 
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2; + pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; + pf->mc_luma = mc_luma_sse2; + pf->get_ref = get_ref_sse2; + if( cpu&X264_CPU_CACHELINE_64 ) + { + pf->mc_luma = mc_luma_cache64_sse2; + pf->get_ref = get_ref_cache64_sse2; + } + if( cpu&X264_CPU_SSE_MISALIGN ) + { + pf->get_ref = get_ref_sse2_misalign; + if( !(cpu&X264_CPU_STACK_MOD4) ) + pf->mc_chroma = x264_mc_chroma_sse2_misalign; + } } } @@ -705,12 +738,21 @@ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; - pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3; - pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3; - pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3; + if( !(cpu&X264_CPU_SLOW_PSHUFB) ) + { + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3; + } - pf->hpel_filter = x264_hpel_filter_ssse3; - pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; + if( !(cpu&X264_CPU_SLOW_PALIGNR) ) + { +#if ARCH_X86_64 + if( !(cpu&X264_CPU_SLOW_ATOM) ) /* The 64-bit version is slower, but the 32-bit version is faster? */ +#endif + pf->hpel_filter = x264_hpel_filter_ssse3; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; + } if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_ssse3; @@ -720,13 +762,17 @@ pf->mc_chroma = x264_mc_chroma_ssse3_cache64; pf->mc_luma = mc_luma_cache64_ssse3; pf->get_ref = get_ref_cache64_ssse3; - - /* ssse3 weight is slower on Nehalem, so only assign here. */ - pf->weight_cache = x264_weight_cache_ssse3; - pf->weight = x264_mc_weight_wtab_ssse3; + if( cpu&X264_CPU_SLOW_ATOM ) + { + pf->mc_luma = mc_luma_cache64_ssse3_atom; + pf->get_ref = get_ref_cache64_ssse3_atom; + } } - if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) ) + pf->weight_cache = x264_weight_cache_ssse3; + pf->weight = x264_mc_weight_wtab_ssse3; + + if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) ) pf->integral_init4v = x264_integral_init4v_ssse3; if( !(cpu&X264_CPU_SSE4) ) @@ -742,18 +788,30 @@ pf->integral_init8h = x264_integral_init8h_avx; pf->hpel_filter = x264_hpel_filter_avx; - /* ssse3 weight seems to be faster again on Sandy Bridge and Bulldozer. 
*/ - pf->weight_cache = x264_weight_cache_ssse3; - pf->weight = x264_mc_weight_wtab_ssse3; if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_avx; if( cpu&X264_CPU_XOP ) pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop; + + if( cpu&X264_CPU_AVX2 ) + { + pf->hpel_filter = x264_hpel_filter_avx2; + pf->mc_chroma = x264_mc_chroma_avx2; + pf->weight = x264_mc_weight_wtab_avx2; + pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx2; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx2; + pf->integral_init8v = x264_integral_init8v_avx2; + pf->integral_init4v = x264_integral_init4v_avx2; + pf->integral_init8h = x264_integral_init8h_avx2; + pf->integral_init4h = x264_integral_init4h_avx2; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2; + } #endif // HIGH_BIT_DEPTH if( !(cpu&X264_CPU_AVX) ) return; + pf->memzero_aligned = x264_memzero_aligned_avx; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx; if( cpu&X264_CPU_FMA4 ) @@ -761,6 +819,7 @@ if( !(cpu&X264_CPU_AVX2) ) return; + pf->get_ref = get_ref_avx2; if( cpu&X264_CPU_FMA3 ) pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3;
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/pixel-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/pixel-a.asm
Changed
@@ -32,8 +32,17 @@ %include "x86util.asm" SECTION_RODATA 32 +hmul_16p: times 16 db 1 + times 8 db 1, -1 +hmul_8p: times 8 db 1 + times 4 db 1, -1 + times 8 db 1 + times 4 db 1, -1 mask_ff: times 16 db 0xff times 16 db 0 +mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1 +mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1 +mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1 %if BIT_DEPTH == 10 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63 @@ -46,12 +55,7 @@ ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 %endif -mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1 -mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1 -mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1 hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 -hmul_8p: times 8 db 1 - times 4 db 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 pb_pppm: times 4 db 1,1,1,-1 @@ -85,6 +89,7 @@ intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1 intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0 +ALIGN 32 intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5 intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4 intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1 @@ -120,9 +125,29 @@ transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 sw_f0: dq 0xfff0, 0 -sq_0f: dq 0xffffffff, 0 pd_f0: times 4 dd 0xffff0000 +pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7 + +ads_mvs_shuffle: +%macro ADS_MVS_SHUFFLE 8 + %assign y x + %rep 8 + %rep 7 + %rotate (~y)&1 + %assign y y>>((~y)&1) + %endrep + db %1*2, %1*2+1 + %rotate 1 + %assign y y>>1 + %endrep +%endmacro +%assign x 0 +%rep 256 + ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7 +%assign x x+1 +%endrep + SECTION .text cextern pb_0 @@ -136,7 +161,9 @@ cextern pw_ppmmppmm cextern pw_pmpmpmpm cextern pw_pmmpzzzz +cextern pd_1 cextern hsub_mul +cextern popcnt_table ;============================================================================= ; SSD @@ -144,69 +171,67 @@ %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; int pixel_ssd_MxN( uint16_t *, intptr_t, uint16_t *, intptr_t ) +; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SSD_ONE 2 -cglobal pixel_ssd_%1x%2, 4,5,6 - mov r4, %1*%2/mmsize +cglobal pixel_ssd_%1x%2, 4,7,6 + FIX_STRIDES r1, r3 +%if mmsize == %1*2 + %define offset0_1 r1 + %define offset0_2 r1*2 + %define offset0_3 r5 + %define offset1_1 r3 + %define offset1_2 r3*2 + %define offset1_3 r6 + lea r5, [3*r1] + lea r6, [3*r3] +%elif mmsize == %1 + %define offset0_1 mmsize + %define offset0_2 r1 + %define offset0_3 r1+mmsize + %define offset1_1 mmsize + %define offset1_2 r3 + %define offset1_3 r3+mmsize +%elif mmsize == %1/2 + %define offset0_1 mmsize + %define offset0_2 mmsize*2 + %define offset0_3 mmsize*3 + %define offset1_1 mmsize + %define offset1_2 mmsize*2 + %define offset1_3 mmsize*3 +%endif + %assign %%n %2/(2*mmsize/%1) +%if %%n > 1 + mov r4d, %%n +%endif pxor m0, m0 .loop mova m1, [r0] -%if %1 <= mmsize/2 - mova m3, [r0+r1*2] - %define offset r3*2 - %define num_rows 2 -%else - mova m3, [r0+mmsize] - %define offset mmsize - %define num_rows 1 -%endif - lea r0, [r0+r1*2*num_rows] + mova m2, [r0+offset0_1] + mova m3, [r0+offset0_2] + mova m4, [r0+offset0_3] psubw m1, [r2] - psubw m3, [r2+offset] - lea r2, [r2+r3*2*num_rows] + psubw m2, [r2+offset1_1] + psubw m3, [r2+offset1_2] + psubw m4, 
[r2+offset1_3] +%if %%n > 1 + lea r0, [r0+r1*(%2/%%n)] + lea r2, [r2+r3*(%2/%%n)] +%endif pmaddwd m1, m1 + pmaddwd m2, m2 pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 paddd m0, m1 paddd m0, m3 - dec r4 +%if %%n > 1 + dec r4d jg .loop +%endif HADDD m0, m5 - movd eax, m0 - RET -%endmacro - -%macro SSD_16_MMX 2 -cglobal pixel_ssd_%1x%2, 4,5 - mov r4, %1*%2/mmsize/2 - pxor m0, m0 -.loop - mova m1, [r0] - mova m2, [r2] - mova m3, [r0+mmsize] - mova m4, [r2+mmsize] - mova m5, [r0+mmsize*2] - mova m6, [r2+mmsize*2] - mova m7, [r0+mmsize*3] - psubw m1, m2 - psubw m3, m4 - mova m2, [r2+mmsize*3] - psubw m5, m6 - pmaddwd m1, m1 - psubw m7, m2 - pmaddwd m3, m3 - pmaddwd m5, m5 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - pmaddwd m7, m7 - paddd m1, m3 - paddd m5, m7 - paddd m0, m1 - paddd m0, m5 - dec r4 - jg .loop - HADDD m0, m7 - movd eax, m0 + movd eax, xm0 RET %endmacro @@ -217,14 +242,17 @@ SSD_ONE 8, 4 SSD_ONE 8, 8 SSD_ONE 8, 16 -SSD_16_MMX 16, 8 -SSD_16_MMX 16, 16 +SSD_ONE 16, 8 +SSD_ONE 16, 16 INIT_XMM sse2 SSD_ONE 8, 4 SSD_ONE 8, 8 SSD_ONE 8, 16 SSD_ONE 16, 8 SSD_ONE 16, 16 +INIT_YMM avx2 +SSD_ONE 16, 8 +SSD_ONE 16, 16 %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 @@ -287,6 +315,23 @@ punpcklbw m%2, m%4 %endmacro +%macro LOAD_AVX2 5 + mova xm%1, %3 + vinserti128 m%1, m%1, %4, 1 +%if %5 + lea t0, [t0+2*t1] +%endif +%endmacro + +%macro JOIN_AVX2 7 + mova xm%2, %5 + vinserti128 m%2, m%2, %6, 1 +%if %7 + lea t2, [t2+2*t3] +%endif + SBUTTERFLY bw, %1, %2, %3 +%endmacro + %macro SSD_LOAD_HALF 5 LOAD 1, 2, [t0+%1], [t0+%3], 1 JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1 @@ -409,8 +454,15 @@ %endif dec al jg .loop +%if mmsize==32 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + HADDD xm0, xm1 + movd eax, xm0 +%else HADDD m0, m1 movd eax, m0 +%endif RET %endif %endmacro @@ -462,6 +514,11 @@ SSD 16, 8 SSD 8, 16 SSD 8, 4 +%define LOAD LOAD_AVX2 +%define JOIN JOIN_AVX2 +INIT_YMM avx2 +SSD 16, 16 +SSD 16, 8 %assign function_align 16 %endif ; !HIGH_BIT_DEPTH @@ -500,7 +557,7 @@ psubw m1, [r2+r6+mmsize] PSHUFLW m0, m0, q3120 PSHUFLW m1, m1, q3120 -%if mmsize==16 +%if mmsize >= 16 pshufhw m0, m0, q3120 pshufhw m1, m1, q3120 %endif @@ -510,8 +567,13 @@ paddd m3, m1 add r6, 2*mmsize jl .loopx -%if mmsize==16 ; using HADDD would remove the mmsize/32 part from the - ; equation above, putting the width limit at 8208 +%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled + jz .no_overread + psubd m3, m1 +.no_overread: +%endif +%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the + ; equation above, putting the width limit at 8208 punpckhdq m0, m2, m6 punpckhdq m1, m3, m6 punpckldq m2, m6 @@ -539,9 +601,13 @@ jg .loopy mov r3, r6m mov r4, r7m -%if mmsize==16 - movq [r3], m4 - movhps [r4], m4 +%if mmsize == 32 + vextracti128 xm0, m4, 1 + paddq xm4, xm0 +%endif +%if mmsize >= 16 + movq [r3], xm4 + movhps [r4], xm4 %else ; fixup for mmx2 SBUTTERFLY dq, 4, 5, 0 mova m0, m4 @@ -569,7 +635,7 @@ ;----------------------------------------------------------------------------- %macro SSD_NV12 0 cglobal pixel_ssd_nv12_core, 6,7 - shl r4d, 1 + add r4d, r4d add r0, r4 add r2, r4 pxor m3, m3 @@ -579,10 +645,15 @@ mov r6, r4 neg r6 .loopx: - mova m0, [r0+r6] +%if mmsize == 32 ; only 16-byte alignment is guaranteed + movu m2, [r0+r6] + movu m1, [r2+r6] +%else + mova m2, [r0+r6] mova m1, [r2+r6] - psubusb m0, m1 - psubusb m1, [r0+r6] +%endif + psubusb m0, m2, m1 + psubusb m1, m2 por m0, m1 psrlw m2, m0, 8 pand m0, m5 @@ -592,19 +663,28 @@ paddd m4, m2 add r6, mmsize jl .loopx +%if mmsize == 
32 ; avx2 may overread by 16 bytes, that has to be handled + jz .no_overread + pcmpeqb xm1, xm1 + pandn m0, m1, m0 ; zero the lower half + pandn m2, m1, m2 + psubd m3, m0 + psubd m4, m2 +.no_overread: +%endif add r0, r1 add r2, r3 dec r5d jg .loopy mov r3, r6m mov r4, r7m - mova m5, [sq_0f] HADDD m3, m0 HADDD m4, m0 - pand m3, m5 - pand m4, m5 - movq [r3], m3 - movq [r4], m4 + pxor xm0, xm0 + punpckldq xm3, xm0 + punpckldq xm4, xm0 + movq [r3], xm3 + movq [r4], xm4 RET %endmacro ; SSD_NV12 %endif ; !HIGH_BIT_DEPTH @@ -615,6 +695,8 @@ SSD_NV12 INIT_XMM avx SSD_NV12 +INIT_YMM avx2 +SSD_NV12 ;============================================================================= ; variance @@ -626,7 +708,7 @@ %if HIGH_BIT_DEPTH == 0 %if %1 mova m7, [pw_00ff] -%else +%elif mmsize < 32 pxor m7, m7 ; zero %endif %endif ; !HIGH_BIT_DEPTH @@ -642,12 +724,13 @@ %else ; !HIGH_BIT_DEPTH HADDW m5, m2 %endif ; HIGH_BIT_DEPTH - movd eax, m5 HADDD m6, m1 - movd edx, m6 %if ARCH_X86_64 - shl rdx, 32 - add rax, rdx + punpckldq m5, m6 + movq rax, m5 +%else + movd eax, m5 + movd edx, m6 %endif RET %endmacro @@ -805,16 +888,45 @@ VAR INIT_XMM xop VAR + +INIT_YMM avx2 +cglobal pixel_var_16x16, 2,4,7 + VAR_START 0 + mov r2d, 4 + lea r3, [r1*3] +.loop: + pmovzxbw m0, [r0] + pmovzxbw m3, [r0+r1] + pmovzxbw m1, [r0+r1*2] + pmovzxbw m4, [r0+r3] + lea r0, [r0+r1*4] + VAR_CORE + dec r2d + jg .loop + vextracti128 xm0, m5, 1 + vextracti128 xm1, m6, 1 + paddw xm5, xm0 + paddd xm6, xm1 + HADDW xm5, xm2 + HADDD xm6, xm1 +%if ARCH_X86_64 + punpckldq xm5, xm6 + movq rax, xm5 +%else + movd eax, xm5 + movd edx, xm6 +%endif + RET %endif ; !HIGH_BIT_DEPTH -%macro VAR2_END 1 - HADDW m5, m7 - movd r1d, m5 +%macro VAR2_END 3 + HADDW %2, xm1 + movd r1d, %2 imul r1d, r1d - HADDD m6, m1 + HADDD %3, xm1 shr r1d, %1 - movd eax, m6 - mov [r4], eax + movd eax, %3 + movd [r4], %3 sub eax, r1d ; sqr - (sum * sum >> shift) RET %endmacro @@ -855,7 +967,7 @@ add r2, r3 dec r5d jg .loop - VAR2_END %2 + VAR2_END %2, m5, m6 %endmacro %if ARCH_X86_64 == 0 @@ -893,7 +1005,7 @@ lea r2, [r2+r3*2*SIZEOF_PIXEL] dec r5d jg .loop - VAR2_END %2 + VAR2_END %2, m5, m6 %endmacro INIT_XMM sse2 @@ -942,7 +1054,7 @@ lea r2, [r2+r3*2] dec r5d jg .loop - VAR2_END %2 + VAR2_END %2, m5, m6 %endmacro INIT_XMM ssse3 @@ -952,6 +1064,48 @@ VAR2_8x8_SSSE3 8, 6 VAR2_8x8_SSSE3 16, 7 +%macro VAR2_8x8_AVX2 2 +cglobal pixel_var2_8x%1, 5,6,6 + pxor m3, m3 ; sum + pxor m4, m4 ; sum squared + mova m5, [hsub_mul] + mov r5d, %1/4 +.loop: + movq xm0, [r0] + movq xm1, [r2] + vinserti128 m0, m0, [r0+r1], 1 + vinserti128 m1, m1, [r2+r3], 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + punpcklbw m0, m1 + movq xm1, [r0] + movq xm2, [r2] + vinserti128 m1, m1, [r0+r1], 1 + vinserti128 m2, m2, [r2+r3], 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + punpcklbw m1, m2 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + paddw m3, m0 + paddw m3, m1 + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m4, m0 + paddd m4, m1 + dec r5d + jg .loop + vextracti128 xm0, m3, 1 + vextracti128 xm1, m4, 1 + paddw xm3, xm0 + paddd xm4, xm1 + VAR2_END %2, xm3, xm4 +%endmacro + +INIT_YMM avx2 +VAR2_8x8_AVX2 8, 6 +VAR2_8x8_AVX2 16, 7 + %endif ; !HIGH_BIT_DEPTH ;============================================================================= @@ -962,7 +1116,7 @@ %if cpuflag(sse4) ; just use shufps on anything post conroe shufps %1, %2, 0 -%elif cpuflag(ssse3) +%elif cpuflag(ssse3) && notcpuflag(atom) ; join 2x 32 bit and duplicate them ; emulating shufps is faster on conroe punpcklqdq %1, %2 @@ -1023,7 +1177,7 @@ DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 
%endmacro -%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0 +%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] @@ -1059,6 +1213,52 @@ LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5 %endmacro +%macro LOAD_SUMSUB_16x2P_AVX2 9 +; 2*dst, 2*tmp, mul, 4*ptr + vbroadcasti128 m%1, [%6] + vbroadcasti128 m%3, [%7] + vbroadcasti128 m%2, [%8] + vbroadcasti128 m%4, [%9] + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] + LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3 + LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5 +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer + mova xm%3, %6 + mova xm%4, %8 + mova xm%1, %5 + mova xm%2, %7 + vpermq m%3, m%3, q0011 + vpermq m%4, m%4, q0011 + vpermq m%1, m%1, q0011 + vpermq m%2, m%2, q0011 +%endmacro + +%macro LOAD_SUMSUB8_16x2P_AVX2 9 +; 2*dst, 2*tmp, mul, 4*ptr + LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] + LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] + LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + ; in: r4=3*stride1, r5=3*stride2 ; in: %2 = horizontal offset ; in: %3 = whether we need to increment pix1 and pix2 @@ -1080,8 +1280,9 @@ SWAP %%n, 4 %endmacro +; in: %1 = horizontal if 0, vertical if 1 %macro SATD_8x4_SSE 8-9 -%ifidn %1, sse2 +%if %1 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax %else HADAMARD4_V %2, %3, %4, %5, %6 @@ -1095,7 +1296,7 @@ %else SWAP %8, %2 %endif -%ifidn %1, sse2 +%if %1 paddw m%8, m%4 %else HADAMARD 1, max, %3, %5, %6, %7 @@ -1250,21 +1451,43 @@ SATD_4x4_MMX m0, 0, 0 SATD_END_MMX -%macro SATD_START_SSE2 2 -%if cpuflag(ssse3) +%macro SATD_START_SSE2 2-3 0 + FIX_STRIDES r1, r3 +%if HIGH_BIT_DEPTH && %3 + pxor %2, %2 +%elif cpuflag(ssse3) && notcpuflag(atom) +%if mmsize==32 + mova %2, [hmul_16p] +%else mova %2, [hmul_8p] %endif +%endif lea r4, [3*r1] lea r5, [3*r3] pxor %1, %1 %endmacro -%macro SATD_END_SSE2 1 - HADDW %1, m7 +%macro SATD_END_SSE2 1-2 +%if HIGH_BIT_DEPTH + HADDUW %1, xm0 +%if %0 == 2 + paddd %1, %2 +%endif +%else + HADDW %1, xm7 +%endif movd eax, %1 RET %endmacro +%macro SATD_ACCUM 3 +%if HIGH_BIT_DEPTH + HADDUW %1, %2 + paddd %3, %1 + pxor %1, %1 +%endif +%endmacro + %macro BACKUP_POINTERS 0 %if ARCH_X86_64 %if WIN64 @@ -1277,20 +1500,44 @@ %macro RESTORE_AND_INC_POINTERS 0 %if ARCH_X86_64 - lea r0, [r6+8] - lea r2, [r7+8] + lea r0, [r6+8*SIZEOF_PIXEL] + lea r2, [r7+8*SIZEOF_PIXEL] %if WIN64 POP r7 %endif %else mov r0, r0mp mov r2, r2mp - add r0, 8 - add r2, 8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL %endif %endmacro -%macro SATD_4x8_SSE 2 +%macro SATD_4x8_SSE 3 +%if HIGH_BIT_DEPTH + movh m0, [r0+0*r1] + movh m4, [r2+0*r3] + movh m1, [r0+1*r1] + movh m5, [r2+1*r3] + movhps m0, [r0+4*r1] + movhps m4, [r2+4*r3] + movh m2, [r0+2*r1] + movh m6, [r2+2*r3] + psubw m0, m4 + movh m3, [r0+r4] + movh m4, [r2+r5] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + movhps m1, [r0+1*r1] + movhps m5, [r2+1*r3] + movhps m2, [r0+2*r1] + movhps m6, 
[r2+2*r3] + psubw m1, m5 + movhps m3, [r0+r4] + movhps m4, [r2+r5] + psubw m2, m6 + psubw m3, m4 +%else ; !HIGH_BIT_DEPTH movd m4, [r2] movd m5, [r2+r3] movd m6, [r2+2*r3] @@ -1307,7 +1554,7 @@ JDUP m5, m3 movd m3, [r0+2*r1] JDUP m1, m3 -%if cpuflag(ssse3) && %1==1 +%if %1==0 && %2==1 mova m3, [hmul_4p] DIFFOP 0, 4, 1, 5, 3 %else @@ -1325,20 +1572,23 @@ JDUP m5, m4 movd m4, [r0+r1] JDUP m3, m4 -%if cpuflag(ssse3) && %1==1 +%if %1==0 && %2==1 mova m4, [hmul_4p] DIFFOP 2, 6, 3, 5, 4 %else DIFFOP 2, 6, 3, 5, 7 %endif - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 7, %2 +%endif ; HIGH_BIT_DEPTH + SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3 %endmacro ;----------------------------------------------------------------------------- ; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SATDS_SSE2 0 -%if cpuflag(ssse3) +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) + +%if vertical==0 || HIGH_BIT_DEPTH cglobal pixel_satd_4x4, 4, 6, 6 SATD_START_MMX mova m4, [hmul_4p] @@ -1357,55 +1607,57 @@ cglobal pixel_satd_4x8, 4, 6, 8 SATD_START_MMX -%if cpuflag(ssse3) +%if vertical==0 mova m7, [hmul_4p] %endif - SATD_4x8_SSE 0, swap + SATD_4x8_SSE vertical, 0, swap HADDW m7, m1 movd eax, m7 RET cglobal pixel_satd_4x16, 4, 6, 8 SATD_START_MMX -%if cpuflag(ssse3) +%if vertical==0 mova m7, [hmul_4p] %endif - SATD_4x8_SSE 0, swap - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - SATD_4x8_SSE 1, add + SATD_4x8_SSE vertical, 0, swap + lea r0, [r0+r1*2*SIZEOF_PIXEL] + lea r2, [r2+r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add HADDW m7, m1 movd eax, m7 RET cglobal pixel_satd_8x8_internal - LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 %%pixel_satd_8x4_internal: - LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 ret -%if UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same +; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers) +; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge) +%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx) cglobal pixel_satd_16x4_internal LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11 lea r2, [r2+4*r3] lea r0, [r0+4*r1] - ; FIXME: this doesn't really mean ssse3, but rather selects between two different behaviors implemented with sse2? 
- SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10 - SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10 + ; always use horizontal mode here + SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10 + SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10 ret cglobal pixel_satd_16x8, 4,6,12 SATD_START_SSE2 m10, m7 -%if notcpuflag(ssse3) +%if vertical mova m7, [pw_00ff] %endif jmp %%pixel_satd_16x8_internal cglobal pixel_satd_16x16, 4,6,12 SATD_START_SSE2 m10, m7 -%if notcpuflag(ssse3) +%if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal @@ -1424,14 +1676,15 @@ SATD_END_SSE2 m6 cglobal pixel_satd_16x16, 4,6,8 - SATD_START_SSE2 m6, m7 + SATD_START_SSE2 m6, m7, 1 BACKUP_POINTERS call pixel_satd_8x8_internal call pixel_satd_8x8_internal + SATD_ACCUM m6, m0, m7 RESTORE_AND_INC_POINTERS call pixel_satd_8x8_internal call pixel_satd_8x8_internal - SATD_END_SSE2 m6 + SATD_END_SSE2 m6, m7 %endif cglobal pixel_satd_8x16, 4,6,8 @@ -1468,11 +1721,8 @@ %endmacro %macro SA8D 0 -%if HIGH_BIT_DEPTH - %define vertical 1 -%else ; sse2 doesn't seem to like the horizontal way of doing things - %define vertical (cpuflags == cpuflags_sse2) -%endif +; sse2 doesn't seem to like the horizontal way of doing things +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) %if ARCH_X86_64 ;----------------------------------------------------------------------------- @@ -1679,6 +1929,170 @@ %endmacro ; SA8D ;============================================================================= +; SA8D_SATD +;============================================================================= + +; %1: vertical/horizontal mode +; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9) +; m10: satd result +; m6, m11-15: tmp regs +%macro SA8D_SATD_8x4 5 +%if %1 + LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 + HADAMARD 0, sumsub, %2, %3, 6 + HADAMARD 0, sumsub, %4, %5, 6 + SBUTTERFLY wd, %2, %3, 6 + SBUTTERFLY wd, %4, %5, 6 + HADAMARD2_2D %2, %4, %3, %5, 6, dq + + mova m12, m%2 + mova m13, m%3 + mova m14, m%4 + mova m15, m%5 + HADAMARD 0, sumsub, %2, %3, 6 + HADAMARD 0, sumsub, %4, %5, 6 + SBUTTERFLY qdq, 12, 13, 6 + HADAMARD 0, amax, 12, 13, 6 + SBUTTERFLY qdq, 14, 15, 6 + paddw m10, m12 + HADAMARD 0, amax, 14, 15, 6 + paddw m10, m14 +%else + LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 + HADAMARD4_V %2, %3, %4, %5, 6 + + pabsw m12, m%2 ; doing the abs first is a slight advantage + pabsw m14, m%4 + pabsw m13, m%3 + pabsw m15, m%5 + HADAMARD 1, max, 12, 14, 6, 11 + paddw m10, m12 + HADAMARD 1, max, 13, 15, 6, 11 + paddw m10, m13 +%endif +%endmacro ; SA8D_SATD_8x4 + +; %1: add spilled regs? +; %2: spill regs? 
+%macro SA8D_SATD_ACCUM 2 +%if HIGH_BIT_DEPTH + pmaddwd m10, [pw_1] + HADDUWD m0, m1 +%if %1 + paddd m10, temp1 + paddd m0, temp0 +%endif +%if %2 + mova temp1, m10 + pxor m10, m10 +%endif +%elif %1 + paddw m0, temp0 +%endif +%if %2 + mova temp0, m0 +%endif +%endmacro + +%macro SA8D_SATD 0 +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) +cglobal pixel_sa8d_satd_8x8_internal + SA8D_SATD_8x4 vertical, 0, 1, 2, 3 + SA8D_SATD_8x4 vertical, 4, 5, 8, 9 + +%if vertical ; sse2-style + HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax + HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax +%else ; complete sa8d + SUMSUB_BADC w, 0, 4, 1, 5, 12 + HADAMARD 2, sumsub, 0, 4, 12, 11 + HADAMARD 2, sumsub, 1, 5, 12, 11 + SUMSUB_BADC w, 2, 8, 3, 9, 12 + HADAMARD 2, sumsub, 2, 8, 12, 11 + HADAMARD 2, sumsub, 3, 9, 12, 11 + HADAMARD 1, amax, 0, 4, 12, 11 + HADAMARD 1, amax, 1, 5, 12, 4 + HADAMARD 1, amax, 2, 8, 12, 4 + HADAMARD 1, amax, 3, 9, 12, 4 +%endif + + ; create sa8d sub results + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + + SAVE_MM_PERMUTATION + ret + +;------------------------------------------------------------------------------- +; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t ) +;------------------------------------------------------------------------------- +cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize + %define temp0 [rsp+0*mmsize] + %define temp1 [rsp+1*mmsize] + FIX_STRIDES r1, r3 +%if vertical==0 + mova m7, [hmul_8p] +%endif + lea r4, [3*r1] + lea r5, [3*r3] + pxor m10, m10 + +%if mmsize==32 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 0, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 0 + vextracti128 xm1, m0, 1 + vextracti128 xm2, m10, 1 + paddw xm0, xm1 + paddw xm10, xm2 +%else + lea r6, [r2+8*SIZEOF_PIXEL] + lea r7, [r0+8*SIZEOF_PIXEL] + + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 0, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 1 + + mov r0, r7 + mov r2, r6 + + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 0 +%endif + +; xop already has fast horizontal sums +%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0 + pmaddwd xm10, [pw_1] + HADDUWD xm0, xm1 + phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2 + pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1 + paddd xm0, xm1 ; sa8d sa8d satd satd + movd r0d, xm0 + pextrd eax, xm0, 2 +%else +%if HIGH_BIT_DEPTH + HADDD xm0, xm1 + HADDD xm10, xm2 +%else + HADDUW xm0, xm1 + HADDW xm10, xm2 +%endif + movd r0d, xm0 + movd eax, xm10 +%endif + add r0d, 1 + shl rax, 32 + shr r0d, 1 + or rax, r0 + RET +%endmacro ; SA8D_SATD + +;============================================================================= ; INTRA SATD ;============================================================================= @@ -1913,15 +2327,16 @@ ; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal intra_satd_x3_4x4, 3,3 -%if ARCH_X86_64 +%if UNIX64 ; stack is 16 byte aligned because abi says so %define top_1d rsp-8 ; size 8 %define left_1d rsp-16 ; size 8 %else - ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned - SUB esp, 16 - %define top_1d esp+8 - %define left_1d esp + ; WIN64: stack is 16 byte aligned because abi says so + ; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned + SUB rsp, 16 + %define 
top_1d rsp+8 + %define left_1d rsp %endif call hadamard_load @@ -1943,8 +2358,8 @@ movd [r2+0], m0 ; i4x4_v satd movd [r2+4], m4 ; i4x4_h satd movd [r2+8], m5 ; i4x4_dc satd -%if ARCH_X86_64 == 0 - ADD esp, 16 +%if UNIX64 == 0 + ADD rsp, 16 %endif RET @@ -2526,7 +2941,7 @@ psubw m1, m9 psubw m2, m10 psubw m3, m11 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 13, 14, 0, swap + SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap pmaddwd m0, [pw_1] %if cpuflag(sse4) pshufd m1, m0, q0032 @@ -2634,7 +3049,7 @@ psubw m2, [fenc_buf+0x20] .satd_8x4b: psubw m3, [fenc_buf+0x30] - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 0, swap + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap pmaddwd m0, [pw_1] %if cpuflag(sse4) pshufd m1, m0, q0032 @@ -3574,12 +3989,12 @@ %define spill2 m10 %else %define spill0 [rsp+gprsize] - %define spill1 [rsp+gprsize+16] - %define spill2 [rsp+gprsize+32] + %define spill1 [rsp+gprsize+mmsize] + %define spill2 [rsp+gprsize+mmsize*2] %endif %if HIGH_BIT_DEPTH %define vertical 1 -%elif cpuflag(ssse3) +%elif cpuflag(ssse3) && notcpuflag(atom) %define vertical 0 ;LOAD_INC loads sumsubs mova m7, [hmul_8p] @@ -3638,17 +4053,14 @@ AC_PADD m1, m2, [pw_1] ABSW m2, m7, m7 AC_PADD m1, m3, [pw_1] - mova m3, m7 AC_PADD m1, m2, [pw_1] - mova m2, m6 + paddw m3, m7, spill2 psubw m7, spill2 - paddw m3, spill2 - mova [rsp+gprsize+32], m1 ; save satd - mova m1, m5 + mova [rsp+gprsize+mmsize*2], m1 ; save satd + paddw m2, m6, spill1 psubw m6, spill1 - paddw m2, spill1 + paddw m1, m5, spill0 psubw m5, spill0 - paddw m1, spill0 %assign %%x 2 %if vertical %assign %%x 4 @@ -3672,15 +4084,17 @@ ABSW m0, m0, m7 AC_PADD m2, m4, [pw_1] AC_PADD m2, m0, [pw_1] - mova [rsp+gprsize+16], m2 ; save sa8d + mova [rsp+gprsize+mmsize], m2 ; save sa8d SWAP 0, 2 SAVE_MM_PERMUTATION ret HADAMARD_AC_WXH_SSE2 16, 16 -HADAMARD_AC_WXH_SSE2 8, 16 HADAMARD_AC_WXH_SSE2 16, 8 +%if mmsize <= 16 +HADAMARD_AC_WXH_SSE2 8, 16 HADAMARD_AC_WXH_SSE2 8, 8 +%endif %endmacro ; HADAMARD_AC_SSE2 %macro HADAMARD_AC_WXH_SUM_SSE2 2 @@ -3697,62 +4111,69 @@ paddd m1, [rsp+8*mmsize] psrld m0, 1 %endif - HADDD m0, m2 - HADDD m1, m3 + HADDD xm0, xm2 + HADDD xm1, xm3 %else ; !HIGH_BIT_DEPTH -%if %1*%2 >= 128 +%if %1*%2*16/mmsize >= 128 paddusw m0, [rsp+3*mmsize] paddusw m1, [rsp+4*mmsize] %endif -%if %1*%2 == 256 +%if %1*%2*16/mmsize == 256 paddusw m0, [rsp+5*mmsize] paddusw m1, [rsp+6*mmsize] paddusw m0, [rsp+7*mmsize] paddusw m1, [rsp+8*mmsize] psrlw m0, 1 %endif - HADDUW m0, m2 - HADDW m1, m3 +%if mmsize==32 + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + paddusw xm0, xm2 + paddusw xm1, xm3 +%endif + HADDUW xm0, xm2 + HADDW xm1, xm3 %endif ; HIGH_BIT_DEPTH %endmacro ; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride ) %macro HADAMARD_AC_WXH_SSE2 2 -cglobal pixel_hadamard_ac_%1x%2, 2,3,11 - %assign pad 16-gprsize-(stack_offset&15) +cglobal pixel_hadamard_ac_%1x%2, 2,4,11 %define ysub r1 FIX_STRIDES r1 - sub rsp, 48+pad - lea r2, [r1*3] + mov r3, rsp + and rsp, ~(mmsize-1) + sub rsp, mmsize*3 + lea r2, [r1*3] call hadamard_ac_8x8 %if %2==16 %define ysub r2 - lea r0, [r0+r1*4] - sub rsp, 32 + lea r0, [r0+r1*4] + sub rsp, mmsize*2 call hadamard_ac_8x8 %endif -%if %1==16 +%if %1==16 && mmsize <= 16 neg ysub - sub rsp, 32 - lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] + sub rsp, mmsize*2 + lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] neg ysub call hadamard_ac_8x8 %if %2==16 - lea r0, [r0+r1*4] - sub rsp, 32 + lea r0, [r0+r1*4] + sub rsp, mmsize*2 call hadamard_ac_8x8 %endif %endif HADAMARD_AC_WXH_SUM_SSE2 %1, %2 - movd edx, m0 - movd eax, m1 - shr edx, 2 - 
(%1*%2 >> 8) + movd edx, xm0 + movd eax, xm1 + shr edx, 2 - (%1*%2*16/mmsize >> 8) shr eax, 1 %if ARCH_X86_64 shl rdx, 32 add rax, rdx %endif - add rsp, 16+%1*%2/2+pad + mov rsp, r3 RET %endmacro ; HADAMARD_AC_WXH_SSE2 @@ -3775,6 +4196,9 @@ INIT_XMM sse2 SA8D SATDS_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif %if HIGH_BIT_DEPTH == 0 INTRA_SA8D_SSE2 %endif @@ -3783,6 +4207,16 @@ INIT_XMM sse2 HADAMARD_AC_SSE2 +%if HIGH_BIT_DEPTH == 0 +INIT_XMM ssse3,atom +SATDS_SSE2 +SA8D +HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif +%endif + %define DIFFOP DIFF_SUMSUB_SSSE3 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE %if HIGH_BIT_DEPTH == 0 @@ -3794,6 +4228,9 @@ SATDS_SSE2 SA8D HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif %if HIGH_BIT_DEPTH == 0 INTRA_X9 INTRA8_X9 @@ -3812,14 +4249,23 @@ SATDS_SSE2 SA8D HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif %if HIGH_BIT_DEPTH == 0 INTRA_X9 INTRA8_X9 %endif +; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so +; it's effectively free. +%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE INIT_XMM avx SATDS_SSE2 SA8D +%if ARCH_X86_64 +SA8D_SATD +%endif %if HIGH_BIT_DEPTH == 0 INTRA_X9 INTRA8_X9 @@ -3830,12 +4276,331 @@ INIT_XMM xop SATDS_SSE2 SA8D +%if ARCH_X86_64 +SA8D_SATD +%endif %if HIGH_BIT_DEPTH == 0 INTRA_X9 ; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why. %endif HADAMARD_AC_SSE2 + +%if HIGH_BIT_DEPTH == 0 +%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2 +%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2 +%define TRANS TRANS_SSE4 +INIT_YMM avx2 +HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif + +%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul] + movq xm%1, [r0] + movq xm%3, [r2] + movq xm%2, [r0+r1] + movq xm%4, [r2+r3] + vinserti128 m%1, m%1, [r0+4*r1], 1 + vinserti128 m%3, m%3, [r2+4*r3], 1 + vinserti128 m%2, m%2, [r0+r4], 1 + vinserti128 m%4, m%4, [r2+r5], 1 + punpcklqdq m%1, m%1 + punpcklqdq m%3, m%3 + punpcklqdq m%2, m%2 + punpcklqdq m%4, m%4 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + + movq xm%3, [r0] + movq xm%5, [r2] + movq xm%4, [r0+r1] + movq xm%6, [r2+r3] + vinserti128 m%3, m%3, [r0+4*r1], 1 + vinserti128 m%5, m%5, [r2+4*r3], 1 + vinserti128 m%4, m%4, [r0+r4], 1 + vinserti128 m%6, m%6, [r2+r5], 1 + punpcklqdq m%3, m%3 + punpcklqdq m%5, m%5 + punpcklqdq m%4, m%4 + punpcklqdq m%6, m%6 + DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7 +%endmacro + +%macro SATD_START_AVX2 2-3 0 + FIX_STRIDES r1, r3 +%if %3 + mova %2, [hmul_8p] + lea r4, [5*r1] + lea r5, [5*r3] +%else + mova %2, [hmul_16p] + lea r4, [3*r1] + lea r5, [3*r3] +%endif + pxor %1, %1 +%endmacro + +%define TRANS TRANS_SSE4 +INIT_YMM avx2 +cglobal pixel_satd_16x8_internal + LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_16x16, 4,6,8 + SATD_START_AVX2 m6, m7 + call pixel_satd_16x8_internal + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +pixel_satd_16x8_internal: + call pixel_satd_16x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_satd_16x8, 4,6,8 + SATD_START_AVX2 m6, m7 + jmp pixel_satd_16x8_internal + +cglobal pixel_satd_8x8_internal + LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_8x16, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_satd_8x8_internal + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call pixel_satd_8x8_internal + 
vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_satd_8x8, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_satd_8x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_sa8d_8x8_internal + LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 + HADAMARD4_V 0, 1, 2, 3, 4 + HADAMARD 8, sumsub, 0, 1, 4, 5 + HADAMARD 8, sumsub, 2, 3, 4, 5 + HADAMARD 2, sumsub, 0, 1, 4, 5 + HADAMARD 2, sumsub, 2, 3, 4, 5 + HADAMARD 1, amax, 0, 1, 4, 5 + HADAMARD 1, amax, 2, 3, 4, 5 + paddw m6, m0 + paddw m6, m2 + ret + +cglobal pixel_sa8d_8x8, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_sa8d_8x8_internal + vextracti128 xm1, m6, 1 + paddw xm6, xm1 + HADDW xm6, xm1 + movd eax, xm6 + add eax, 1 + shr eax, 1 + RET + +cglobal intra_sad_x9_8x8, 5,7,8 + %define pred(i,j) [rsp+i*0x40+j*0x20] + + mov r6, rsp + and rsp, ~31 + sub rsp, 0x240 + movu m5, [r0+0*FENC_STRIDE] + movu m6, [r0+4*FENC_STRIDE] + punpcklqdq m5, [r0+2*FENC_STRIDE] + punpcklqdq m6, [r0+6*FENC_STRIDE] + + ; save instruction size: avoid 4-byte memory offsets + lea r0, [intra8x9_h1+128] + %define off(m) (r0+m-(intra8x9_h1+128)) + + vpbroadcastq m0, [r2+16] + psadbw m4, m0, m5 + psadbw m2, m0, m6 + mova pred(0,0), m0 + mova pred(0,1), m0 + paddw m4, m2 + + vpbroadcastq m1, [r2+7] + pshufb m3, m1, [off(intra8x9_h1)] + pshufb m2, m1, [off(intra8x9_h3)] + mova pred(1,0), m3 + mova pred(1,1), m2 + psadbw m3, m5 + psadbw m2, m6 + paddw m3, m2 + + lea r5, [rsp+0x100] + %define pred(i,j) [r5+i*0x40+j*0x20-0x100] + + ; combine the first two + pslldq m3, 2 + por m4, m3 + + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + psrlw m0, 3 + pavgw m0, m2 + pshufb m0, m2 + mova pred(2,0), m0 + mova pred(2,1), m0 + psadbw m3, m0, m5 + psadbw m2, m0, m6 + paddw m3, m2 + + pslldq m3, 4 + por m4, m3 + + vbroadcasti128 m0, [r2+16] + vbroadcasti128 m2, [r2+17] + pslldq m1, m0, 1 + pavgb m3, m0, m2 + PRED4x4_LOWPASS m0, m1, m2, m0, m7 + pshufb m1, m0, [off(intra8x9_ddl1)] + pshufb m2, m0, [off(intra8x9_ddl3)] + mova pred(3,0), m1 + mova pred(3,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 6 + por m4, m1 + vextracti128 xm1, m4, 1 + paddw xm4, xm1 + mova [r4], xm4 + + ; for later + vinserti128 m7, m3, xm0, 1 + + vbroadcasti128 m2, [r2+8] + vbroadcasti128 m0, [r2+7] + vbroadcasti128 m1, [r2+6] + pavgb m3, m2, m0 + PRED4x4_LOWPASS m0, m1, m2, m0, m4 + pshufb m1, m0, [off(intra8x9_ddr1)] + pshufb m2, m0, [off(intra8x9_ddr3)] + mova pred(4,0), m1 + mova pred(4,1), m2 + psadbw m4, m1, m5 + psadbw m2, m6 + paddw m4, m2 + + add r0, 256 + add r5, 0xC0 + %define off(m) (r0+m-(intra8x9_h1+256+128)) + %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0] + + vpblendd m2, m3, m0, 11110011b + pshufb m1, m2, [off(intra8x9_vr1)] + pshufb m2, m2, [off(intra8x9_vr3)] + mova pred(5,0), m1 + mova pred(5,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 2 + por m4, m1 + + psrldq m2, m3, 4 + pblendw m2, m0, q3330 + punpcklbw m0, m3 + pshufb m1, m2, [off(intra8x9_hd1)] + pshufb m2, m0, [off(intra8x9_hd3)] + mova pred(6,0), m1 + mova pred(6,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 4 + por m4, m1 + + pshufb m1, m7, [off(intra8x9_vl1)] + pshufb m2, m7, [off(intra8x9_vl3)] + mova pred(7,0), m1 + mova pred(7,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 6 + por m4, m1 + vextracti128 xm1, m4, 1 + paddw xm4, xm1 + mova xm3, [r4] + SBUTTERFLY qdq, 3, 4, 7 + paddw xm3, xm4 + + pslldq m1, m0, 1 + vpbroadcastd m0, [r2+7] + palignr m0, m1, 1 + pshufb m1, 
m0, [off(intra8x9_hu1)] + pshufb m2, m0, [off(intra8x9_hu3)] + mova pred(8,0), m1 + mova pred(8,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + vextracti128 xm2, m1, 1 + paddw xm1, xm2 + movhlps xm2, xm1 + paddw xm1, xm2 + movd r2d, xm1 + + paddw xm3, [r3] + mova [r4], xm3 + add r2w, word [r3+16] + mov [r4+16], r2w + + phminposuw xm3, xm3 + movd r3d, xm3 + add r2d, 8<<16 + cmp r3w, r2w + cmovg r3d, r2d + + mov r2d, r3d + shr r3, 16 + shl r3, 6 + add r1, 4*FDEC_STRIDE + mova xm0, [rsp+r3+0x00] + mova xm1, [rsp+r3+0x10] + mova xm2, [rsp+r3+0x20] + mova xm3, [rsp+r3+0x30] + movq [r1+FDEC_STRIDE*-4], xm0 + movhps [r1+FDEC_STRIDE*-2], xm0 + movq [r1+FDEC_STRIDE*-3], xm1 + movhps [r1+FDEC_STRIDE*-1], xm1 + movq [r1+FDEC_STRIDE* 0], xm2 + movhps [r1+FDEC_STRIDE* 2], xm2 + movq [r1+FDEC_STRIDE* 1], xm3 + movhps [r1+FDEC_STRIDE* 3], xm3 + mov rsp, r6 + mov eax, r2d + RET +%endif ; HIGH_BIT_DEPTH + ;============================================================================= ; SSIM ;============================================================================= @@ -4074,13 +4839,13 @@ %macro ADS_START 0 %if UNIX64 - movsxd r5, r5d + movsxd r5, r5d %else - mov r5d, r5m + mov r5d, r5m %endif - mov r0d, r5d - lea r6, [r4+r5+15] - and r6, ~15; + mov r0d, r5d + lea r6, [r4+r5+(mmsize-1)] + and r6, ~(mmsize-1) shl r2d, 1 %endmacro @@ -4088,10 +4853,19 @@ add r1, 8*%1 add r3, 8*%1 add r6, 4*%1 - sub r0d, 4*%1 + sub r0d, 4*%1 jg .loop WIN64_RESTORE_XMM rsp - jmp ads_mvs +%if mmsize==32 + vzeroupper +%endif + lea r6, [r4+r5+(mmsize-1)] + and r6, ~(mmsize-1) +%if cpuflag(ssse3) + jmp ads_mvs_ssse3 +%else + jmp ads_mvs_mmx +%endif %endmacro ;----------------------------------------------------------------------------- @@ -4100,192 +4874,226 @@ ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal pixel_ads4, 5,7 - movq mm6, [r0] - movq mm4, [r0+8] - pshufw mm7, mm6, 0 - pshufw mm6, mm6, q2222 - pshufw mm5, mm4, 0 - pshufw mm4, mm4, q2222 + mova m6, [r0] + mova m4, [r0+8] + pshufw m7, m6, 0 + pshufw m6, m6, q2222 + pshufw m5, m4, 0 + pshufw m4, m4, q2222 ADS_START .loop: - movq mm0, [r1] - movq mm1, [r1+16] - psubw mm0, mm7 - psubw mm1, mm6 - ABSW mm0, mm0, mm2 - ABSW mm1, mm1, mm3 - movq mm2, [r1+r2] - movq mm3, [r1+r2+16] - psubw mm2, mm5 - psubw mm3, mm4 - paddw mm0, mm1 - ABSW mm2, mm2, mm1 - ABSW mm3, mm3, mm1 - paddw mm0, mm2 - paddw mm0, mm3 - pshufw mm1, r6m, 0 - paddusw mm0, [r3] - psubusw mm1, mm0 - packsswb mm1, mm1 - movd [r6], mm1 + movu m0, [r1] + movu m1, [r1+16] + psubw m0, m7 + psubw m1, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + movu m2, [r1+r2] + movu m3, [r1+r2+16] + psubw m2, m5 + psubw m3, m4 + paddw m0, m1 + ABSW m2, m2, m1 + ABSW m3, m3, m1 + paddw m0, m2 + paddw m0, m3 + pshufw m1, r6m, 0 + paddusw m0, [r3] + psubusw m1, m0 + packsswb m1, m1 + movd [r6], m1 ADS_END 1 cglobal pixel_ads2, 5,7 - movq mm6, [r0] - pshufw mm5, r6m, 0 - pshufw mm7, mm6, 0 - pshufw mm6, mm6, q2222 + mova m6, [r0] + pshufw m5, r6m, 0 + pshufw m7, m6, 0 + pshufw m6, m6, q2222 ADS_START .loop: - movq mm0, [r1] - movq mm1, [r1+r2] - psubw mm0, mm7 - psubw mm1, mm6 - ABSW mm0, mm0, mm2 - ABSW mm1, mm1, mm3 - paddw mm0, mm1 - paddusw mm0, [r3] - movq mm4, mm5 - psubusw mm4, mm0 - packsswb mm4, mm4 - movd [r6], mm4 + movu m0, [r1] + movu m1, [r1+r2] + psubw m0, m7 + psubw m1, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + paddw m0, m1 + paddusw m0, [r3] + mova m4, m5 + psubusw m4, m0 + packsswb m4, m4 + movd [r6], m4 ADS_END 1 cglobal pixel_ads1, 5,7 - pshufw mm7, [r0], 0 
- pshufw mm6, r6m, 0 + pshufw m7, [r0], 0 + pshufw m6, r6m, 0 ADS_START .loop: - movq mm0, [r1] - movq mm1, [r1+8] - psubw mm0, mm7 - psubw mm1, mm7 - ABSW mm0, mm0, mm2 - ABSW mm1, mm1, mm3 - paddusw mm0, [r3] - paddusw mm1, [r3+8] - movq mm4, mm6 - movq mm5, mm6 - psubusw mm4, mm0 - psubusw mm5, mm1 - packsswb mm4, mm5 - movq [r6], mm4 + movu m0, [r1] + movu m1, [r1+8] + psubw m0, m7 + psubw m1, m7 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + paddusw m0, [r3] + paddusw m1, [r3+8] + mova m4, m6 + mova m5, m6 + psubusw m4, m0 + psubusw m5, m1 + packsswb m4, m5 + mova [r6], m4 ADS_END 2 %macro ADS_XMM 0 +%if mmsize==32 +cglobal pixel_ads4, 5,7,8 + vpbroadcastw m7, [r0+ 0] + vpbroadcastw m6, [r0+ 4] + vpbroadcastw m5, [r0+ 8] + vpbroadcastw m4, [r0+12] +%else cglobal pixel_ads4, 5,7,12 - movdqa xmm4, [r0] - pshuflw xmm7, xmm4, 0 - pshuflw xmm6, xmm4, q2222 - pshufhw xmm5, xmm4, 0 - pshufhw xmm4, xmm4, q2222 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 - punpckhqdq xmm5, xmm5 - punpckhqdq xmm4, xmm4 -%if ARCH_X86_64 - pshuflw xmm8, r6m, 0 - punpcklqdq xmm8, xmm8 + mova m4, [r0] + pshuflw m7, m4, q0000 + pshuflw m6, m4, q2222 + pshufhw m5, m4, q0000 + pshufhw m4, m4, q2222 + punpcklqdq m7, m7 + punpcklqdq m6, m6 + punpckhqdq m5, m5 + punpckhqdq m4, m4 +%endif +%if ARCH_X86_64 && mmsize == 16 + movd m8, r6m + SPLATW m8, m8 ADS_START - movdqu xmm10, [r1] - movdqu xmm11, [r1+r2] + movu m10, [r1] + movu m11, [r1+r2] .loop: - psubw xmm0, xmm10, xmm7 - movdqu xmm10, [r1+16] - psubw xmm1, xmm10, xmm6 - ABSW xmm0, xmm0, xmm2 - ABSW xmm1, xmm1, xmm3 - psubw xmm2, xmm11, xmm5 - movdqu xmm11, [r1+r2+16] - paddw xmm0, xmm1 - psubw xmm3, xmm11, xmm4 - movdqu xmm9, [r3] - ABSW xmm2, xmm2, xmm1 - ABSW xmm3, xmm3, xmm1 - paddw xmm0, xmm2 - paddw xmm0, xmm3 - paddusw xmm0, xmm9 - psubusw xmm1, xmm8, xmm0 - packsswb xmm1, xmm1 - movq [r6], xmm1 + psubw m0, m10, m7 + movu m10, [r1+16] + psubw m1, m10, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + psubw m2, m11, m5 + movu m11, [r1+r2+16] + paddw m0, m1 + psubw m3, m11, m4 + movu m9, [r3] + ABSW m2, m2, m1 + ABSW m3, m3, m1 + paddw m0, m2 + paddw m0, m3 + paddusw m0, m9 + psubusw m1, m8, m0 %else ADS_START .loop: - movdqu xmm0, [r1] - movdqu xmm1, [r1+16] - psubw xmm0, xmm7 - psubw xmm1, xmm6 - ABSW xmm0, xmm0, xmm2 - ABSW xmm1, xmm1, xmm3 - movdqu xmm2, [r1+r2] - movdqu xmm3, [r1+r2+16] - psubw xmm2, xmm5 - psubw xmm3, xmm4 - paddw xmm0, xmm1 - ABSW xmm2, xmm2, xmm1 - ABSW xmm3, xmm3, xmm1 - paddw xmm0, xmm2 - paddw xmm0, xmm3 - movd xmm1, r6m - movdqu xmm2, [r3] - pshuflw xmm1, xmm1, 0 - punpcklqdq xmm1, xmm1 - paddusw xmm0, xmm2 - psubusw xmm1, xmm0 - packsswb xmm1, xmm1 - movq [r6], xmm1 + movu m0, [r1] + movu m1, [r1+16] + psubw m0, m7 + psubw m1, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + movu m2, [r1+r2] + movu m3, [r1+r2+16] + psubw m2, m5 + psubw m3, m4 + paddw m0, m1 + ABSW m2, m2, m1 + ABSW m3, m3, m1 + paddw m0, m2 + paddw m0, m3 + movu m2, [r3] +%if mmsize==32 + vpbroadcastw m1, r6m +%else + movd m1, r6m + pshuflw m1, m1, 0 + punpcklqdq m1, m1 +%endif + paddusw m0, m2 + psubusw m1, m0 %endif ; ARCH - ADS_END 2 + packsswb m1, m1 +%if mmsize==32 + vpermq m1, m1, q3120 + mova [r6], xm1 +%else + movh [r6], m1 +%endif + ADS_END mmsize/8 cglobal pixel_ads2, 5,7,8 - movq xmm6, [r0] - movd xmm5, r6m - pshuflw xmm7, xmm6, 0 - pshuflw xmm6, xmm6, q2222 - pshuflw xmm5, xmm5, 0 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 - punpcklqdq xmm5, xmm5 +%if mmsize==32 + vpbroadcastw m7, [r0+0] + vpbroadcastw m6, [r0+4] + vpbroadcastw m5, r6m +%else + movq m6, [r0] + movd m5, 
r6m + pshuflw m7, m6, 0 + pshuflw m6, m6, q2222 + pshuflw m5, m5, 0 + punpcklqdq m7, m7 + punpcklqdq m6, m6 + punpcklqdq m5, m5 +%endif ADS_START .loop: - movdqu xmm0, [r1] - movdqu xmm1, [r1+r2] - psubw xmm0, xmm7 - psubw xmm1, xmm6 - movdqu xmm4, [r3] - ABSW xmm0, xmm0, xmm2 - ABSW xmm1, xmm1, xmm3 - paddw xmm0, xmm1 - paddusw xmm0, xmm4 - psubusw xmm1, xmm5, xmm0 - packsswb xmm1, xmm1 - movq [r6], xmm1 - ADS_END 2 + movu m0, [r1] + movu m1, [r1+r2] + psubw m0, m7 + psubw m1, m6 + movu m4, [r3] + ABSW m0, m0, m2 + ABSW m1, m1, m3 + paddw m0, m1 + paddusw m0, m4 + psubusw m1, m5, m0 + packsswb m1, m1 +%if mmsize==32 + vpermq m1, m1, q3120 + mova [r6], xm1 +%else + movh [r6], m1 +%endif + ADS_END mmsize/8 cglobal pixel_ads1, 5,7,8 - movd xmm7, [r0] - movd xmm6, r6m - pshuflw xmm7, xmm7, 0 - pshuflw xmm6, xmm6, 0 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 +%if mmsize==32 + vpbroadcastw m7, [r0] + vpbroadcastw m6, r6m +%else + movd m7, [r0] + movd m6, r6m + pshuflw m7, m7, 0 + pshuflw m6, m6, 0 + punpcklqdq m7, m7 + punpcklqdq m6, m6 +%endif ADS_START .loop: - movdqu xmm0, [r1] - movdqu xmm1, [r1+16] - psubw xmm0, xmm7 - psubw xmm1, xmm7 - movdqu xmm2, [r3] - movdqu xmm3, [r3+16] - ABSW xmm0, xmm0, xmm4 - ABSW xmm1, xmm1, xmm5 - paddusw xmm0, xmm2 - paddusw xmm1, xmm3 - psubusw xmm4, xmm6, xmm0 - psubusw xmm5, xmm6, xmm1 - packsswb xmm4, xmm5 - movdqa [r6], xmm4 - ADS_END 4 + movu m0, [r1] + movu m1, [r1+mmsize] + psubw m0, m7 + psubw m1, m7 + movu m2, [r3] + movu m3, [r3+mmsize] + ABSW m0, m0, m4 + ABSW m1, m1, m5 + paddusw m0, m2 + paddusw m1, m3 + psubusw m4, m6, m0 + psubusw m5, m6, m1 + packsswb m4, m5 +%if mmsize==32 + vpermq m4, m4, q3120 +%endif + mova [r6], m4 + ADS_END mmsize/4 %endmacro INIT_XMM sse2 @@ -4294,6 +5102,8 @@ ADS_XMM INIT_XMM avx ADS_XMM +INIT_YMM avx2 +ADS_XMM ; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width ) ; { @@ -4318,11 +5128,9 @@ inc r1d %endmacro -INIT_MMX +INIT_MMX mmx cglobal pixel_ads_mvs, 0,7,0 -ads_mvs: - lea r6, [r4+r5+15] - and r6, ~15; +ads_mvs_mmx: ; mvs = r4 ; masks = r6 ; width = r5 @@ -4364,3 +5172,36 @@ .end: movifnidn eax, r0d RET + +INIT_XMM ssse3 +cglobal pixel_ads_mvs, 0,7,0 +ads_mvs_ssse3: + mova m3, [pw_8] + mova m4, [pw_76543210] + pxor m5, m5 + add r5, r6 + xor r0d, r0d ; nmv + mov [r5], r0d +%ifdef PIC + lea r1, [$$] + %define GLOBAL +r1-$$ +%else + %define GLOBAL +%endif +.loop: + movh m0, [r6] + pcmpeqb m0, m5 + pmovmskb r2d, m0 + xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions) + movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt + add r2d, r2d + ; shuffle counters based on mv mask + pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL] + movu [r4+r0*2], m2 + add r0d, r3d + paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7} + add r6, 8 + cmp r6, r5 + jl .loop + movifnidn eax, r0d + RET
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/pixel.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/pixel.h
Changed
@@ -52,10 +52,12 @@ DECL_X1( sad, sse2_aligned ) DECL_X1( sad, ssse3 ) DECL_X1( sad, ssse3_aligned ) +DECL_X1( sad, avx2 ) DECL_X4( sad, mmx2 ) DECL_X4( sad, sse2 ) DECL_X4( sad, sse3 ) DECL_X4( sad, ssse3 ) +DECL_X4( sad, avx2 ) DECL_X1( ssd, mmx ) DECL_X1( ssd, mmx2 ) DECL_X1( ssd, sse2slow ) @@ -63,18 +65,23 @@ DECL_X1( ssd, ssse3 ) DECL_X1( ssd, avx ) DECL_X1( ssd, xop ) +DECL_X1( ssd, avx2 ) DECL_X1( satd, mmx2 ) DECL_X1( satd, sse2 ) DECL_X1( satd, ssse3 ) +DECL_X1( satd, ssse3_atom ) DECL_X1( satd, sse4 ) DECL_X1( satd, avx ) DECL_X1( satd, xop ) +DECL_X1( satd, avx2 ) DECL_X1( sa8d, mmx2 ) DECL_X1( sa8d, sse2 ) DECL_X1( sa8d, ssse3 ) +DECL_X1( sa8d, ssse3_atom ) DECL_X1( sa8d, sse4 ) DECL_X1( sa8d, avx ) DECL_X1( sa8d, xop ) +DECL_X1( sa8d, avx2 ) DECL_X1( sad, cache32_mmx2 ); DECL_X1( sad, cache64_mmx2 ); DECL_X1( sad, cache64_sse2 ); @@ -88,12 +95,15 @@ DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, ssse3_atom, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, avx2, ( pixel *pix, intptr_t i_stride )) void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * ); @@ -106,16 +116,19 @@ void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8c_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8c_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8c_avx2 ( pixel *, pixel *, int * ); void x264_intra_satd_x3_16x16_mmx2 ( pixel *, pixel *, int * ); void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * ); void x264_intra_sad_x3_16x16_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_16x16_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_16x16_avx2 ( pixel *, pixel *, int * ); void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * ); void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8_avx2 ( uint16_t*, uint16_t*, int * ); int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * ); @@ -129,6 +142,7 @@ int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); +int x264_intra_sad_x9_8x8_avx2 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, 
intptr_t stride2, int width, @@ -139,6 +153,9 @@ void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); +void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1, + pixel *pixuv2, intptr_t stride2, int width, + int height, uint64_t *ssd_u, uint64_t *ssd_v ); void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, intptr_t stride1, const uint8_t *pix2, intptr_t stride2, int sums[2][4] ); void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, intptr_t stride1, @@ -151,17 +168,28 @@ int x264_pixel_var2_8x8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_var2_8x8_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x8_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_var2_8x16_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x16_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_xop ( pixel *src, intptr_t stride, int height ); +int x264_pixel_vsad_avx2 ( uint16_t *src, intptr_t stride, int height ); int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); int x264_pixel_asd8_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); +uint64_t x264_pixel_sa8d_satd_16x16_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_ssse3 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_ssse3_atom( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_avx ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_avx2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); + #define DECL_ADS( size, suffix ) \ int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\ @@ -178,6 +206,9 @@ DECL_ADS( 4, avx ) DECL_ADS( 2, avx ) DECL_ADS( 1, avx ) +DECL_ADS( 4, avx2 ) +DECL_ADS( 2, avx2 ) +DECL_ADS( 1, avx2 ) #undef DECL_PIXELS #undef DECL_X1
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/predict-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/predict-a.asm
Changed
@@ -6,6 +6,7 @@ ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org> ;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -28,13 +29,12 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 -pw_76543210: -pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7 -pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4 -pw_m3: times 8 dw -3 -pw_m7: times 8 dw -7 +pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4 +pw_m3: times 16 dw -3 +pw_m7: times 16 dw -7 pb_00s_ff: times 8 db 0 pb_0s_ff: times 7 db 0 db 0xff @@ -57,109 +57,106 @@ cextern pw_00ff cextern pw_pixel_max -%macro STORE8x8 2-4 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %2 - mova [r0 + 1*FDEC_STRIDEB], %2 - mova [r0 + 2*FDEC_STRIDEB], %2 - mova [r0 + 3*FDEC_STRIDEB], %2 +%macro STORE8 1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + mova [r0+2*FDEC_STRIDEB], %1 + mova [r0+3*FDEC_STRIDEB], %1 %endmacro -%macro STORE8x16 4 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %2 - mova [r0 + -3*FDEC_STRIDEB], %2 - mova [r0 + -2*FDEC_STRIDEB], %2 - mova [r0 + -1*FDEC_STRIDEB], %2 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %3 - mova [r0 + -3*FDEC_STRIDEB], %3 - mova [r0 + -2*FDEC_STRIDEB], %3 - mova [r0 + -1*FDEC_STRIDEB], %3 - mova [r0 + 0*FDEC_STRIDEB], %4 - mova [r0 + 1*FDEC_STRIDEB], %4 - mova [r0 + 2*FDEC_STRIDEB], %4 - mova [r0 + 3*FDEC_STRIDEB], %4 +%macro STORE16 1-4 +%if %0 > 1 + mov r1d, 2*%0 +.loop: + mova [r0+0*FDEC_STRIDEB+0*mmsize], %1 + mova [r0+0*FDEC_STRIDEB+1*mmsize], %2 + mova [r0+1*FDEC_STRIDEB+0*mmsize], %1 + mova [r0+1*FDEC_STRIDEB+1*mmsize], %2 +%ifidn %0, 4 + mova [r0+0*FDEC_STRIDEB+2*mmsize], %3 + mova [r0+0*FDEC_STRIDEB+3*mmsize], %4 + mova [r0+1*FDEC_STRIDEB+2*mmsize], %3 + mova [r0+1*FDEC_STRIDEB+3*mmsize], %4 + add r0, 2*FDEC_STRIDEB +%else ; %0 == 2 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB+0*mmsize], %1 + mova [r0-2*FDEC_STRIDEB+1*mmsize], %2 + mova [r0-1*FDEC_STRIDEB+0*mmsize], %1 + mova [r0-1*FDEC_STRIDEB+1*mmsize], %2 +%endif + dec r1d + jg .loop +%else ; %0 == 1 + STORE8 %1 +%if HIGH_BIT_DEPTH ; Different code paths to reduce code size + add r0, 6*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 +%else + add r0, 8*FDEC_STRIDE + mova [r0-4*FDEC_STRIDE], %1 + mova [r0-3*FDEC_STRIDE], %1 + mova [r0-2*FDEC_STRIDE], %1 + mova [r0-1*FDEC_STRIDE], %1 + mova [r0+0*FDEC_STRIDE], %1 + mova [r0+1*FDEC_STRIDE], %1 + mova [r0+2*FDEC_STRIDE], %1 + mova [r0+3*FDEC_STRIDE], %1 +%endif ; HIGH_BIT_DEPTH +%endif %endmacro -%macro STORE16x16 2-4 -%ifidn %0, 4 - mov r1d, 8 -.loop: - mova [r0 + 0*FDEC_STRIDEB + 0], %1 - 
mova [r0 + 1*FDEC_STRIDEB + 0], %1 - mova [r0 + 0*FDEC_STRIDEB + 8], %2 - mova [r0 + 1*FDEC_STRIDEB + 8], %2 - mova [r0 + 0*FDEC_STRIDEB +16], %3 - mova [r0 + 1*FDEC_STRIDEB +16], %3 - mova [r0 + 0*FDEC_STRIDEB +24], %4 - mova [r0 + 1*FDEC_STRIDEB +24], %4 - add r0, 2*FDEC_STRIDEB - dec r1d - jg .loop +%macro PRED_H_LOAD 2 ; reg, offset +%if cpuflag(avx2) + vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL] +%elif HIGH_BIT_DEPTH + movd %1, [r0+(%2)*FDEC_STRIDEB-4] + SPLATW %1, %1, 1 %else - mov r1d, 4 -.loop: - mova [r0 + 0*FDEC_STRIDE], %1 - mova [r0 + 1*FDEC_STRIDE], %1 - mova [r0 + 2*FDEC_STRIDE], %1 - mova [r0 + 3*FDEC_STRIDE], %1 - mova [r0 + 0*FDEC_STRIDE + 8], %2 - mova [r0 + 1*FDEC_STRIDE + 8], %2 - mova [r0 + 2*FDEC_STRIDE + 8], %2 - mova [r0 + 3*FDEC_STRIDE + 8], %2 - add r0, 4*FDEC_STRIDE - dec r1d - jg .loop + SPLATB_LOAD %1, r0+(%2)*FDEC_STRIDE-1, m2 %endif %endmacro -%macro STORE16x16_SSE2 1-2 -%ifidn %0,2 - mov r1d, 4 -.loop - mova [r0+0*FDEC_STRIDEB+ 0], %1 - mova [r0+0*FDEC_STRIDEB+16], %2 - mova [r0+1*FDEC_STRIDEB+ 0], %1 - mova [r0+1*FDEC_STRIDEB+16], %2 - mova [r0+2*FDEC_STRIDEB+ 0], %1 - mova [r0+2*FDEC_STRIDEB+16], %2 - mova [r0+3*FDEC_STRIDEB+ 0], %1 - mova [r0+3*FDEC_STRIDEB+16], %2 - add r0, 4*FDEC_STRIDEB - dec r1d - jg .loop +%macro PRED_H_STORE 3 ; reg, offset, width +%assign %%w %3*SIZEOF_PIXEL +%if %%w == 8 + movq [r0+(%2)*FDEC_STRIDEB], %1 %else - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %1 - mova [r0 + 1*FDEC_STRIDEB], %1 - mova [r0 + 2*FDEC_STRIDEB], %1 - mova [r0 + 3*FDEC_STRIDEB], %1 - add r0, 8*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %1 - mova [r0 + 1*FDEC_STRIDEB], %1 - mova [r0 + 2*FDEC_STRIDEB], %1 - mova [r0 + 3*FDEC_STRIDEB], %1 + %assign %%i 0 + %rep %%w/mmsize + mova [r0+(%2)*FDEC_STRIDEB+%%i], %1 + %assign %%i %%i+mmsize + %endrep %endif %endmacro +%macro PRED_H_4ROWS 2 ; width, inc_ptr + PRED_H_LOAD m0, 0 + PRED_H_LOAD m1, 1 + PRED_H_STORE m0, 0, %1 + PRED_H_STORE m1, 1, %1 + PRED_H_LOAD m0, 2 +%if %2 + add r0, 4*FDEC_STRIDEB +%endif + PRED_H_LOAD m1, 3-4*%2 + PRED_H_STORE m0, 2-4*%2, %1 + PRED_H_STORE m1, 3-4*%2, %1 +%endmacro + ; dest, left, right, src, tmp ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 %macro PRED8x8_LOWPASS 4-5 @@ -178,6 +175,16 @@ %endmacro ;----------------------------------------------------------------------------- +; void predict_4x4_h( pixel *src ) +;----------------------------------------------------------------------------- +%if HIGH_BIT_DEPTH +INIT_XMM avx2 +cglobal predict_4x4_h, 1,1 + PRED_H_4ROWS 4, 0 + RET +%endif + +;----------------------------------------------------------------------------- ; void predict_4x4_ddl( pixel *src ) ;----------------------------------------------------------------------------- %macro PREDICT_4x4_DDL 0 @@ -755,12 +762,12 @@ %macro PREDICT_8x8_V 0 cglobal predict_8x8_v, 2,2 mova m0, [r1+16*SIZEOF_PIXEL] - STORE8x8 m0, m0 + STORE8 m0 RET %endmacro %if HIGH_BIT_DEPTH -INIT_XMM sse2 +INIT_XMM sse PREDICT_8x8_V %else INIT_MMX mmx2 @@ -806,7 +813,7 @@ paddw m0, [pw_8] psrlw m0, 4 SPLATW m0, m0 - STORE8x8 m0, m0 + STORE8 m0 RET %else ; !HIGH_BIT_DEPTH @@ -821,7 +828,7 @@ psrlw mm0, 4 pshufw mm0, mm0, 0 packuswb mm0, mm0 - STORE8x8 mm0, mm0 + STORE8 mm0 RET %endif ; HIGH_BIT_DEPTH @@ -837,7 +844,7 
@@ paddw m0, [pw_4] psrlw m0, 3 SPLATW m0, m0 - STORE8x8 m0, m0 + STORE8 m0 RET %endmacro INIT_XMM sse2 @@ -853,7 +860,7 @@ psrlw mm0, 3 pshufw mm0, mm0, 0 packuswb mm0, mm0 - STORE8x8 mm0, mm0 + STORE8 mm0 RET %endmacro INIT_MMX @@ -1062,17 +1069,21 @@ %endif %macro LOAD_PLANE_ARGS 0 -%if ARCH_X86_64 - movd mm0, r1d - movd mm2, r2d - movd mm4, r3d - pshufw mm0, mm0, 0 - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 +%if cpuflag(avx2) && ARCH_X86_64 == 0 + vpbroadcastw m0, r1m + vpbroadcastw m2, r2m + vpbroadcastw m4, r3m +%elif mmsize == 8 ; MMX is only used on x86_32 + SPLATW m0, r1m + SPLATW m2, r2m + SPLATW m4, r3m %else - pshufw mm0, r1m, 0 - pshufw mm2, r2m, 0 - pshufw mm4, r3m, 0 + movd xm0, r1m + movd xm2, r2m + movd xm4, r3m + SPLATW m0, xm0 + SPLATW m2, xm2 + SPLATW m4, xm4 %endif %endmacro @@ -1084,7 +1095,7 @@ cglobal predict_8x%1c_p_core, 1,2 LOAD_PLANE_ARGS movq m1, m2 - pmullw m2, [pw_3210] + pmullw m2, [pw_0to15] psllw m1, 2 paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b} paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b} @@ -1111,17 +1122,12 @@ PREDICT_CHROMA_P_MMX 16 %endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH -%macro PREDICT_CHROMA_P_XMM 1 +%macro PREDICT_CHROMA_P 1 %if HIGH_BIT_DEPTH cglobal predict_8x%1c_p_core, 1,2,7 - movd m0, r1m - movd m2, r2m - movd m4, r3m + LOAD_PLANE_ARGS mova m3, [pw_pixel_max] pxor m1, m1 - SPLATW m0, m0, 0 - SPLATW m2, m2, 0 - SPLATW m4, m4, 0 pmullw m2, [pw_43210123] ; b %if %1 == 16 pmullw m5, m4, [pw_m7] ; c @@ -1129,70 +1135,88 @@ pmullw m5, m4, [pw_m3] %endif paddw m5, [pw_16] - mov r1d, %1 +%if mmsize == 32 + mova xm6, xm4 + paddw m4, m4 + paddw m5, m6 +%endif + mov r1d, %1/(mmsize/16) .loop: paddsw m6, m2, m5 paddsw m6, m0 psraw m6, 5 CLIPW m6, m1, m3 - mova [r0], m6 paddw m5, m4 +%if mmsize == 32 + vextracti128 [r0], m6, 1 + mova [r0+FDEC_STRIDEB], xm6 + add r0, 2*FDEC_STRIDEB +%else + mova [r0], m6 add r0, FDEC_STRIDEB +%endif dec r1d jg .loop RET %else ; !HIGH_BIT_DEPTH cglobal predict_8x%1c_p_core, 1,2 - movd m0, r1m - movd m2, r2m - movd m4, r3m - SPLATW m0, m0, 0 - SPLATW m2, m2, 0 - SPLATW m4, m4, 0 - pmullw m2, [pw_76543210] - paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b} - paddsw m3, m0, m4 + LOAD_PLANE_ARGS +%if mmsize == 32 + vbroadcasti128 m1, [pw_0to15] ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 + pmullw m2, m1 + mova xm1, xm4 ; zero upper half paddsw m4, m4 - mov r1d, %1/4 + paddsw m0, m1 +%else + pmullw m2, [pw_0to15] +%endif + paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b} + paddsw m1, m0, m4 + paddsw m4, m4 + mov r1d, %1/(mmsize/8) .loop: - paddsw m1, m3, m4 - paddsw m5, m0, m4 - psraw m3, 5 - psraw m0, 5 - packuswb m0, m3 - movq [r0+FDEC_STRIDE*0], m0 - movhps [r0+FDEC_STRIDE*1], m0 - paddsw m0, m5, m4 - paddsw m3, m1, m4 - psraw m5, 5 - psraw m1, 5 - packuswb m5, m1 - movq [r0+FDEC_STRIDE*2], m5 - movhps [r0+FDEC_STRIDE*3], m5 - add r0, FDEC_STRIDE*4 + psraw m2, m0, 5 + psraw m3, m1, 5 + paddsw m0, m4 + paddsw m1, m4 + packuswb m2, m3 +%if mmsize == 32 + movq [r0+FDEC_STRIDE*1], xm2 + movhps [r0+FDEC_STRIDE*3], xm2 + vextracti128 xm2, m2, 1 + movq [r0+FDEC_STRIDE*0], xm2 + movhps [r0+FDEC_STRIDE*2], xm2 +%else + movq [r0+FDEC_STRIDE*0], xm2 + movhps [r0+FDEC_STRIDE*1], xm2 +%endif + add r0, FDEC_STRIDE*mmsize/8 dec r1d jg .loop RET %endif ; HIGH_BIT_DEPTH -%endmacro ; PREDICT_CHROMA_P_XMM +%endmacro ; PREDICT_CHROMA_P INIT_XMM sse2 -PREDICT_CHROMA_P_XMM 8 -PREDICT_CHROMA_P_XMM 16 +PREDICT_CHROMA_P 8 +PREDICT_CHROMA_P 16 INIT_XMM avx -PREDICT_CHROMA_P_XMM 8 
-PREDICT_CHROMA_P_XMM 16 +PREDICT_CHROMA_P 8 +PREDICT_CHROMA_P 16 +INIT_YMM avx2 +PREDICT_CHROMA_P 8 +PREDICT_CHROMA_P 16 ;----------------------------------------------------------------------------- ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- -%if ARCH_X86_64 == 0 +%if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0 INIT_MMX mmx2 cglobal predict_16x16_p_core, 1,2 LOAD_PLANE_ARGS movq mm5, mm2 movq mm1, mm2 - pmullw mm5, [pw_3210] + pmullw mm5, [pw_0to15] psllw mm2, 3 psllw mm1, 2 movq mm3, mm2 @@ -1226,7 +1250,7 @@ dec r1d jg .loop RET -%endif ; !ARCH_X86_64 +%endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64 %macro PREDICT_16x16_P 0 cglobal predict_16x16_p_core, 1,2,8 @@ -1236,7 +1260,7 @@ SPLATW m0, m0, 0 SPLATW m1, m1, 0 SPLATW m2, m2, 0 - pmullw m3, m1, [pw_76543210] + pmullw m3, m1, [pw_0to15] psllw m1, 3 %if HIGH_BIT_DEPTH pxor m6, m6 @@ -1257,8 +1281,6 @@ mova [r0+16], m5 add r0, FDEC_STRIDEB paddw m6, m2 - dec r1d - jg .loop %else ; !HIGH_BIT_DEPTH paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b} @@ -1279,9 +1301,9 @@ paddsw m0, m7 paddsw m1, m7 add r0, FDEC_STRIDE*2 - dec r1d - jg .loop %endif ; !HIGH_BIT_DEPTH + dec r1d + jg .loop RET %endmacro ; PREDICT_16x16_P @@ -1292,6 +1314,60 @@ PREDICT_16x16_P %endif +INIT_YMM avx2 +cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH + LOAD_PLANE_ARGS +%if HIGH_BIT_DEPTH + pmullw m2, [pw_0to15] + pxor m5, m5 + pxor m6, m6 + mova m7, [pw_pixel_max] + mov r1d, 8 +.loop: + paddsw m1, m2, m5 + paddw m5, m4 + paddsw m1, m0 + paddsw m3, m2, m5 + psraw m1, 5 + paddsw m3, m0 + psraw m3, 5 + CLIPW m1, m6, m7 + mova [r0+0*FDEC_STRIDEB], m1 + CLIPW m3, m6, m7 + mova [r0+1*FDEC_STRIDEB], m3 + paddw m5, m4 + add r0, 2*FDEC_STRIDEB +%else ; !HIGH_BIT_DEPTH + vbroadcasti128 m1, [pw_0to15] + mova xm3, xm4 ; zero high bits + pmullw m1, m2 + psllw m2, 3 + paddsw m0, m3 + paddsw m0, m1 ; X+1*C X+0*C + paddsw m1, m0, m2 ; Y+1*C Y+0*C + paddsw m4, m4 + mov r1d, 4 +.loop: + psraw m2, m0, 5 + psraw m3, m1, 5 + paddsw m0, m4 + paddsw m1, m4 + packuswb m2, m3 ; X+1*C Y+1*C X+0*C Y+0*C + vextracti128 [r0+0*FDEC_STRIDE], m2, 1 + mova [r0+1*FDEC_STRIDE], xm2 + psraw m2, m0, 5 + psraw m3, m1, 5 + paddsw m0, m4 + paddsw m1, m4 + packuswb m2, m3 ; X+3*C Y+3*C X+2*C Y+2*C + vextracti128 [r0+2*FDEC_STRIDE], m2, 1 + mova [r0+3*FDEC_STRIDE], xm2 + add r0, FDEC_STRIDE*4 +%endif ; !HIGH_BIT_DEPTH + dec r1d + jg .loop + RET + %if HIGH_BIT_DEPTH == 0 %macro PREDICT_8x8 0 ;----------------------------------------------------------------------------- @@ -1625,12 +1701,12 @@ %macro PREDICT_8x8C_V 0 cglobal predict_8x8c_v, 1,1 mova m0, [r0 - FDEC_STRIDEB] - STORE8x8 m0, m0 + STORE8 m0 RET %endmacro %if HIGH_BIT_DEPTH -INIT_XMM sse2 +INIT_XMM sse PREDICT_8x8C_V %else INIT_MMX mmx @@ -1659,12 +1735,12 @@ %macro PREDICT_8x16C_V 0 cglobal predict_8x16c_v, 1,1 mova m0, [r0 - FDEC_STRIDEB] - STORE8x16 m0, m0, m0, m0 + STORE16 m0 RET %endmacro %if HIGH_BIT_DEPTH -INIT_XMM sse2 +INIT_XMM sse PREDICT_8x16C_V %else INIT_MMX mmx @@ -1674,71 +1750,42 @@ ;----------------------------------------------------------------------------- ; void predict_8x8c_h( uint8_t *src ) ;----------------------------------------------------------------------------- -%if HIGH_BIT_DEPTH - -%macro PREDICT_C_H 1 -cglobal predict_8x%1c_h, 1,1 - add r0, FDEC_STRIDEB*4 -%assign Y -4 -%rep %1 - movd m0, 
[r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2] - SPLATW m0, m0, 1 - mova [r0+FDEC_STRIDEB*Y], m0 -%if mmsize == 8 - mova [r0+FDEC_STRIDEB*Y+8], m0 +%macro PREDICT_C_H 0 +cglobal predict_8x8c_h, 1,1 +%if cpuflag(ssse3) && notcpuflag(avx2) + mova m2, [pb_3] %endif -%assign Y Y+1 -%endrep + PRED_H_4ROWS 8, 1 + PRED_H_4ROWS 8, 0 RET -%endmacro - -INIT_MMX mmx2 -PREDICT_C_H 8 -PREDICT_C_H 16 -INIT_XMM sse2 -PREDICT_C_H 8 -PREDICT_C_H 16 - -%else ; !HIGH_BIT_DEPTH - -%macro PREDICT_C_H_CORE 1 -%assign Y %1 -%rep 4 - SPLATB_LOAD m0, r0+FDEC_STRIDE*Y-1, m1 - mova [r0+FDEC_STRIDE*Y], m0 -%assign Y Y+1 -%endrep -%endmacro -%macro PREDICT_C_H 1 -cglobal predict_8x%1c_h, 1,1 -%if cpuflag(ssse3) - mova m1, [pb_3] +cglobal predict_8x16c_h, 1,2 +%if cpuflag(ssse3) && notcpuflag(avx2) + mova m2, [pb_3] %endif -%if %1==16 - add r0, FDEC_STRIDE*4 - PREDICT_C_H_CORE -4 - add r0, FDEC_STRIDE*4 - PREDICT_C_H_CORE -4 -%endif - add r0, FDEC_STRIDE*4 - PREDICT_C_H_CORE -4 - PREDICT_C_H_CORE 0 + mov r1d, 4 +.loop: + PRED_H_4ROWS 8, 1 + dec r1d + jg .loop RET %endmacro INIT_MMX mmx2 -PREDICT_C_H 8 -PREDICT_C_H 16 +PREDICT_C_H +%if HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_C_H +INIT_XMM avx2 +PREDICT_C_H +%else INIT_MMX ssse3 -PREDICT_C_H 8 -PREDICT_C_H 16 - +PREDICT_C_H %endif + ;----------------------------------------------------------------------------- ; void predict_8x8c_dc( pixel *src ) ;----------------------------------------------------------------------------- - %macro LOAD_LEFT 1 movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL] movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL] @@ -1958,7 +2005,7 @@ paddw m0, m1 psrlw m0, 1 pavgw m0, m2 - STORE8x%1 m0, m0, m0, m0 + STORE%1 m0 RET %else ; !HIGH_BIT_DEPTH INIT_MMX @@ -1977,7 +2024,7 @@ pshufw mm1, mm1, 0 pshufw mm0, mm0, 0 ; dc0 (w) packuswb mm0, mm1 ; dc0,dc1 (b) - STORE8x%1 mm0, mm0, mm0, mm0 + STORE%1 mm0 RET %endif %endmacro @@ -1988,33 +2035,31 @@ ;----------------------------------------------------------------------------- ; void predict_16x16_v( pixel *src ) ;----------------------------------------------------------------------------- -%if HIGH_BIT_DEPTH -INIT_MMX -cglobal predict_16x16_v_mmx2, 1,2 - mova m0, [r0 - FDEC_STRIDEB+ 0] - mova m1, [r0 - FDEC_STRIDEB+ 8] - mova m2, [r0 - FDEC_STRIDEB+16] - mova m3, [r0 - FDEC_STRIDEB+24] - STORE16x16 m0, m1, m2, m3 - RET -INIT_XMM -cglobal predict_16x16_v_sse2, 2,2 - mova m0, [r0 - FDEC_STRIDEB+ 0] - mova m1, [r0 - FDEC_STRIDEB+16] - STORE16x16_SSE2 m0, m1 - RET -%else ; !HIGH_BIT_DEPTH -INIT_MMX -cglobal predict_16x16_v_mmx2, 1,2 - movq m0, [r0 - FDEC_STRIDE + 0] - movq m1, [r0 - FDEC_STRIDE + 8] - STORE16x16 m0, m1 - RET -INIT_XMM -cglobal predict_16x16_v_sse2, 1,1 - movdqa xmm0, [r0 - FDEC_STRIDE] - STORE16x16_SSE2 xmm0 + +%macro PREDICT_16x16_V 0 +cglobal predict_16x16_v, 1,2 +%assign %%i 0 +%rep 16*SIZEOF_PIXEL/mmsize + mova m %+ %%i, [r0-FDEC_STRIDEB+%%i*mmsize] +%assign %%i %%i+1 +%endrep +%if 16*SIZEOF_PIXEL/mmsize == 4 + STORE16 m0, m1, m2, m3 +%elif 16*SIZEOF_PIXEL/mmsize == 2 + STORE16 m0, m1 +%else + STORE16 m0 +%endif RET +%endmacro + +INIT_MMX mmx2 +PREDICT_16x16_V +INIT_XMM sse +PREDICT_16x16_V +%if HIGH_BIT_DEPTH +INIT_YMM avx +PREDICT_16x16_V %endif ;----------------------------------------------------------------------------- @@ -2022,46 +2067,23 @@ ;----------------------------------------------------------------------------- %macro PREDICT_16x16_H 0 cglobal predict_16x16_h, 1,2 - mov r1, 12*FDEC_STRIDEB -%if HIGH_BIT_DEPTH -.vloop: -%assign Y 0 -%rep 4 - movd m0, 
[r0+r1+Y*FDEC_STRIDEB-2*SIZEOF_PIXEL] - SPLATW m0, m0, 1 - mova [r0+r1+Y*FDEC_STRIDEB+ 0], m0 - mova [r0+r1+Y*FDEC_STRIDEB+16], m0 -%if mmsize==8 - mova [r0+r1+Y*FDEC_STRIDEB+ 8], m0 - mova [r0+r1+Y*FDEC_STRIDEB+24], m0 -%endif -%assign Y Y+1 -%endrep - -%else ; !HIGH_BIT_DEPTH -%if cpuflag(ssse3) - mova m1, [pb_3] -%endif -.vloop: -%assign Y 0 -%rep 4 - SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*Y-1, m1 - mova [r0+r1+FDEC_STRIDE*Y], m0 -%if mmsize==8 - mova [r0+r1+FDEC_STRIDE*Y+8], m0 +%if cpuflag(ssse3) && notcpuflag(avx2) + mova m2, [pb_3] %endif -%assign Y Y+1 -%endrep -%endif ; HIGH_BIT_DEPTH - sub r1, 4*FDEC_STRIDEB - jge .vloop + mov r1d, 4 +.loop: + PRED_H_4ROWS 16, 1 + dec r1d + jg .loop RET %endmacro INIT_MMX mmx2 PREDICT_16x16_H -INIT_XMM sse2 %if HIGH_BIT_DEPTH +INIT_XMM sse2 +PREDICT_16x16_H +INIT_YMM avx2 PREDICT_16x16_H %else ;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3 @@ -2072,8 +2094,7 @@ ;----------------------------------------------------------------------------- ; void predict_16x16_dc_core( pixel *src, int i_dc_left ) ;----------------------------------------------------------------------------- - -%macro PRED16x16_DC 2 +%macro PRED16x16_DC_MMX 2 %if HIGH_BIT_DEPTH mova m0, [r0 - FDEC_STRIDEB+ 0] paddw m0, [r0 - FDEC_STRIDEB+ 8] @@ -2083,7 +2104,7 @@ paddw m0, %1 psrlw m0, %2 SPLATW m0, m0 - STORE16x16 m0, m0, m0, m0 + STORE16 m0, m0, m0, m0 %else ; !HIGH_BIT_DEPTH pxor m0, m0 pxor m1, m1 @@ -2094,7 +2115,7 @@ psrlw m0, %2 ; dc pshufw m0, m0, 0 packuswb m0, m0 ; dc in bytes - STORE16x16 m0, m0 + STORE16 m0, m0 %endif %endmacro @@ -2102,15 +2123,15 @@ cglobal predict_16x16_dc_core, 1,2 %if ARCH_X86_64 movd m6, r1d - PRED16x16_DC m6, 5 + PRED16x16_DC_MMX m6, 5 %else - PRED16x16_DC r1m, 5 + PRED16x16_DC_MMX r1m, 5 %endif RET INIT_MMX mmx2 cglobal predict_16x16_dc_top, 1,2 - PRED16x16_DC [pw_8], 4 + PRED16x16_DC_MMX [pw_8], 4 RET INIT_MMX mmx2 @@ -2118,30 +2139,30 @@ cglobal predict_16x16_dc_left_core, 1,2 movd m0, r1m SPLATW m0, m0 - STORE16x16 m0, m0, m0, m0 + STORE16 m0, m0, m0, m0 RET %else ; !HIGH_BIT_DEPTH cglobal predict_16x16_dc_left_core, 1,1 movd m0, r1m pshufw m0, m0, 0 packuswb m0, m0 - STORE16x16 m0, m0 + STORE16 m0, m0 RET %endif -;----------------------------------------------------------------------------- -; void predict_16x16_dc_core( pixel *src, int i_dc_left ) -;----------------------------------------------------------------------------- - -%macro PRED16x16_DC_SSE2 2 +%macro PRED16x16_DC 2 %if HIGH_BIT_DEPTH - mova m0, [r0 - FDEC_STRIDEB+ 0] - paddw m0, [r0 - FDEC_STRIDEB+16] - HADDW m0, m2 - paddw m0, %1 - psrlw m0, %2 - SPLATW m0, m0 - STORE16x16_SSE2 m0, m0 + mova xm0, [r0 - FDEC_STRIDEB+ 0] + paddw xm0, [r0 - FDEC_STRIDEB+16] + HADDW xm0, xm2 + paddw xm0, %1 + psrlw xm0, %2 + SPLATW m0, xm0 +%if mmsize == 32 + STORE16 m0 +%else + STORE16 m0, m0 +%endif %else ; !HIGH_BIT_DEPTH pxor m0, m0 psadbw m0, [r0 - FDEC_STRIDE] @@ -2151,32 +2172,40 @@ psrlw m0, %2 ; dc SPLATW m0, m0 packuswb m0, m0 ; dc in bytes - STORE16x16_SSE2 m0 + STORE16 m0 %endif %endmacro -INIT_XMM sse2 +%macro PREDICT_16x16_DC_CORE 0 cglobal predict_16x16_dc_core, 2,2,4 - movd m3, r1m - PRED16x16_DC_SSE2 m3, 5 + movd xm3, r1m + PRED16x16_DC xm3, 5 RET cglobal predict_16x16_dc_top, 1,2 - PRED16x16_DC_SSE2 [pw_8], 4 + PRED16x16_DC [pw_8], 4 RET -INIT_XMM sse2 -%if HIGH_BIT_DEPTH cglobal predict_16x16_dc_left_core, 1,2 - movd m0, r1m - SPLATW m0, m0 - STORE16x16_SSE2 m0, m0 - RET -%else ; !HIGH_BIT_DEPTH -cglobal predict_16x16_dc_left_core, 1,1 - movd m0, r1m 
- SPLATW m0, m0 + movd xm0, r1m + SPLATW m0, xm0 +%if HIGH_BIT_DEPTH && mmsize == 16 + STORE16 m0, m0 +%else +%if HIGH_BIT_DEPTH == 0 packuswb m0, m0 - STORE16x16_SSE2 m0 +%endif + STORE16 m0 +%endif RET +%endmacro + +INIT_XMM sse2 +PREDICT_16x16_DC_CORE +%if HIGH_BIT_DEPTH +INIT_YMM avx2 +PREDICT_16x16_DC_CORE +%else +INIT_XMM avx2 +PREDICT_16x16_DC_CORE %endif
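Much of the predict-a.asm hunk above folds the old per-width horizontal-prediction loops into the PRED_H_LOAD / PRED_H_STORE / PRED_H_4ROWS helpers: each row's left neighbour is loaded, splatted across a register (vpbroadcastpix on AVX2, SPLATW or SPLATB_LOAD otherwise) and stored over the whole row. As an illustration only (not code from the diff), the scalar behaviour being vectorized is roughly:

#include <stdint.h>
#include <string.h>

/* Sketch of 16x16 horizontal prediction: the pixel to the left of each row
 * fills that row. x264 uses the compile-time constant FDEC_STRIDE; it is a
 * parameter here only so the sketch stands alone. */
static void predict_16x16_h_sketch( uint8_t *src, int stride )
{
    for( int y = 0; y < 16; y++ )
        memset( &src[y*stride], src[y*stride - 1], 16 );
}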
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/predict-c.c -> x264-snapshot-20130723-2245.tar.bz2/common/x86/predict-c.c
Changed
@@ -43,6 +43,7 @@ PREDICT_16x16_DC( mmx2 ) PREDICT_16x16_DC( sse2 ) +PREDICT_16x16_DC( avx2 ) #define PREDICT_16x16_DC_LEFT(name)\ static void x264_predict_16x16_dc_left_##name( pixel *src )\ @@ -58,10 +59,11 @@ PREDICT_16x16_DC_LEFT( mmx2 ) PREDICT_16x16_DC_LEFT( sse2 ) +PREDICT_16x16_DC_LEFT( avx2 ) #define PREDICT_P_SUM(j,i)\ H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\ - V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\ + V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] ); ALIGNED_16( static const int16_t pw_12345678[8] ) = {1,2,3,4,5,6,7,8}; ALIGNED_16( static const int16_t pw_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1}; @@ -70,178 +72,181 @@ ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1}; ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4}; -#if !HIGH_BIT_DEPTH -#define PREDICT_16x16_P(name)\ -static void x264_predict_16x16_p_##name( pixel *src )\ -{\ - int a, b, c;\ +#define PREDICT_16x16_P_CORE\ int H = 0;\ int V = 0;\ - int i00;\ - PREDICT_P_SUM(7,1) \ - PREDICT_P_SUM(7,2) \ - PREDICT_P_SUM(7,3) \ - PREDICT_P_SUM(7,4) \ - PREDICT_P_SUM(7,5) \ - PREDICT_P_SUM(7,6) \ - PREDICT_P_SUM(7,7) \ - PREDICT_P_SUM(7,8) \ - a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\ - b = ( 5 * H + 32 ) >> 6;\ - c = ( 5 * V + 32 ) >> 6;\ - i00 = a - b * 7 - c * 7 + 16;\ - x264_predict_16x16_p_core_##name( src, i00, b, c );\ -} -#ifndef ARCH_X86_64 -PREDICT_16x16_P( mmx2 ) -#endif -PREDICT_16x16_P( sse2 ) -PREDICT_16x16_P( avx ) -#endif //!HIGH_BIT_DEPTH + PREDICT_P_SUM(7,1)\ + PREDICT_P_SUM(7,2)\ + PREDICT_P_SUM(7,3)\ + PREDICT_P_SUM(7,4)\ + PREDICT_P_SUM(7,5)\ + PREDICT_P_SUM(7,6)\ + PREDICT_P_SUM(7,7)\ + PREDICT_P_SUM(7,8) -#define PREDICT_8x16C_P_CORE \ - int H = 0, V = 0;\ - for( int i = 0; i < 4; i++ )\ - H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\ - for( int i = 0; i < 8; i++ )\ - V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );\ - int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\ - int b = ( 17 * H + 16 ) >> 5;\ - int c = ( 5 * V + 32 ) >> 6; - -#if HIGH_BIT_DEPTH -#define PREDICT_8x16_P(name)\ -static void x264_predict_8x16c_p_##name( uint16_t *src )\ -{\ - PREDICT_8x16C_P_CORE \ - x264_predict_8x16c_p_core_##name( src, a, b, c );\ -} +#define PREDICT_16x16_P_END(name)\ + int a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\ + int b = ( 5 * H + 32 ) >> 6;\ + int c = ( 5 * V + 32 ) >> 6;\ + int i00 = a - b * 7 - c * 7 + 16;\ + /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case + * than to try to consider it in the asm. 
*/\ + if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )\ + x264_predict_16x16_p_c( src );\ + else\ + x264_predict_16x16_p_core_##name( src, i00, b, c ); -PREDICT_8x16_P(sse2) -PREDICT_8x16_P(avx) -#else -#define PREDICT_8x16_P(name)\ -static void x264_predict_8x16c_p_##name( uint8_t *src )\ +#define PREDICT_16x16_P(name, name2)\ +static void x264_predict_16x16_p_##name( pixel *src )\ {\ - PREDICT_8x16C_P_CORE \ - int i00 = a -3*b -7*c + 16;\ - x264_predict_8x16c_p_core_##name( src, i00, b, c );\ + PREDICT_16x16_P_CORE\ + PREDICT_16x16_P_END(name2)\ } -#ifndef ARCH_X86_64 -PREDICT_8x16_P(mmx2) -#endif -PREDICT_8x16_P(sse2) -PREDICT_8x16_P(avx) -#endif #if HAVE_X86_INLINE_ASM #if HIGH_BIT_DEPTH -static void x264_predict_16x16_p_sse2( uint16_t *src ) -#else -static void x264_predict_16x16_p_ssse3( uint8_t *src ) -#endif -{ - int a, b, c, i00; - int H, V; -#if HIGH_BIT_DEPTH - asm ( - "movdqu %1, %%xmm1 \n" - "movdqa %2, %%xmm0 \n" - "pmaddwd %3, %%xmm0 \n" - "pmaddwd %4, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "movhlps %%xmm0, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "pshuflw $14, %%xmm0, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "movd %%xmm0, %0 \n" - :"=r"(H) - :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]), - "m"(*pw_12345678), "m"(*pw_m87654321) +#define PREDICT_16x16_P_ASM\ + asm (\ + "movdqu %1, %%xmm1 \n"\ + "movdqa %2, %%xmm0 \n"\ + "pmaddwd %3, %%xmm0 \n"\ + "pmaddwd %4, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "movhlps %%xmm0, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "pshuflw $14, %%xmm0, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "movd %%xmm0, %0 \n"\ + :"=r"(H)\ + :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]),\ + "m"(*pw_12345678), "m"(*pw_m87654321)\ ); -#else - asm ( - "movq %1, %%mm1 \n" - "movq %2, %%mm0 \n" - "palignr $7, %3, %%mm1 \n" - "pmaddubsw %4, %%mm0 \n" - "pmaddubsw %5, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "pshufw $14, %%mm0, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "pshufw $1, %%mm0, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "movd %%mm0, %0 \n" - "movswl %w0, %0 \n" - :"=r"(H) - :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]), - "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321) +#else // !HIGH_BIT_DEPTH +#define PREDICT_16x16_P_ASM\ + asm (\ + "movq %1, %%mm1 \n"\ + "movq %2, %%mm0 \n"\ + "palignr $7, %3, %%mm1 \n"\ + "pmaddubsw %4, %%mm0 \n"\ + "pmaddubsw %5, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "pshufw $14, %%mm0, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "pshufw $1, %%mm0, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "movd %%mm0, %0 \n"\ + "movswl %w0, %0 \n"\ + :"=r"(H)\ + :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]),\ + "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321)\ ); -#endif - V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] ) - + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] ) - + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] ) - + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] ) - + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] ) - + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] ) - + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] ) +#endif // HIGH_BIT_DEPTH + +#define PREDICT_16x16_P_CORE_INLINE\ + int H, V;\ + PREDICT_16x16_P_ASM\ + V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )\ + + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )\ + + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )\ + + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )\ + + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )\ + + 3 * ( 
src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )\ + + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )\ + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] ); - a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] ); - b = ( 5 * H + 32 ) >> 6; - c = ( 5 * V + 32 ) >> 6; - i00 = a - b * 7 - c * 7 + 16; - /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case - * than to try to consider it in the asm. */ - if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) ) - x264_predict_16x16_p_c( src ); - else - x264_predict_16x16_p_core_sse2( src, i00, b, c ); + +#define PREDICT_16x16_P_INLINE(name, name2)\ +static void x264_predict_16x16_p_##name( pixel *src )\ +{\ + PREDICT_16x16_P_CORE_INLINE\ + PREDICT_16x16_P_END(name2)\ } -#endif +#else // !HAVE_X86_INLINE_ASM +#define PREDICT_16x16_P_INLINE(name, name2) PREDICT_16x16_P(name, name2) +#endif // HAVE_X86_INLINE_ASM + +#if HIGH_BIT_DEPTH +PREDICT_16x16_P_INLINE( sse2, sse2 ) +#else // !HIGH_BIT_DEPTH +#if !ARCH_X86_64 +PREDICT_16x16_P( mmx2, mmx2 ) +#endif // !ARCH_X86_64 +PREDICT_16x16_P( sse2, sse2 ) +#if HAVE_X86_INLINE_ASM +PREDICT_16x16_P_INLINE( ssse3, sse2 ) +#endif // HAVE_X86_INLINE_ASM +PREDICT_16x16_P_INLINE( avx, avx ) +#endif // HIGH_BIT_DEPTH +PREDICT_16x16_P_INLINE( avx2, avx2 ) + +#define PREDICT_8x16C_P_CORE\ + int H = 0, V = 0;\ + for( int i = 0; i < 4; i++ )\ + H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\ + for( int i = 0; i < 8; i++ )\ + V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] ); -#if !HIGH_BIT_DEPTH +#if HIGH_BIT_DEPTH +#define PREDICT_8x16C_P_END(name)\ + int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\ + int b = ( 17 * H + 16 ) >> 5;\ + int c = ( 5 * V + 32 ) >> 6;\ + x264_predict_8x16c_p_core_##name( src, a, b, c ); +#else // !HIGH_BIT_DEPTH +#define PREDICT_8x16C_P_END(name)\ + int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\ + int b = ( 17 * H + 16 ) >> 5;\ + int c = ( 5 * V + 32 ) >> 6;\ + int i00 = a -3*b -7*c + 16;\ + x264_predict_8x16c_p_core_##name( src, i00, b, c ); +#endif // HIGH_BIT_DEPTH -#define PREDICT_8x8_P(name)\ -static void x264_predict_8x8c_p_##name( uint8_t *src )\ +#define PREDICT_8x16C_P(name)\ +static void x264_predict_8x16c_p_##name( pixel *src )\ {\ - int a, b, c;\ + PREDICT_8x16C_P_CORE\ + PREDICT_8x16C_P_END(name)\ +} + +#if !ARCH_X86_64 && !HIGH_BIT_DEPTH +PREDICT_8x16C_P( mmx2 ) +#endif // !ARCH_X86_64 && !HIGH_BIT_DEPTH +PREDICT_8x16C_P( sse2 ) +PREDICT_8x16C_P( avx ) +PREDICT_8x16C_P( avx2 ) + +#define PREDICT_8x8C_P_CORE\ int H = 0;\ int V = 0;\ - int i00;\ PREDICT_P_SUM(3,1)\ PREDICT_P_SUM(3,2)\ PREDICT_P_SUM(3,3)\ - PREDICT_P_SUM(3,4)\ - a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\ - b = ( 17 * H + 16 ) >> 5;\ - c = ( 17 * V + 16 ) >> 5;\ - i00 = a -3*b -3*c + 16;\ - x264_predict_8x8c_p_core_##name( src, i00, b, c );\ -} -#ifndef ARCH_X86_64 -PREDICT_8x8_P( mmx2 ) -#endif -PREDICT_8x8_P( sse2 ) + PREDICT_P_SUM(3,4) -#endif //!HIGH_BIT_DEPTH - -#if HAVE_X86_INLINE_ASM - -#define PREDICT_8x8C_P_CORE\ - V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )\ - + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )\ - + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )\ - + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );\ - H += -4 * src[-1*FDEC_STRIDE -1];\ +#if HIGH_BIT_DEPTH +#define PREDICT_8x8C_P_END(name)\ int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\ int b = ( 17 * H + 16 ) >> 5;\ - 
int c = ( 17 * V + 16 ) >> 5; + int c = ( 17 * V + 16 ) >> 5;\ + x264_predict_8x8c_p_core_##name( src, a, b, c ); +#else // !HIGH_BIT_DEPTH +#define PREDICT_8x8C_P_END(name)\ + int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\ + int b = ( 17 * H + 16 ) >> 5;\ + int c = ( 17 * V + 16 ) >> 5;\ + int i00 = a -3*b -3*c + 16;\ + x264_predict_8x8c_p_core_##name( src, i00, b, c ); +#endif // HIGH_BIT_DEPTH -#if HIGH_BIT_DEPTH -#define PREDICT_8x8_P2(cpu1, cpu2)\ -static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\ +#define PREDICT_8x8C_P(name, name2)\ +static void x264_predict_8x8c_p_##name( pixel *src )\ {\ - int H, V;\ + PREDICT_8x8C_P_CORE\ + PREDICT_8x8C_P_END(name2)\ +} + +#if HAVE_X86_INLINE_ASM +#if HIGH_BIT_DEPTH +#define PREDICT_8x8C_P_ASM\ asm (\ "movdqa %1, %%xmm0 \n"\ "pmaddwd %2, %%xmm0 \n"\ @@ -252,19 +257,9 @@ "movd %%xmm0, %0 \n"\ :"=r"(H)\ :"m"(src[-FDEC_STRIDE]), "m"(*pw_m32101234)\ - );\ - PREDICT_8x8C_P_CORE\ - x264_predict_8x8c_p_core_ ## cpu2( src, a, b, c );\ -} - -PREDICT_8x8_P2(sse2, sse2) -PREDICT_8x8_P2( avx, avx) - -#else //!HIGH_BIT_DEPTH -#define PREDICT_8x8_P2(cpu1, cpu2)\ -static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\ -{\ - int H, V;\ + ); +#else // !HIGH_BIT_DEPTH +#define PREDICT_8x8C_P_ASM\ asm (\ "movq %1, %%mm0 \n"\ "pmaddubsw %2, %%mm0 \n"\ @@ -276,16 +271,41 @@ "movswl %w0, %0 \n"\ :"=r"(H)\ :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)\ - );\ - PREDICT_8x8C_P_CORE\ - int i00 = a -3*b -3*c + 16;\ - x264_predict_8x8c_p_core_ ## cpu2( src, i00, b, c );\ + ); +#endif // HIGH_BIT_DEPTH + +#define PREDICT_8x8C_P_CORE_INLINE\ + int H, V;\ + PREDICT_8x8C_P_ASM\ + V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )\ + + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )\ + + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )\ + + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );\ + H += -4 * src[-1*FDEC_STRIDE -1]; + +#define PREDICT_8x8C_P_INLINE(name, name2)\ +static void x264_predict_8x8c_p_##name( pixel *src )\ +{\ + PREDICT_8x8C_P_CORE_INLINE\ + PREDICT_8x8C_P_END(name2)\ } +#else // !HAVE_X86_INLINE_ASM +#define PREDICT_8x8C_P_INLINE(name, name2) PREDICT_8x8C_P(name, name2) +#endif // HAVE_X86_INLINE_ASM -PREDICT_8x8_P2(ssse3, sse2) -PREDICT_8x8_P2( avx, avx) -#endif -#endif +#if HIGH_BIT_DEPTH +PREDICT_8x8C_P_INLINE( sse2, sse2 ) +#else //!HIGH_BIT_DEPTH +#if !ARCH_X86_64 +PREDICT_8x8C_P( mmx2, mmx2 ) +#endif // !ARCH_X86_64 +PREDICT_8x8C_P( sse2, sse2 ) +#if HAVE_X86_INLINE_ASM +PREDICT_8x8C_P_INLINE( ssse3, sse2 ) +#endif // HAVE_X86_INLINE_ASM +#endif // HIGH_BIT_DEPTH +PREDICT_8x8C_P_INLINE( avx, avx ) +PREDICT_8x8C_P_INLINE( avx2, avx2 ) #if ARCH_X86_64 && !HIGH_BIT_DEPTH static void x264_predict_8x8c_dc_left( uint8_t *src ) @@ -312,7 +332,6 @@ M64( src ) = dc1; src += FDEC_STRIDE; } - } #endif // ARCH_X86_64 && !HIGH_BIT_DEPTH @@ -329,24 +348,32 @@ pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx2; pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmx2; #if HIGH_BIT_DEPTH + if( !(cpu&X264_CPU_SSE) ) + return; + pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2; pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2; pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2; - pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2; pf[I_PRED_16x16_H] = x264_predict_16x16_h_sse2; -#if HAVE_X86_INLINE_ASM pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2; -#endif + if( !(cpu&X264_CPU_AVX) ) + return; + pf[I_PRED_16x16_V] = x264_predict_16x16_v_avx; 
+ if( !(cpu&X264_CPU_AVX2) ) + return; + pf[I_PRED_16x16_H] = x264_predict_16x16_h_avx2; #else #if !ARCH_X86_64 pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmx2; #endif + if( !(cpu&X264_CPU_SSE) ) + return; + pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2; - pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse2; if( cpu&X264_CPU_SSE2_IS_SLOW ) return; pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2; @@ -354,7 +381,8 @@ pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; - pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3; + if( !(cpu&X264_CPU_SLOW_PSHUFB) ) + pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3; #if HAVE_X86_INLINE_ASM pf[I_PRED_16x16_P] = x264_predict_16x16_p_ssse3; #endif @@ -362,6 +390,14 @@ return; pf[I_PRED_16x16_P] = x264_predict_16x16_p_avx; #endif // HIGH_BIT_DEPTH + + if( cpu&X264_CPU_AVX2 ) + { + pf[I_PRED_16x16_P] = x264_predict_16x16_p_avx2; + pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_avx2; + pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_avx2; + pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_avx2; + } } void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] ) @@ -374,18 +410,21 @@ return; pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmx2; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmx2; + if( !(cpu&X264_CPU_SSE) ) + return; + pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; - pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_sse2; pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_sse2; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_sse2; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_sse2; -#if HAVE_X86_INLINE_ASM pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_sse2; if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx; -#endif + if( !(cpu&X264_CPU_AVX2) ) + return; + pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_avx2; #else #if ARCH_X86_64 pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left; @@ -407,11 +446,16 @@ pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_ssse3; #if HAVE_X86_INLINE_ASM pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_ssse3; +#endif if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx; -#endif #endif // HIGH_BIT_DEPTH + + if( cpu&X264_CPU_AVX2 ) + { + pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx2; + } } void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] ) @@ -423,9 +467,11 @@ return; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2; + if( !(cpu&X264_CPU_SSE) ) + return; + pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; - pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse2; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_sse2; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_sse2; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_sse2; @@ -433,6 +479,9 @@ if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx; + if( !(cpu&X264_CPU_AVX2) ) + return; + pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_avx2; #else pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_mmx; if( !(cpu&X264_CPU_MMX2) ) @@ -440,7 +489,7 @@ pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_mmx2; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2; -#ifndef ARCH_X86_64 +#if !ARCH_X86_64 pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_mmx2; #endif if( !(cpu&X264_CPU_SSE2) ) @@ -453,6 +502,11 @@ return; pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx; 
#endif // HIGH_BIT_DEPTH + + if( cpu&X264_CPU_AVX2 ) + { + pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx2; + } } void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ) @@ -460,9 +514,11 @@ if( !(cpu&X264_CPU_MMX2) ) return; #if HIGH_BIT_DEPTH + if( !(cpu&X264_CPU_SSE) ) + return; + pf[I_PRED_8x8_V] = x264_predict_8x8_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; - pf[I_PRED_8x8_V] = x264_predict_8x8_v_sse2; pf[I_PRED_8x8_H] = x264_predict_8x8_h_sse2; pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_sse2; pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_sse2; @@ -520,8 +576,11 @@ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; - pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3; - pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3; + if( !(cpu&X264_CPU_SLOW_PALIGNR) ) + { + pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3; + pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3; + } pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3; *predict_8x8_filter = x264_predict_8x8_filter_ssse3; if( !(cpu&X264_CPU_AVX) ) @@ -564,6 +623,9 @@ pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_avx; pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_avx; pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_avx; + if( !(cpu&X264_CPU_AVX2) ) + return; + pf[I_PRED_4x4_H] = x264_predict_4x4_h_avx2; #else pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmx2; if( !(cpu&X264_CPU_SSSE3) )
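The reorganized predict-c.c wrappers above keep the planar-mode arithmetic unchanged: PREDICT_16x16_P_END turns the H/V gradients into the b, c and i00 parameters passed to the asm core, and at bit depths above 8 it branches to the plain C predictor when those values could overflow. A standalone rendering of just that step (illustrative; the helper name and struct are invented, the formulas are taken from the macro):

#include <stdlib.h>

typedef struct { int i00, b, c, use_c_fallback; } plane16x16_params;

/* Same arithmetic as PREDICT_16x16_P_END: 'corner' stands for
 * src[15*FDEC_STRIDE-1] + src[15-FDEC_STRIDE]; H and V are the gradients
 * accumulated by PREDICT_16x16_P_CORE. */
static plane16x16_params plane16x16_from_gradients( int H, int V, int corner, int bit_depth )
{
    plane16x16_params p;
    int a = 16 * corner;
    p.b   = ( 5 * H + 32 ) >> 6;
    p.c   = ( 5 * V + 32 ) >> 6;
    p.i00 = a - p.b * 7 - p.c * 7 + 16;
    /* b*15 + c*15 can overflow above 8-bit depth, hence the fallback test
     * copied from the macro above. */
    p.use_c_fallback = bit_depth > 8 &&
                       ( p.i00 > 0x7fff || abs( p.b ) > 1092 || abs( p.c ) > 1092 );
    return p;
}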
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/predict.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/predict.h
Changed
@@ -34,48 +34,57 @@ void x264_predict_8x8_init_mmx ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ); void x264_predict_16x16_v_mmx2( pixel *src ); -void x264_predict_16x16_v_sse2( pixel *src ); +void x264_predict_16x16_v_sse ( pixel *src ); +void x264_predict_16x16_v_avx ( uint16_t *src ); void x264_predict_16x16_h_mmx2( pixel *src ); void x264_predict_16x16_h_sse2( uint16_t *src ); void x264_predict_16x16_h_ssse3( uint8_t *src ); +void x264_predict_16x16_h_avx2( uint16_t *src ); void x264_predict_16x16_dc_mmx2( pixel *src ); void x264_predict_16x16_dc_sse2( pixel *src ); void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left ); +void x264_predict_16x16_dc_core_avx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left ); +void x264_predict_16x16_dc_left_core_avx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_top_mmx2( pixel *src ); void x264_predict_16x16_dc_top_sse2( pixel *src ); -void x264_predict_16x16_dc_top_ssse3( uint16_t *src ); +void x264_predict_16x16_dc_top_avx2( pixel *src ); void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c ); void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_16x16_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x16c_dc_mmx2( pixel *src ); void x264_predict_8x16c_dc_sse2( uint16_t *src ); void x264_predict_8x16c_dc_top_mmx2( uint8_t *src ); void x264_predict_8x16c_dc_top_sse2( uint16_t *src ); void x264_predict_8x16c_v_mmx( uint8_t *src ); -void x264_predict_8x16c_v_sse2( uint16_t *src ); +void x264_predict_8x16c_v_sse( uint16_t *src ); void x264_predict_8x16c_h_mmx2( pixel *src ); -void x264_predict_8x16c_h_sse2( pixel *src ); +void x264_predict_8x16c_h_sse2( uint16_t *src ); void x264_predict_8x16c_h_ssse3( uint8_t *src ); +void x264_predict_8x16c_h_avx2( uint16_t *src ); void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c ); -void x264_predict_8x16c_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_8x16c_p_core_avx ( pixel *src, int i00, int b, int c ); +void x264_predict_8x16c_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c ); -void x264_predict_8x8c_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_8x8c_p_core_avx ( pixel *src, int i00, int b, int c ); +void x264_predict_8x8c_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x8c_dc_mmx2( pixel *src ); void x264_predict_8x8c_dc_sse2( uint16_t *src ); void x264_predict_8x8c_dc_top_mmx2( uint8_t *src ); void x264_predict_8x8c_dc_top_sse2( uint16_t *src ); void x264_predict_8x8c_v_mmx( pixel *src ); -void x264_predict_8x8c_v_sse2( uint16_t *src ); +void x264_predict_8x8c_v_sse( uint16_t *src ); void x264_predict_8x8c_h_mmx2( pixel *src ); -void x264_predict_8x8c_h_sse2( pixel *src ); +void x264_predict_8x8c_h_sse2( uint16_t *src ); void x264_predict_8x8c_h_ssse3( uint8_t *src ); +void x264_predict_8x8c_h_avx2( uint16_t *src ); void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_v_sse2( 
uint16_t *src, uint16_t edge[36] ); +void x264_predict_8x8_v_sse ( uint16_t *src, uint16_t edge[36] ); void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] ); void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] ); @@ -114,6 +123,7 @@ void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters ); void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[36], int i_neighbor, int i_filters ); void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters ); +void x264_predict_4x4_h_avx2( uint16_t *src ); void x264_predict_4x4_ddl_mmx2( pixel *src ); void x264_predict_4x4_ddl_sse2( uint16_t *src ); void x264_predict_4x4_ddl_avx( uint16_t *src );
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/quant-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/quant-a.asm
Changed
@@ -7,7 +7,7 @@ ;* Jason Garrett-Glaser <darkshikari@gmail.com> ;* Christian Heine <sennindemokrit@gmx.net> ;* Oskar Arvidsson <oskar@irock.se> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -30,7 +30,7 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 %macro DQM4 3 dw %1, %2, %1, %2, %2, %3, %2, %3 @@ -39,8 +39,7 @@ dw %1, %4, %5, %4, %1, %4, %5, %4 dw %4, %2, %6, %2, %4, %2, %6, %2 dw %5, %6, %3, %6, %5, %6, %3, %6 - ; last line not used, just padding for power-of-2 stride - times 8 dw 0 + dw %4, %2, %6, %2, %4, %2, %6, %2 %endmacro dequant4_scale: @@ -75,27 +74,55 @@ chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1 chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1 +%if HIGH_BIT_DEPTH==0 +dct_coef_shuffle: +%macro DCT_COEF_SHUFFLE 8 + %assign y x + %rep 8 + %rep 7 + %rotate (~(y>>7))&1 + %assign y y<<((~(y>>7))&1) + %endrep + db %1*2 + %rotate 1 + %assign y y<<1 + %endrep +%endmacro +%assign x 0 +%rep 256 + DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0 +%assign x x+1 +%endrep +%endif + SECTION .text cextern pb_1 cextern pw_1 +cextern pw_2 +cextern pw_256 cextern pd_1 cextern pb_01 cextern pd_1024 - -%macro QUANT_DC_START 0 - movd m6, r1m ; mf - movd m7, r2m ; bias -%if HIGH_BIT_DEPTH - SPLATD m6, m6 - SPLATD m7, m7 +cextern deinterleave_shufd +cextern popcnt_table + +%macro QUANT_DC_START 2 + movd xm%1, r1m ; mf + movd xm%2, r2m ; bias +%if cpuflag(avx2) + vpbroadcastdct m%1, xm%1 + vpbroadcastdct m%2, xm%2 +%elif HIGH_BIT_DEPTH + SPLATD m%1, m%1 + SPLATD m%2, m%2 %elif cpuflag(sse4) ; ssse3, but not faster on conroe mova m5, [pb_01] - pshufb m6, m5 - pshufb m7, m5 + pshufb m%1, m5 + pshufb m%2, m5 %else - SPLATW m6, m6 - SPLATW m7, m7 + SPLATW m%1, m%1 + SPLATW m%2, m%2 %endif %endmacro @@ -175,7 +202,7 @@ %endif ; cpuflag %endmacro -%macro QUANT_ONE_AC_MMX 4 +%macro QUANT_ONE_AC_MMX 5 mova m0, [%1] mova m2, [%2] ABSD m1, m0 @@ -191,10 +218,10 @@ psrad m1, 16 PSIGND m1, m0 mova [%1], m1 - ACCUM por, 5, 1, %4 + ACCUM por, %5, 1, %4 %endmacro -%macro QUANT_TWO_AC 4 +%macro QUANT_TWO_AC 5 %if cpuflag(sse4) mova m0, [%1 ] mova m1, [%1+mmsize] @@ -210,11 +237,11 @@ PSIGND m3, m1 mova [%1 ], m2 mova [%1+mmsize], m3 - ACCUM por, 5, 2, %4 - por m5, m3 + ACCUM por, %5, 2, %4 + por m%5, m3 %else ; !sse4 - QUANT_ONE_AC_MMX %1, %2, %3, %4 - QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize + QUANT_ONE_AC_MMX %1, %2, %3, %4, %5 + QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, 1, %5 %endif ; cpuflag %endmacro @@ -223,7 +250,7 @@ ;----------------------------------------------------------------------------- %macro QUANT_DC 2 cglobal quant_%1x%2_dc, 3,3,8 - QUANT_DC_START + QUANT_DC_START 6,7 %if %1*%2 <= mmsize/4 QUANT_ONE_DC r0, m6, m7, 0 %else @@ -244,35 +271,87 @@ cglobal quant_%1x%2, 3,3,8 %assign x 0 %rep %1*%2/(mmsize/2) - QUANT_TWO_AC r0+x, r1+x, r2+x, x + QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5 %assign x x+mmsize*2 %endrep QUANT_END RET %endmacro +%macro QUANT_4x4 2 + QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, 0, %2 + QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, 1, %2 +%endmacro + +%macro QUANT_4x4x4 0 +cglobal quant_4x4x4, 3,3,8 + QUANT_4x4 0, 5 + QUANT_4x4 64, 6 + add r0, 128 + packssdw m5, m6 + QUANT_4x4 0, 6 + QUANT_4x4 64, 7 + packssdw m6, m7 + packssdw m5, m6 + packssdw m5, m5 ; AA BB CC DD + packsswb m5, m5 ; A B C D + pxor m4, m4 + pcmpeqb m5, m4 + 
pmovmskb eax, m5 + not eax + and eax, 0xf + RET +%endmacro + INIT_XMM sse2 QUANT_DC 2, 2 QUANT_DC 4, 4 QUANT_AC 4, 4 QUANT_AC 8, 8 +QUANT_4x4x4 INIT_XMM ssse3 QUANT_DC 2, 2 QUANT_DC 4, 4 QUANT_AC 4, 4 QUANT_AC 8, 8 +QUANT_4x4x4 INIT_XMM sse4 QUANT_DC 2, 2 QUANT_DC 4, 4 QUANT_AC 4, 4 QUANT_AC 8, 8 +QUANT_4x4x4 + +INIT_YMM avx2 +QUANT_DC 4, 4 +QUANT_AC 4, 4 +QUANT_AC 8, 8 + +INIT_YMM avx2 +cglobal quant_4x4x4, 3,3,6 + QUANT_TWO_AC r0, r1, r2, 0, 4 + QUANT_TWO_AC r0+64, r1, r2, 0, 5 + add r0, 128 + packssdw m4, m5 + QUANT_TWO_AC r0, r1, r2, 0, 5 + QUANT_TWO_AC r0+64, r1, r2, 0, 1 + packssdw m5, m1 + packssdw m4, m5 + pxor m3, m3 + pcmpeqd m4, m3 + movmskps eax, m4 + mov edx, eax + shr eax, 4 + and eax, edx + xor eax, 0xf + RET %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 -%macro QUANT_ONE 4 +%macro QUANT_ONE 5 ;;; %1 (m64) dct[y][x] ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t) ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t) @@ -282,10 +361,10 @@ pmulhuw m0, %2 ; divide PSIGNW m0, m1 ; restore sign mova %1, m0 ; store - ACCUM por, 5, 0, %4 + ACCUM por, %5, 0, %4 %endmacro -%macro QUANT_TWO 7 +%macro QUANT_TWO 8 mova m1, %1 mova m3, %2 ABSW m0, m1, sign @@ -298,8 +377,8 @@ PSIGNW m2, m3 mova %1, m0 mova %2, m2 - ACCUM por, 5, 0, %7 - por m5, m2 + ACCUM por, %8, 0, %7 + ACCUM por, %8, 2, %7+mmsize %endmacro ;----------------------------------------------------------------------------- @@ -307,13 +386,14 @@ ;----------------------------------------------------------------------------- %macro QUANT_DC 2-3 0 cglobal %1, 1,1,%3 - QUANT_DC_START %if %2==1 - QUANT_ONE [r0], m6, m7, 0 + QUANT_DC_START 2,3 + QUANT_ONE [r0], m2, m3, 0, 5 %else + QUANT_DC_START 4,6 %assign x 0 %rep %2/2 - QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x + QUANT_TWO [r0+x], [r0+x+mmsize], m4, m4, m6, m6, x, 5 %assign x x+mmsize*2 %endrep %endif @@ -326,15 +406,57 @@ ;----------------------------------------------------------------------------- %macro QUANT_AC 2 cglobal %1, 3,3 +%if %2==1 + QUANT_ONE [r0], [r1], [r2], 0, 5 +%else %assign x 0 %rep %2/2 - QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x + QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x, 5 %assign x x+mmsize*2 %endrep +%endif QUANT_END RET %endmacro +%macro QUANT_4x4 2 +%if UNIX64 + QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], m8, m9, m10, m11, mmsize*0, %2 +%else + QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], [r1+mmsize*0], [r1+mmsize*1], [r2+mmsize*0], [r2+mmsize*1], mmsize*0, %2 +%if mmsize==8 + QUANT_TWO [r0+%1+mmsize*2], [r0+%1+mmsize*3], [r1+mmsize*2], [r1+mmsize*3], [r2+mmsize*2], [r2+mmsize*3], mmsize*2, %2 +%endif +%endif +%endmacro + +%macro QUANT_4x4x4 0 +cglobal quant_4x4x4, 3,3,7 +%if UNIX64 + mova m8, [r1+mmsize*0] + mova m9, [r1+mmsize*1] + mova m10, [r2+mmsize*0] + mova m11, [r2+mmsize*1] +%endif + QUANT_4x4 0, 4 + QUANT_4x4 32, 5 + packssdw m4, m5 + QUANT_4x4 64, 5 + QUANT_4x4 96, 6 + packssdw m5, m6 + packssdw m4, m5 +%if mmsize == 16 + packssdw m4, m4 ; AA BB CC DD +%endif + packsswb m4, m4 ; A B C D + pxor m3, m3 + pcmpeqb m4, m3 + pmovmskb eax, m4 + not eax + and eax, 0xf + RET +%endmacro + INIT_MMX mmx2 QUANT_DC quant_2x2_dc, 1 %if ARCH_X86_64 == 0 ; not needed because sse2 is faster @@ -342,26 +464,54 @@ INIT_MMX mmx QUANT_AC quant_4x4, 4 QUANT_AC quant_8x8, 16 +QUANT_4x4x4 %endif INIT_XMM sse2 -QUANT_DC quant_4x4_dc, 2, 8 +QUANT_DC quant_4x4_dc, 2, 7 QUANT_AC quant_4x4, 2 QUANT_AC quant_8x8, 8 +QUANT_4x4x4 INIT_XMM ssse3 -QUANT_DC quant_4x4_dc, 2, 8 
+QUANT_DC quant_4x4_dc, 2, 7 QUANT_AC quant_4x4, 2 QUANT_AC quant_8x8, 8 +QUANT_4x4x4 INIT_MMX ssse3 QUANT_DC quant_2x2_dc, 1 INIT_XMM sse4 ;Not faster on Conroe, so only used in SSE4 versions -QUANT_DC quant_4x4_dc, 2, 8 +QUANT_DC quant_4x4_dc, 2, 7 QUANT_AC quant_4x4, 2 QUANT_AC quant_8x8, 8 + +INIT_YMM avx2 +QUANT_AC quant_4x4, 1 +QUANT_AC quant_8x8, 4 +QUANT_DC quant_4x4_dc, 1, 6 + +INIT_YMM avx2 +cglobal quant_4x4x4, 3,3,6 + mova m2, [r1] + mova m3, [r2] + QUANT_ONE [r0+ 0], m2, m3, 0, 4 + QUANT_ONE [r0+32], m2, m3, 0, 5 + packssdw m4, m5 + QUANT_ONE [r0+64], m2, m3, 0, 5 + QUANT_ONE [r0+96], m2, m3, 0, 1 + packssdw m5, m1 + packssdw m4, m5 + pxor m3, m3 + pcmpeqd m4, m3 + movmskps eax, m4 + mov edx, eax + shr eax, 4 + and eax, edx + xor eax, 0xf + RET %endif ; !HIGH_BIT_DEPTH @@ -370,56 +520,81 @@ ; dequant ;============================================================================= -%macro DEQUANT16_L 3 +%macro DEQUANT16_L 4 ;;; %1 dct[y][x] ;;; %2,%3 dequant_mf[i_mf][y][x] ;;; m2 i_qbits - mova m0, %2 %if HIGH_BIT_DEPTH - pmaddwd m0, %1 - pslld m0, m2 + mova m0, %1 + mova m1, %4 + pmaddwd m0, %2 + pmaddwd m1, %3 + pslld m0, xm2 + pslld m1, xm2 + mova %1, m0 + mova %4, m1 %else + mova m0, %2 packssdw m0, %3 - pmullw m0, %1 - psllw m0, m2 +%if mmsize==32 + vpermq m0, m0, q3120 %endif + pmullw m0, %1 + psllw m0, xm2 mova %1, m0 +%endif %endmacro -%macro DEQUANT32_R 3 +%macro DEQUANT32_R 4 ;;; %1 dct[y][x] ;;; %2,%3 dequant_mf[i_mf][y][x] ;;; m2 -i_qbits ;;; m3 f ;;; m4 0 - mova m0, %1 %if HIGH_BIT_DEPTH + mova m0, %1 + mova m1, %4 pmadcswd m0, m0, %2, m3 - psrad m0, m2 + pmadcswd m1, m1, %3, m3 + psrad m0, xm2 + psrad m1, xm2 + mova %1, m0 + mova %4, m1 %else +%if mmsize == 32 + pmovzxwd m0, %1 + pmovzxwd m1, %4 +%else + mova m0, %1 punpckhwd m1, m0, m4 punpcklwd m0, m4 +%endif pmadcswd m0, m0, %2, m3 pmadcswd m1, m1, %3, m3 - psrad m0, m2 - psrad m1, m2 + psrad m0, xm2 + psrad m1, xm2 packssdw m0, m1 +%if mmsize == 32 + vpermq m0, m0, q3120 %endif mova %1, m0 +%endif %endmacro %macro DEQUANT_LOOP 3 -%if 8*(%2-2*%3) +%if 8*(%2-2*%3) > 0 mov t0d, 8*(%2-2*%3) %%loop: - %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3] - %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3] + %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3], [r0+(t0+ 4*%3)*SIZEOF_PIXEL] + %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3], [r0+(t0+12*%3)*SIZEOF_PIXEL] sub t0d, 16*%3 jge %%loop RET %else - %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3] - %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3] +%if mmsize < 32 + %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3], [r0+(12*%3)*SIZEOF_PIXEL] +%endif + %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3], [r0+( 4*%3)*SIZEOF_PIXEL] RET %endif %endmacro @@ -441,10 +616,8 @@ %endrep %endmacro -%if WIN64 +%if ARCH_X86_64 DECLARE_REG_TMP 6,3,2 -%elif ARCH_X86_64 - DECLARE_REG_TMP 4,3,2 %else DECLARE_REG_TMP 2,0,1 %endif @@ -453,8 +626,8 @@ movifnidn t2d, r2m imul t0d, t2d, 0x2b shr t0d, 8 ; i_qbits = i_qp / 6 - lea t1, [t0*3] - sub t2d, t1d + lea t1d, [t0*5] + sub t2d, t0d sub t2d, t1d ; i_mf = i_qp % 6 shl t2d, %1 %if ARCH_X86_64 @@ -476,19 +649,19 @@ DEQUANT_START %2+2, %2 .lshift: - movd m2, t0d + movd xm2, t0d DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3 .rshift32: neg t0d - movd m2, t0d mova m3, [pd_1] + movd xm2, t0d + pslld m3, xm2 pxor m4, m4 - pslld m3, m2 psrld m3, 1 DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3 -%if HIGH_BIT_DEPTH == 0 && notcpuflag(avx) +%if HIGH_BIT_DEPTH == 0 && (notcpuflag(avx) || mmsize == 32) cglobal 
dequant_%1x%1_flat16, 0,3 movifnidn t2d, r2m %if %1 == 8 @@ -498,8 +671,8 @@ %endif imul t0d, t2d, 0x2b shr t0d, 8 ; i_qbits = i_qp / 6 - lea t1, [t0*3] - sub t2d, t1d + lea t1d, [t0*5] + sub t2d, t0d sub t2d, t1d ; i_mf = i_qp % 6 shl t2d, %2 %ifdef PIC @@ -509,23 +682,41 @@ lea r1, [dequant%1_scale + t2] %endif movifnidn r0, r0mp - movd m4, t0d + movd xm4, t0d %if %1 == 4 %if mmsize == 8 DEQUANT16_FLAT [r1], 0, 16 DEQUANT16_FLAT [r1+8], 8, 24 -%else +%elif mmsize == 16 DEQUANT16_FLAT [r1], 0, 16 +%else + vbroadcasti128 m0, [r1] + psllw m0, xm4 + pmullw m0, [r0] + mova [r0], m0 %endif %elif mmsize == 8 DEQUANT16_FLAT [r1], 0, 8, 64, 72 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104 -%else +%elif mmsize == 16 DEQUANT16_FLAT [r1], 0, 64 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112 DEQUANT16_FLAT [r1+32], 32, 96 +%else + mova m1, [r1+ 0] + mova m2, [r1+32] + psllw m1, xm4 + psllw m2, xm4 + pmullw m0, m1, [r0+ 0] + pmullw m3, m2, [r0+32] + pmullw m4, m1, [r0+64] + pmullw m5, m2, [r0+96] + mova [r0+ 0], m0 + mova [r0+32], m3 + mova [r0+64], m4 + mova [r0+96], m5 %endif RET %endif ; !HIGH_BIT_DEPTH && !AVX @@ -533,11 +724,14 @@ %if HIGH_BIT_DEPTH INIT_XMM sse2 -DEQUANT 4, 4, 1 -DEQUANT 8, 6, 1 +DEQUANT 4, 4, 2 +DEQUANT 8, 6, 2 INIT_XMM xop -DEQUANT 4, 4, 1 -DEQUANT 8, 6, 1 +DEQUANT 4, 4, 2 +DEQUANT 8, 6, 2 +INIT_YMM avx2 +DEQUANT 4, 4, 4 +DEQUANT 8, 6, 4 %else %if ARCH_X86_64 == 0 INIT_MMX mmx @@ -553,6 +747,9 @@ INIT_XMM xop DEQUANT 4, 4, 2 DEQUANT 8, 6, 2 +INIT_YMM avx2 +DEQUANT 4, 4, 4 +DEQUANT 8, 6, 4 %endif %macro DEQUANT_DC 2 @@ -560,55 +757,62 @@ DEQUANT_START 6, 6 .lshift: - movd m3, [r1] - movd m2, t0d - pslld m3, m2 - SPLAT%1 m3, m3, 0 -%assign x 0 -%rep SIZEOF_PIXEL*16/mmsize - mova m0, [r0+mmsize*0+x] - mova m1, [r0+mmsize*1+x] - %2 m0, m3 - %2 m1, m3 - mova [r0+mmsize*0+x], m0 - mova [r0+mmsize*1+x], m1 -%assign x x+mmsize*2 +%if cpuflag(avx2) + vpbroadcastdct m3, [r1] +%else + movd xm3, [r1] + SPLAT%1 m3, xm3 +%endif + movd xm2, t0d + pslld m3, xm2 +%assign %%x 0 +%rep SIZEOF_PIXEL*32/mmsize + %2 m0, m3, [r0+%%x] + mova [r0+%%x], m0 +%assign %%x %%x+mmsize %endrep RET .rshift32: - neg t0d - movd m3, t0d - mova m4, [p%1_1] - mova m5, m4 - pslld m4, m3 - psrld m4, 1 - movd m2, [r1] -%assign x 0 + neg t0d +%if cpuflag(avx2) + vpbroadcastdct m2, [r1] +%else + movd xm2, [r1] +%endif + mova m5, [p%1_1] + movd xm3, t0d + pslld m4, m5, xm3 + psrld m4, 1 %if HIGH_BIT_DEPTH - pshufd m2, m2, 0 +%if notcpuflag(avx2) + pshufd m2, m2, 0 +%endif +%assign %%x 0 %rep SIZEOF_PIXEL*32/mmsize - mova m0, [r0+x] - pmadcswd m0, m0, m2, m4 - psrad m0, m3 - mova [r0+x], m0 -%assign x x+mmsize + pmadcswd m0, m2, [r0+%%x], m4 + psrad m0, xm3 + mova [r0+%%x], m0 +%assign %%x %%x+mmsize %endrep %else ; !HIGH_BIT_DEPTH +%if notcpuflag(avx2) PSHUFLW m2, m2, 0 +%endif punpcklwd m2, m4 +%assign %%x 0 %rep SIZEOF_PIXEL*32/mmsize - mova m0, [r0+x] + mova m0, [r0+%%x] punpckhwd m1, m0, m5 punpcklwd m0, m5 pmaddwd m0, m2 pmaddwd m1, m2 - psrad m0, m3 - psrad m1, m3 + psrad m0, xm3 + psrad m1, xm3 packssdw m0, m1 - mova [r0+x], m0 -%assign x x+mmsize + mova [r0+%%x], m0 +%assign %%x %%x+mmsize %endrep %endif ; !HIGH_BIT_DEPTH RET @@ -619,6 +823,8 @@ DEQUANT_DC d, pmaddwd INIT_XMM xop DEQUANT_DC d, pmaddwd +INIT_YMM avx2 +DEQUANT_DC d, pmaddwd %else %if ARCH_X86_64 == 0 INIT_MMX mmx2 @@ -628,6 +834,8 @@ DEQUANT_DC w, pmullw INIT_XMM avx DEQUANT_DC w, pmullw +INIT_YMM avx2 +DEQUANT_DC w, pmullw %endif ; t4 is eax for return value. 
@@ -757,31 +965,29 @@ ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size ) ;----------------------------------------------------------------------------- %macro DENOISE_DCT 0 -cglobal denoise_dct, 4,4,8 - pxor m6, m6 +cglobal denoise_dct, 4,4,6 + pxor m5, m5 movsxdifnidn r3, r3d .loop: mova m2, [r0+r3*4-2*mmsize] mova m3, [r0+r3*4-1*mmsize] ABSD m0, m2 ABSD m1, m3 - mova m4, m0 - mova m5, m1 + paddd m4, m0, [r1+r3*4-2*mmsize] psubd m0, [r2+r3*4-2*mmsize] + mova [r1+r3*4-2*mmsize], m4 + paddd m4, m1, [r1+r3*4-1*mmsize] psubd m1, [r2+r3*4-1*mmsize] - pcmpgtd m7, m0, m6 - pand m0, m7 - pcmpgtd m7, m1, m6 - pand m1, m7 + mova [r1+r3*4-1*mmsize], m4 + pcmpgtd m4, m0, m5 + pand m0, m4 + pcmpgtd m4, m1, m5 + pand m1, m4 PSIGND m0, m2 PSIGND m1, m3 mova [r0+r3*4-2*mmsize], m0 mova [r0+r3*4-1*mmsize], m1 - paddd m4, [r1+r3*4-2*mmsize] - paddd m5, [r1+r3*4-1*mmsize] - mova [r1+r3*4-2*mmsize], m4 - mova [r1+r3*4-1*mmsize], m5 - sub r3, mmsize/2 + sub r3d, mmsize/2 jg .loop RET %endmacro @@ -796,6 +1002,8 @@ DENOISE_DCT INIT_XMM avx DENOISE_DCT +INIT_YMM avx2 +DENOISE_DCT %else ; !HIGH_BIT_DEPTH @@ -845,6 +1053,27 @@ INIT_XMM avx DENOISE_DCT +INIT_YMM avx2 +cglobal denoise_dct, 4,4,4 + pxor m3, m3 + movsxdifnidn r3, r3d +.loop: + mova m1, [r0+r3*2-mmsize] + pabsw m0, m1 + psubusw m2, m0, [r2+r3*2-mmsize] + vpermq m0, m0, q3120 + psignw m2, m1 + mova [r0+r3*2-mmsize], m2 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m1, [r1+r3*4-2*mmsize] + paddd m0, [r1+r3*4-1*mmsize] + mova [r1+r3*4-2*mmsize], m1 + mova [r1+r3*4-1*mmsize], m0 + sub r3, mmsize/2 + jg .loop + RET + %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- @@ -854,56 +1083,55 @@ %macro DECIMATE_MASK 5 %if mmsize==16 %if HIGH_BIT_DEPTH - movdqa xmm0, [%3+ 0] - movdqa xmm1, [%3+32] - packssdw xmm0, [%3+16] - packssdw xmm1, [%3+48] - ABSW2 xmm0, xmm1, xmm0, xmm1, xmm3, xmm4 + movdqa m0, [%3+ 0] + movdqa m1, [%3+32] + packssdw m0, [%3+16] + packssdw m1, [%3+48] + ABSW2 m0, m1, m0, m1, m3, m4 %else - ABSW xmm0, [%3+ 0], xmm3 - ABSW xmm1, [%3+16], xmm4 + ABSW m0, [%3+ 0], m3 + ABSW m1, [%3+16], m4 %endif - packsswb xmm0, xmm1 - pxor xmm2, xmm2 - pcmpeqb xmm2, xmm0 - pcmpgtb xmm0, %4 - pmovmskb %1, xmm2 - pmovmskb %2, xmm0 - + packsswb m0, m1 + pxor m2, m2 + pcmpeqb m2, m0 + pcmpgtb m0, %4 + pmovmskb %1, m2 + pmovmskb %2, m0 %else ; mmsize==8 %if HIGH_BIT_DEPTH - movq mm0, [%3+ 0] - movq mm1, [%3+16] - movq mm2, [%3+32] - movq mm3, [%3+48] - packssdw mm0, [%3+ 8] - packssdw mm1, [%3+24] - packssdw mm2, [%3+40] - packssdw mm3, [%3+56] + movq m0, [%3+ 0] + movq m1, [%3+16] + movq m2, [%3+32] + movq m3, [%3+48] + packssdw m0, [%3+ 8] + packssdw m1, [%3+24] + packssdw m2, [%3+40] + packssdw m3, [%3+56] %else - movq mm0, [%3+ 0] - movq mm1, [%3+ 8] - movq mm2, [%3+16] - movq mm3, [%3+24] -%endif - ABSW2 mm0, mm1, mm0, mm1, mm6, mm7 - ABSW2 mm2, mm3, mm2, mm3, mm6, mm7 - packsswb mm0, mm1 - packsswb mm2, mm3 - pxor mm4, mm4 - pxor mm6, mm6 - pcmpeqb mm4, mm0 - pcmpeqb mm6, mm2 - pcmpgtb mm0, %4 - pcmpgtb mm2, %4 - pmovmskb %5, mm4 - pmovmskb %1, mm6 - shl %1, 8 - or %1, %5 - pmovmskb %5, mm0 - pmovmskb %2, mm2 - shl %2, 8 - or %2, %5 + movq m0, [%3+ 0] + movq m1, [%3+ 8] + movq m2, [%3+16] + movq m3, [%3+24] +%endif + ABSW2 m0, m1, m0, m1, m6, m7 + ABSW2 m2, m3, m2, m3, m6, m7 + packsswb m0, m1 + packsswb m2, m3 + pxor m4, m4 + pxor m6, m6 + pcmpeqb m4, m0 + pcmpeqb m6, m2 + pcmpgtb m0, %4 + pcmpgtb m2, %4 + pmovmskb %5, m4 + pmovmskb %1, m6 + shl %1, 8 + or %1, %5 + pmovmskb 
%5, m0 + pmovmskb %2, m2 + shl %2, 8 + or %2, %5 %endif %endmacro @@ -912,8 +1140,6 @@ %macro DECIMATE4x4 1 -;A LUT is faster than bsf on older AMD processors. -;This is not true for score64. cglobal decimate_score%1, 1,3 %ifdef PIC lea r4, [decimate_table4] @@ -932,7 +1158,6 @@ %if %1==15 shr edx, 1 %endif -%if cpuflag(slowctz) movzx ecx, dl movzx eax, byte [mask_table + rcx] cmp edx, ecx @@ -940,19 +1165,11 @@ bsr ecx, ecx shr edx, 1 shr edx, cl - bsf ecx, edx + tzcnt ecx, edx shr edx, 1 shr edx, cl add al, byte [table + rcx] add al, byte [mask_table + rdx] -%else -.loop: - tzcnt ecx, edx - shr edx, cl - add al, byte [table + rcx] - shr edx, 1 - jne .loop -%endif .ret: REP_RET .ret9: @@ -965,22 +1182,36 @@ INIT_MMX mmx2 DECIMATE4x4 15 DECIMATE4x4 16 -INIT_MMX mmx2, slowctz -DECIMATE4x4 15 -DECIMATE4x4 16 %endif INIT_XMM sse2 DECIMATE4x4 15 DECIMATE4x4 16 -INIT_XMM sse2, slowctz -DECIMATE4x4 15 -DECIMATE4x4 16 INIT_XMM ssse3 DECIMATE4x4 15 DECIMATE4x4 16 -INIT_XMM ssse3, slowctz -DECIMATE4x4 15 -DECIMATE4x4 16 + +; 2x gt1 output, 2x nz output, 1x mask +%macro DECIMATE_MASK64_AVX2 5 + pabsw m0, [r0+ 0] + pabsw m2, [r0+32] + pabsw m1, [r0+64] + pabsw m3, [r0+96] + packsswb m0, m2 + packsswb m1, m3 + pcmpgtb m2, m0, %5 ; the > 1 checks don't care about order, so + pcmpgtb m3, m1, %5 ; we can save latency by doing them here + pmovmskb %1, m2 + pmovmskb %2, m3 + or %1, %2 + jne .ret9 + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + pxor m4, m4 + pcmpeqb m0, m4 + pcmpeqb m1, m4 + pmovmskb %3, m0 + pmovmskb %4, m1 +%endmacro %macro DECIMATE8x8 0 @@ -993,33 +1224,44 @@ %define table decimate_table8 %endif mova m5, [pb_1] +%if mmsize==32 + DECIMATE_MASK64_AVX2 eax, r2d, r1d, r3d, m5 + shl r3, 32 + or r1, r3 + xor r1, -1 + je .ret +%else DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null - test eax, eax + test eax, eax jne .ret9 DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null - shl r2d, 16 - or r1d, r2d + shl r2d, 16 + or r1d, r2d DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null shl r2, 32 - or eax, r3d + or eax, r3d or r1, r2 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null shl r2, 48 or r1, r2 xor r1, -1 je .ret - add eax, r3d + add eax, r3d jne .ret9 +%endif + mov al, -6 .loop: tzcnt rcx, r1 shr r1, cl add al, byte [table + rcx] + jge .ret9 shr r1, 1 jne .loop + add al, 6 .ret: REP_RET .ret9: - mov eax, 9 + mov eax, 9 RET %else ; ARCH @@ -1029,6 +1271,13 @@ cglobal decimate_score64, 1,5 %endif mova m5, [pb_1] +%if mmsize==32 + DECIMATE_MASK64_AVX2 r0, r2, r3, r4, m5 + xor r3, -1 + je .tryret + xor r4, -1 +.cont: +%else DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5 test r2, r2 jne .ret9 @@ -1045,22 +1294,24 @@ xor r4, -1 .cont: add r0, r2 - jne .ret9 ;r0 is zero at this point, so we don't need to zero it + jne .ret9 +%endif + mov al, -6 .loop: tzcnt ecx, r3 test r3, r3 je .largerun shrd r3, r4, cl shr r4, cl - add r0b, byte [decimate_table8 + ecx] + add al, byte [decimate_table8 + ecx] + jge .ret9 shrd r3, r4, 1 shr r4, 1 - cmp r0, 6 ;score64's threshold is never higher than 6 - jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd test r3, r3 jne .loop test r4, r4 jne .loop + add al, 6 .ret: REP_RET .tryret: @@ -1077,6 +1328,7 @@ shr r3, cl shr r3, 1 jne .loop + add al, 6 RET %endif ; ARCH @@ -1090,6 +1342,8 @@ DECIMATE8x8 INIT_XMM ssse3 DECIMATE8x8 +INIT_YMM avx2 +DECIMATE8x8 ;----------------------------------------------------------------------------- ; int coeff_last( dctcoef *dct ) @@ -1281,38 +1535,38 @@ RET 
%if ARCH_X86_64 == 0 -cglobal coeff_last64, 1, 5-mmsize/16 +cglobal coeff_last64, 1, 4-mmsize/16 pxor m2, m2 - LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d - LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF* 48, r4d - shl r3d, 16 - or r2d, r3d - xor r2d, -1 + LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 32, r3d + LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 48, r3d + shl r2d, 16 + or r1d, r2d + xor r1d, -1 jne .secondhalf - LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r4d - LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*16, r4d - shl r3d, 16 - or r1d, r3d + LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r3d + LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16, r3d + shl r2d, 16 + or r1d, r2d not r1d BSR eax, r1d, 0x1f RET .secondhalf: - BSR eax, r2d, 0x1f + BSR eax, r1d, 0x1f add eax, 32 RET %else -cglobal coeff_last64, 1,4 +cglobal coeff_last64, 1,3 pxor m2, m2 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16 - LAST_MASK 16, r3d, r0+SIZEOF_DCTCOEF*32 - LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48 shl r2d, 16 - shl r0d, 16 or r1d, r2d - or r3d, r0d - shl r3, 32 - or r1, r3 + LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*32 + LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48 + shl r0d, 16 + or r2d, r0d + shl r2, 32 + or r1, r2 not r1 BSR rax, r1, 0x3f RET @@ -1328,10 +1582,63 @@ INIT_XMM sse2, lzcnt COEFF_LAST +%macro LAST_MASK_AVX2 2 +%if HIGH_BIT_DEPTH + mova m0, [%2+ 0] + packssdw m0, [%2+32] + mova m1, [%2+64] + packssdw m1, [%2+96] + packsswb m0, m1 + mova m1, [deinterleave_shufd] + vpermd m0, m1, m0 +%else + mova m0, [%2+ 0] + packsswb m0, [%2+32] + vpermq m0, m0, q3120 +%endif + pcmpeqb m0, m2 + pmovmskb %1, m0 +%endmacro + +%if ARCH_X86_64 == 0 +INIT_YMM avx2,lzcnt +cglobal coeff_last64, 1,2 + pxor m2, m2 + LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32 + xor r1d, -1 + jne .secondhalf + LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0 + not r1d + BSR eax, r1d, 0x1f + RET +.secondhalf: + BSR eax, r1d, 0x1f + add eax, 32 + RET +%else +INIT_YMM avx2,lzcnt +cglobal coeff_last64, 1,3 + pxor m2, m2 + LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0 + LAST_MASK_AVX2 r2d, r0+SIZEOF_DCTCOEF*32 + shl r2, 32 + or r1, r2 + not r1 + BSR rax, r1, 0x3f + RET +%endif + ;----------------------------------------------------------------------------- ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel ) ;----------------------------------------------------------------------------- +struc levelrun + .last: resd 1 + .mask: resd 1 + align 16, resb 1 + .level: resw 16 +endstruc + ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args %if WIN64 DECLARE_REG_TMP 3,1,2,0,4,5,6 @@ -1346,6 +1653,7 @@ movifnidn t0, r0mp movifnidn t1, r1mp pxor m2, m2 + xor t3d, t3d LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d %if %1==15 shr t5d, 1 @@ -1355,7 +1663,7 @@ and t5d, 0xf %endif xor t5d, (1<<%1)-1 - mov [t1+4], t5d + mov [t1+levelrun.mask], t5d shl t5d, 32-%1 mov t4d, %1-1 LZCOUNT t3d, t5d, 0x1f @@ -1363,7 +1671,7 @@ add t5d, t5d sub t4d, t3d shl t5d, t3b - mov [t1], t4d + mov [t1+levelrun.last], t4d .loop: LZCOUNT t3d, t5d, 0x1f %if HIGH_BIT_DEPTH @@ -1374,9 +1682,9 @@ inc t3d shl t5d, t3b %if HIGH_BIT_DEPTH - mov [t1+t6*4+ 8], t2d + mov [t1+t6*4+levelrun.level], t2d %else - mov [t1+t6*2+ 8], t2w + mov [t1+t6*2+levelrun.level], t2w %endif inc t6d sub t4d, t3d @@ -1406,3 +1714,133 @@ INIT_MMX mmx2, lzcnt COEFF_LEVELRUN 4 COEFF_LEVELRUN 8 + +; Similar to the one above, but saves the DCT +; coefficients in m0/m1 so we don't have to load +; them later. 
+%macro LAST_MASK_LUT 3 + pxor xm5, xm5 +%if %1 <= 8 + mova m0, [%3] + packsswb m2, m0, m0 +%else + mova xm0, [%3+ 0] + mova xm1, [%3+16] + packsswb xm2, xm0, xm1 +%if mmsize==32 + vinserti128 m0, m0, xm1, 1 +%endif +%endif + pcmpeqb xm2, xm5 + pmovmskb %2, xm2 +%endmacro + +%macro COEFF_LEVELRUN_LUT 1 +cglobal coeff_level_run%1,2,4+(%1/9) +%ifdef PIC + lea r5, [$$] + %define GLOBAL +r5-$$ +%else + %define GLOBAL +%endif + LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF +%if %1==15 + shr eax, 1 +%elif %1==8 + and eax, 0xff +%elif %1==4 + and eax, 0xf +%endif + xor eax, (1<<%1)-1 + mov [r1+levelrun.mask], eax +%if %1==15 + add eax, eax +%endif +%if %1 > 8 +%if ARCH_X86_64 + mov r4d, eax + shr r4d, 8 +%else + movzx r4d, ah ; first 8 bits +%endif +%endif + movzx r2d, al ; second 8 bits + shl eax, 32-%1-(%1&1) + LZCOUNT eax, eax, 0x1f + mov r3d, %1-1 + sub r3d, eax + mov [r1+levelrun.last], r3d +; Here we abuse pshufb, combined with a lookup table, to do a gather +; operation based on a bitmask. For example: +; +; dct 15-8 (input): 0 0 4 0 0 -2 1 0 +; dct 7-0 (input): 0 0 -1 0 0 0 0 15 +; bitmask 1: 0 0 1 0 0 1 1 0 +; bitmask 2: 0 0 1 0 0 0 0 1 +; gather 15-8: 4 -2 1 __ __ __ __ __ +; gather 7-0: -1 15 __ __ __ __ __ __ +; levels (output): 4 -2 1 -1 15 __ __ __ __ __ __ __ __ __ __ __ +; +; The overlapping, dependent stores almost surely cause a mess of +; forwarding issues, but it's still enormously faster. +%if %1 > 8 + movzx eax, byte [popcnt_table+r4 GLOBAL] + movzx r3d, byte [popcnt_table+r2 GLOBAL] +%if mmsize==16 + movh m3, [dct_coef_shuffle+r4*8 GLOBAL] + movh m2, [dct_coef_shuffle+r2*8 GLOBAL] + mova m4, [pw_256] +; Storing 8 bytes of shuffle constant and converting it (unpack + or) +; is neutral to slightly faster in local speed measurements, but it +; cuts the table size in half, which is surely a big cache win. + punpcklbw m3, m3 + punpcklbw m2, m2 + por m3, m4 + por m2, m4 + pshufb m1, m3 + pshufb m0, m2 + mova [r1+levelrun.level], m1 +; This obnoxious unaligned store messes with store forwarding and +; stalls the CPU to no end, but merging the two registers before +; storing requires a variable 128-bit shift. Emulating this does +; work, but requires a lot of ops and the gain is tiny and +; inconsistent, so we'll err on the side of fewer instructions. + movu [r1+rax*2+levelrun.level], m0 +%else ; mmsize==32 + movq xm2, [dct_coef_shuffle+r4*8 GLOBAL] + vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1 + punpcklbw m2, m2 + por m2, [pw_256] + pshufb m0, m2 + vextracti128 [r1+levelrun.level], m0, 1 + movu [r1+rax*2+levelrun.level], xm0 +%endif + add eax, r3d +%else + movzx eax, byte [popcnt_table+r2 GLOBAL] + movh m1, [dct_coef_shuffle+r2*8 GLOBAL] + punpcklbw m1, m1 + por m1, [pw_256] + pshufb m0, m1 + mova [r1+levelrun.level], m0 +%endif + RET +%endmacro + +%if HIGH_BIT_DEPTH==0 +INIT_MMX ssse3 +COEFF_LEVELRUN_LUT 4 +INIT_XMM ssse3 +COEFF_LEVELRUN_LUT 8 +COEFF_LEVELRUN_LUT 15 +COEFF_LEVELRUN_LUT 16 +INIT_MMX ssse3, lzcnt +COEFF_LEVELRUN_LUT 4 +INIT_XMM ssse3, lzcnt +COEFF_LEVELRUN_LUT 8 +COEFF_LEVELRUN_LUT 15 +COEFF_LEVELRUN_LUT 16 +INIT_XMM avx2, lzcnt +COEFF_LEVELRUN_LUT 15 +COEFF_LEVELRUN_LUT 16 +%endif
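Reviewer note: the pshufb/lookup-table gather above is dense, so here is a rough scalar model of what coeff_level_run appears to compute, using the field names from the levelrun struc. The struct layout, the highest-index-first level order and the return value are read off the assembly and should be treated as assumptions, not a documented contract.

#include <stdint.h>

typedef struct {
    int      last;      /* index of the last (highest) nonzero coefficient */
    uint32_t mask;      /* bit i set when dct[i] != 0                      */
    int16_t  level[16]; /* nonzero levels, gathered highest index first    */
} levelrun_ref_t;       /* hypothetical mirror of the levelrun struc       */

static int coeff_level_run_ref( const int16_t *dct, int n, levelrun_ref_t *rl )
{
    int total = 0;
    rl->mask = 0;
    rl->last = 0;
    for( int i = n-1; i >= 0; i-- )
        if( dct[i] )
        {
            if( !total )
                rl->last = i;            /* first hit while scanning backwards */
            rl->mask |= 1u << i;
            rl->level[total++] = dct[i]; /* the gather step the pshufb trick vectorizes */
        }
    return total;                        /* number of nonzero levels */
}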
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/quant.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/quant.h
Changed
@@ -31,19 +31,27 @@ int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias ); int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_mmx( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_sse2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_sse2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_ssse3( dctcoef dct[4], int mf, int bias ); int x264_quant_4x4_dc_ssse3( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_ssse3( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_ssse3( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_ssse3( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_sse4( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_dc_sse4( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_sse4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); +int x264_quant_4x4_avx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4_dc_avx2( dctcoef dct[16], int mf, int bias ); +int x264_quant_8x8_avx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); +int x264_quant_4x4x4_avx2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); @@ -56,10 +64,15 @@ void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf ); int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf ); int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf ); @@ -68,21 +81,17 @@ void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_avx ( dctcoef 
*dct, uint32_t *sum, udctcoef *offset, int size ); +void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); int x264_decimate_score15_mmx2( dctcoef *dct ); int x264_decimate_score15_sse2( dctcoef *dct ); int x264_decimate_score15_ssse3( dctcoef *dct ); int x264_decimate_score16_mmx2( dctcoef *dct ); int x264_decimate_score16_sse2( dctcoef *dct ); int x264_decimate_score16_ssse3( dctcoef *dct ); -int x264_decimate_score15_mmx2_slowctz( dctcoef *dct ); -int x264_decimate_score15_sse2_slowctz( dctcoef *dct ); -int x264_decimate_score15_ssse3_slowctz( dctcoef *dct ); -int x264_decimate_score16_mmx2_slowctz( dctcoef *dct ); -int x264_decimate_score16_sse2_slowctz( dctcoef *dct ); -int x264_decimate_score16_ssse3_slowctz( dctcoef *dct ); int x264_decimate_score64_mmx2( dctcoef *dct ); int x264_decimate_score64_sse2( dctcoef *dct ); int x264_decimate_score64_ssse3( dctcoef *dct ); +int x264_decimate_score64_avx2( int16_t *dct ); int x264_coeff_last4_mmx2( dctcoef *dct ); int x264_coeff_last8_mmx2( dctcoef *dct ); int x264_coeff_last15_mmx2( dctcoef *dct ); @@ -98,18 +107,29 @@ int x264_coeff_last15_sse2_lzcnt( dctcoef *dct ); int x264_coeff_last16_sse2_lzcnt( dctcoef *dct ); int x264_coeff_last64_sse2_lzcnt( dctcoef *dct ); +int x264_coeff_last64_avx2_lzcnt( dctcoef *dct ); int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac ); int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac ); int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced );
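For context on the new quant_4x4x4 entry points declared above: they quantize four 4x4 blocks in one call, and judging from the pmovmskb/not/and 0xf sequence in the assembly they return a 4-bit mask with bit b set when block b keeps at least one nonzero level. A hedged scalar sketch of that contract follows; the formula mirrors the existing 8-bit quant_4x4 path, and the return-value semantics are an inference, not documentation.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical reference for quant_4x4x4: quantize dct[0..3] in place and
 * return a per-block nonzero mask. */
static int quant_4x4x4_ref( int16_t dct[4][16], const uint16_t mf[16],
                            const uint16_t bias[16] )
{
    int nz_mask = 0;
    for( int b = 0; b < 4; b++ )
        for( int i = 0; i < 16; i++ )
        {
            int coef  = dct[b][i];
            int level = (int)(( (uint64_t)(abs(coef) + bias[i]) * mf[i] ) >> 16);
            dct[b][i] = coef < 0 ? -level : level;
            if( level )
                nz_mask |= 1 << b;       /* bit b: block b has a nonzero level */
        }
    return nz_mask;
}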
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/sad-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/sad-a.asm
Changed
@@ -29,6 +29,12 @@ %include "x86inc.asm" %include "x86util.asm" +SECTION_RODATA 32 + +pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1 +deinterleave_sadx4: dd 0,4,2,6 +hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11 + SECTION .text cextern pb_3 @@ -556,6 +562,65 @@ INIT_MMX ssse3 INTRA_SAD_8x8C +INIT_YMM avx2 +cglobal intra_sad_x3_8x8c, 3,3,7 + vpbroadcastq m2, [r1 - FDEC_STRIDE] ; V pred + add r1, FDEC_STRIDE*4-1 + pxor xm5, xm5 + punpckldq xm3, xm2, xm5 ; V0 _ V1 _ + movd xm0, [r1 + FDEC_STRIDE*-1 - 3] + movd xm1, [r1 + FDEC_STRIDE* 3 - 3] + pinsrb xm0, [r1 + FDEC_STRIDE*-4], 0 + pinsrb xm1, [r1 + FDEC_STRIDE* 0], 0 + pinsrb xm0, [r1 + FDEC_STRIDE*-3], 1 + pinsrb xm1, [r1 + FDEC_STRIDE* 1], 1 + pinsrb xm0, [r1 + FDEC_STRIDE*-2], 2 + pinsrb xm1, [r1 + FDEC_STRIDE* 2], 2 + punpcklqdq xm0, xm1 ; H0 _ H1 _ + vinserti128 m3, m3, xm0, 1 ; V0 V1 H0 H1 + pshufb xm0, [hpred_shuf] ; H00224466 H11335577 + psadbw m3, m5 ; s0 s1 s2 s3 + vpermq m4, m3, q3312 ; s2 s1 s3 s3 + vpermq m3, m3, q1310 ; s0 s1 s3 s1 + paddw m3, m4 + psrlw m3, 2 + pavgw m3, m5 ; s0+s2 s1 s3 s1+s3 + pshufb m3, [pb_shuf8x8c2] ; DC0 _ DC1 _ + vpblendd m3, m3, m2, 11001100b ; DC0 V DC1 V + vinserti128 m1, m3, xm3, 1 ; DC0 V DC0 V + vperm2i128 m6, m3, m3, q0101 ; DC1 V DC1 V + vpermq m0, m0, q3120 ; H00224466 _ H11335577 _ + movddup m2, [r0+FENC_STRIDE*0] + movddup m4, [r0+FENC_STRIDE*2] + pshuflw m3, m0, q0000 + psadbw m3, m2 + psadbw m2, m1 + pshuflw m5, m0, q1111 + psadbw m5, m4 + psadbw m4, m1 + paddw m2, m4 + paddw m3, m5 + movddup m4, [r0+FENC_STRIDE*4] + pshuflw m5, m0, q2222 + psadbw m5, m4 + psadbw m4, m6 + paddw m2, m4 + paddw m3, m5 + movddup m4, [r0+FENC_STRIDE*6] + pshuflw m5, m0, q3333 + psadbw m5, m4 + psadbw m4, m6 + paddw m2, m4 + paddw m3, m5 + vextracti128 xm0, m2, 1 + vextracti128 xm1, m3, 1 + paddw xm2, xm0 ; DC V + paddw xm3, xm1 ; H + pextrd [r2+8], xm2, 2 ; V + movd [r2+4], xm3 ; H + movd [r2+0], xm2 ; DC + RET + ;----------------------------------------------------------------------------- ; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] ); @@ -648,7 +713,50 @@ INIT_XMM ssse3 INTRA_SAD16 - +INIT_YMM avx2 +cglobal intra_sad_x3_16x16, 3,5,6 + pxor xm0, xm0 + psadbw xm0, [r1-FDEC_STRIDE] + movhlps xm1, xm0 + paddw xm0, xm1 + movd r3d, xm0 +%assign x 0 +%rep 16 + movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)] +%if (x&3)==3 && x!=15 + add r1, FDEC_STRIDE*4 +%endif + add r3d, r4d +%assign x x+1 +%endrep + sub r1, FDEC_STRIDE*12 + add r3d, 16 + shr r3d, 5 + movd xm5, r3d + vpbroadcastb xm5, xm5 + vinserti128 m5, m5, [r1-FDEC_STRIDE], 1 ; m5 contains DC and V prediction + + pxor m4, m4 ; DC / V accumulator + pxor xm3, xm3 ; H accumulator + mov r3d, 15*FENC_STRIDE +.vloop: + vpbroadcastb xm2, [r1+r3*2-1] + vbroadcasti128 m0, [r0+r3] + psadbw m1, m0, m5 + psadbw xm0, xm2 + paddw m4, m1 + paddw xm3, xm0 + add r3d, -FENC_STRIDE + jge .vloop + punpckhqdq m5, m4, m4 + movhlps xm2, xm3 + paddw m4, m5 ; DC / V + paddw xm3, xm2 ; H + vextracti128 xm2, m4, 1 + movd [r2+0], xm2 + movd [r2+4], xm3 + movd [r2+8], xm4 + RET ;============================================================================= ; SAD x3/x4 MMX @@ -944,17 +1052,27 @@ %endif %endmacro -%macro SAD_X3_2x16P_SSE2 1 -%if %1 +%macro SAD_X3_4x16P_SSE2 2 +%if %1==0 +%if UNIX64 + mov r6, r5 +%endif + lea r5, [r4*3] SAD_X3_START_1x16P_SSE2 %else - SAD_X3_1x16P_SSE2 0, 0 + SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0 +%endif + SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1 + SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2 + 
SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r5 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] %endif - SAD_X3_1x16P_SSE2 FENC_STRIDE, r4 - add r0, 2*FENC_STRIDE - lea r1, [r1+2*r4] - lea r2, [r2+2*r4] - lea r3, [r3+2*r4] %endmacro %macro SAD_X3_START_2x8P_SSE2 0 @@ -971,15 +1089,15 @@ psadbw xmm2, xmm7 %endmacro -%macro SAD_X3_2x8P_SSE2 0 - movq xmm7, [r0] - movq xmm3, [r1] - movq xmm4, [r2] - movq xmm5, [r3] - movhps xmm7, [r0+FENC_STRIDE] - movhps xmm3, [r1+r4] - movhps xmm4, [r2+r4] - movhps xmm5, [r3+r4] +%macro SAD_X3_2x8P_SSE2 4 + movq xmm7, [r0+%1] + movq xmm3, [r1+%2] + movq xmm4, [r2+%2] + movq xmm5, [r3+%2] + movhps xmm7, [r0+%3] + movhps xmm3, [r1+%4] + movhps xmm4, [r2+%4] + movhps xmm5, [r3+%4] psadbw xmm3, xmm7 psadbw xmm4, xmm7 psadbw xmm5, xmm7 @@ -1005,18 +1123,18 @@ psadbw xmm3, xmm7 %endmacro -%macro SAD_X4_2x8P_SSE2 0 - movq xmm7, [r0] - movq xmm4, [r1] - movq xmm5, [r2] +%macro SAD_X4_2x8P_SSE2 4 + movq xmm7, [r0+%1] + movq xmm4, [r1+%2] + movq xmm5, [r2+%2] %if ARCH_X86_64 - movq xmm6, [r3] - movq xmm8, [r4] - movhps xmm7, [r0+FENC_STRIDE] - movhps xmm4, [r1+r5] - movhps xmm5, [r2+r5] - movhps xmm6, [r3+r5] - movhps xmm8, [r4+r5] + movq xmm6, [r3+%2] + movq xmm8, [r4+%2] + movhps xmm7, [r0+%3] + movhps xmm4, [r1+%4] + movhps xmm5, [r2+%4] + movhps xmm6, [r3+%4] + movhps xmm8, [r4+%4] psadbw xmm4, xmm7 psadbw xmm5, xmm7 psadbw xmm6, xmm7 @@ -1026,17 +1144,17 @@ paddw xmm2, xmm6 paddw xmm3, xmm8 %else - movhps xmm7, [r0+FENC_STRIDE] - movhps xmm4, [r1+r5] - movhps xmm5, [r2+r5] + movhps xmm7, [r0+%3] + movhps xmm4, [r1+%4] + movhps xmm5, [r2+%4] psadbw xmm4, xmm7 psadbw xmm5, xmm7 paddw xmm0, xmm4 paddw xmm1, xmm5 - movq xmm6, [r3] - movq xmm4, [r4] - movhps xmm6, [r3+r5] - movhps xmm4, [r4+r5] + movq xmm6, [r3+%2] + movq xmm4, [r4+%2] + movhps xmm6, [r3+%4] + movhps xmm4, [r4+%4] psadbw xmm6, xmm7 psadbw xmm4, xmm7 paddw xmm2, xmm6 @@ -1110,43 +1228,65 @@ %endif %endmacro -%macro SAD_X4_2x16P_SSE2 1 -%if %1 +%macro SAD_X4_4x16P_SSE2 2 +%if %1==0 + lea r6, [r5*3] SAD_X4_START_1x16P_SSE2 %else - SAD_X4_1x16P_SSE2 0, 0 + SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0 +%endif + SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1 + SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2 + SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] %endif - SAD_X4_1x16P_SSE2 FENC_STRIDE, r5 - add r0, 2*FENC_STRIDE - lea r1, [r1+2*r5] - lea r2, [r2+2*r5] - lea r3, [r3+2*r5] - lea r4, [r4+2*r5] %endmacro -%macro SAD_X3_2x8P_SSE2 1 -%if %1 +%macro SAD_X3_4x8P_SSE2 2 +%if %1==0 +%if UNIX64 + mov r6, r5 +%endif + lea r5, [r4*3] SAD_X3_START_2x8P_SSE2 %else - SAD_X3_2x8P_SSE2 + SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1 +%endif + SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), r5 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] %endif - add r0, 2*FENC_STRIDE - lea r1, [r1+2*r4] - lea r2, [r2+2*r4] - lea r3, [r3+2*r4] %endmacro -%macro SAD_X4_2x8P_SSE2 1 -%if %1 +%macro SAD_X4_4x8P_SSE2 2 +%if %1==0 + lea r6, [r5*3] SAD_X4_START_2x8P_SSE2 %else - SAD_X4_2x8P_SSE2 + SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 +%endif + SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 +%if %1 != %2-1 
+%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] %endif - add r0, 2*FENC_STRIDE - lea r1, [r1+2*r5] - lea r2, [r2+2*r5] - lea r3, [r3+2*r5] - lea r4, [r4+2*r5] %endmacro %macro SAD_X3_END_SSE2 0 @@ -1157,9 +1297,9 @@ paddw xmm1, xmm5 paddw xmm2, xmm6 %if UNIX64 - movd [r5+0], xmm0 - movd [r5+4], xmm1 - movd [r5+8], xmm2 + movd [r6+0], xmm0 + movd [r6+4], xmm1 + movd [r6+8], xmm2 %else mov r0, r5mp movd [r0+0], xmm0 @@ -1184,15 +1324,230 @@ RET %endmacro +%macro SAD_X4_START_2x8P_SSSE3 0 + movddup xmm4, [r0] + movq xmm0, [r1] + movq xmm1, [r3] + movhps xmm0, [r2] + movhps xmm1, [r4] + movddup xmm5, [r0+FENC_STRIDE] + movq xmm2, [r1+r5] + movq xmm3, [r3+r5] + movhps xmm2, [r2+r5] + movhps xmm3, [r4+r5] + psadbw xmm0, xmm4 + psadbw xmm1, xmm4 + psadbw xmm2, xmm5 + psadbw xmm3, xmm5 + paddw xmm0, xmm2 + paddw xmm1, xmm3 +%endmacro + +%macro SAD_X4_2x8P_SSSE3 4 + movddup xmm6, [r0+%1] + movq xmm2, [r1+%2] + movq xmm3, [r3+%2] + movhps xmm2, [r2+%2] + movhps xmm3, [r4+%2] + movddup xmm7, [r0+%3] + movq xmm4, [r1+%4] + movq xmm5, [r3+%4] + movhps xmm4, [r2+%4] + movhps xmm5, [r4+%4] + psadbw xmm2, xmm6 + psadbw xmm3, xmm6 + psadbw xmm4, xmm7 + psadbw xmm5, xmm7 + paddw xmm0, xmm2 + paddw xmm1, xmm3 + paddw xmm0, xmm4 + paddw xmm1, xmm5 +%endmacro + +%macro SAD_X4_4x8P_SSSE3 2 +%if %1==0 + lea r6, [r5*3] + SAD_X4_START_2x8P_SSSE3 +%else + SAD_X4_2x8P_SSSE3 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 +%endif + SAD_X4_2x8P_SSSE3 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] +%endif +%endmacro + +%macro SAD_X4_END_SSSE3 0 + mov r0, r6mp + packssdw xmm0, xmm1 + movdqa [r0], xmm0 + RET +%endmacro + +%macro SAD_X3_START_2x16P_AVX2 0 + movu m3, [r0] ; assumes FENC_STRIDE == 16 + movu xm0, [r1] + movu xm1, [r2] + movu xm2, [r3] + vinserti128 m0, m0, [r1+r4], 1 + vinserti128 m1, m1, [r2+r4], 1 + vinserti128 m2, m2, [r3+r4], 1 + psadbw m0, m3 + psadbw m1, m3 + psadbw m2, m3 +%endmacro + +%macro SAD_X3_2x16P_AVX2 3 + movu m3, [r0+%1] ; assumes FENC_STRIDE == 16 + movu xm4, [r1+%2] + movu xm5, [r2+%2] + movu xm6, [r3+%2] + vinserti128 m4, m4, [r1+%3], 1 + vinserti128 m5, m5, [r2+%3], 1 + vinserti128 m6, m6, [r3+%3], 1 + psadbw m4, m3 + psadbw m5, m3 + psadbw m6, m3 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 +%endmacro + +%macro SAD_X3_4x16P_AVX2 2 +%if %1==0 +%if UNIX64 + mov r6, r5 +%endif + lea r5, [r4*3] + SAD_X3_START_2x16P_AVX2 +%else + SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1 +%endif + SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, r5 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] +%endif +%endmacro + +%macro SAD_X4_START_2x16P_AVX2 0 + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0+FENC_STRIDE] + movu xm0, [r1] + movu xm1, [r3] + movu xm2, [r1+r5] + movu xm3, [r3+r5] + vinserti128 m0, m0, [r2], 1 + vinserti128 m1, m1, [r4], 1 + vinserti128 m2, m2, [r2+r5], 1 + vinserti128 m3, m3, [r4+r5], 1 + psadbw m0, m4 + psadbw m1, m4 + psadbw m2, m5 + psadbw m3, m5 + paddw m0, m2 + paddw m1, m3 +%endmacro + +%macro SAD_X4_2x16P_AVX2 4 + vbroadcasti128 m6, [r0+%1] + vbroadcasti128 m7, [r0+%3] + movu xm2, [r1+%2] + movu xm3, [r3+%2] + movu xm4, [r1+%4] + movu xm5, [r3+%4] + vinserti128 m2, m2, [r2+%2], 1 + vinserti128 m3, m3, [r4+%2], 1 + vinserti128 m4, m4, 
[r2+%4], 1 + vinserti128 m5, m5, [r4+%4], 1 + psadbw m2, m6 + psadbw m3, m6 + psadbw m4, m7 + psadbw m5, m7 + paddw m0, m2 + paddw m1, m3 + paddw m0, m4 + paddw m1, m5 +%endmacro + +%macro SAD_X4_4x16P_AVX2 2 +%if %1==0 + lea r6, [r5*3] + SAD_X4_START_2x16P_AVX2 +%else + SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 +%endif + SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r5] + lea r2, [r2+4*r5] + lea r3, [r3+4*r5] + lea r4, [r4+4*r5] +%endif +%endmacro + +%macro SAD_X3_END_AVX2 0 + vextracti128 xm4, m0, 1 + vextracti128 xm5, m1, 1 + vextracti128 xm6, m2, 1 + paddw xm0, xm4 + paddw xm1, xm5 + paddw xm2, xm6 + movhlps xm4, xm0 + movhlps xm5, xm1 + movhlps xm6, xm2 + paddw xm0, xm4 + paddw xm1, xm5 + paddw xm2, xm6 +%if UNIX64 + movd [r6+0], xm0 + movd [r6+4], xm1 + movd [r6+8], xm2 +%else + mov r0, r5mp + movd [r0+0], xm0 + movd [r0+4], xm1 + movd [r0+8], xm2 +%endif + RET +%endmacro + +%macro SAD_X4_END_AVX2 0 + mov r0, r6mp + punpckhqdq m2, m0, m0 + punpckhqdq m3, m1, m1 + paddw m0, m2 + paddw m1, m3 + packssdw m0, m1 + mova xm2, [deinterleave_sadx4] + vpermd m0, m2, m0 + mova [r0], xm0 + RET +%endmacro + ;----------------------------------------------------------------------------- ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, ; uint8_t *pix2, intptr_t i_stride, int scores[3] ) ;----------------------------------------------------------------------------- %macro SAD_X_SSE2 3 -cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9 - SAD_X%1_2x%2P_SSE2 1 -%rep %3/2-1 - SAD_X%1_2x%2P_SSE2 0 +cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,9 +%assign x 0 +%rep %3/4 + SAD_X%1_4x%2P_SSE2 x, %3/4 +%assign x x+1 %endrep SAD_X%1_END_SSE2 %endmacro @@ -1221,7 +1576,36 @@ SAD_X_SSE2 4, 16, 16 SAD_X_SSE2 4, 16, 8 +%macro SAD_X_SSSE3 3 +cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8 +%assign x 0 +%rep %3/4 + SAD_X%1_4x%2P_SSSE3 x, %3/4 +%assign x x+1 +%endrep + SAD_X%1_END_SSSE3 +%endmacro + +INIT_XMM ssse3 +SAD_X_SSSE3 4, 8, 16 +SAD_X_SSSE3 4, 8, 8 +SAD_X_SSSE3 4, 8, 4 + +%macro SAD_X_AVX2 4 +cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4 +%assign x 0 +%rep %3/4 + SAD_X%1_4x%2P_AVX2 x, %3/4 +%assign x x+1 +%endrep + SAD_X%1_END_AVX2 +%endmacro +INIT_YMM avx2 +SAD_X_AVX2 3, 16, 16, 7 +SAD_X_AVX2 3, 16, 8, 7 +SAD_X_AVX2 4, 16, 16, 8 +SAD_X_AVX2 4, 16, 8, 8 ;============================================================================= ; SAD cacheline split @@ -1410,12 +1794,12 @@ .split: %if ARCH_X86_64 PROLOGUE 6,9 + push r3 + push r2 %if WIN64 movsxd r4, r4d - sub rsp, 8 + sub rsp, 40 ; shadow space and alignment %endif - push r3 - push r2 mov r2, r1 mov r1, FENC_STRIDE mov r3, r4 @@ -1424,7 +1808,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8], eax %if WIN64 - mov r2, [rsp] + mov r2, [rsp+40+0*8] %else pop r2 %endif @@ -1432,7 +1816,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8+4], eax %if WIN64 - mov r2, [rsp+8] + mov r2, [rsp+40+1*8] %else pop r2 %endif @@ -1440,7 +1824,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8+8], eax %if WIN64 - add rsp, 24 + add rsp, 40+2*8 %endif RET %else @@ -1480,6 +1864,9 @@ push r4 push r3 push r2 +%if WIN64 + sub rsp, 32 ; shadow space +%endif mov r2, r1 mov r1, FENC_STRIDE mov r3, r5 @@ -1487,7 +1874,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8], eax %if WIN64 - mov r2, [rsp] + mov r2, [rsp+32+0*8] %else pop r2 %endif @@ -1495,7 +1882,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8+4], eax %if WIN64 - mov r2, [rsp+8] + mov r2, 
[rsp+32+1*8] %else pop r2 %endif @@ -1503,7 +1890,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8+8], eax %if WIN64 - mov r2, [rsp+16] + mov r2, [rsp+32+2*8] %else pop r2 %endif @@ -1511,7 +1898,7 @@ call pixel_sad_%1x%2_cache%3_%5 mov [r8+12], eax %if WIN64 - add rsp, 24 + add rsp, 32+3*8 %endif RET %else
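The SAD_X3/X4 rework above mostly changes addressing (four rows per iteration, one pointer update per group) without changing what the functions compute. As a reminder of the contract that the SSE2/SSSE3/AVX2 variants all implement, here is a plain-C model of pixel_sad_x4; FENC_STRIDE is x264's fixed 16-byte encode-plane stride, and the rest of the argument order follows the cglobal declarations.

#include <stdint.h>
#include <stdlib.h>

#define FENC_STRIDE 16

/* Scalar model: SAD of one encode block against four candidate blocks that
 * share a stride; results land in scores[0..3]. */
static void pixel_sad_x4_ref( int w, int h, const uint8_t *fenc,
                              const uint8_t *pix[4], intptr_t stride,
                              int scores[4] )
{
    for( int k = 0; k < 4; k++ )
    {
        int sad = 0;
        for( int y = 0; y < h; y++ )
            for( int x = 0; x < w; x++ )
                sad += abs( fenc[y*FENC_STRIDE + x] - pix[k][y*stride + x] );
        scores[k] = sad;
    }
}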
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/sad16-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/sad16-a.asm
Changed
@@ -4,6 +4,7 @@ ;* Copyright (C) 2010-2013 x264 project ;* ;* Authors: Oskar Arvidsson <oskar@irock.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -90,11 +91,18 @@ ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SAD_MMX 3 -cglobal pixel_sad_%1x%2, 4,4 +cglobal pixel_sad_%1x%2, 4,5-(%2&4/4) pxor m0, m0 -%rep %2/%3 +%if %2 == 4 SAD_INC_%3x%1P_MMX -%endrep + SAD_INC_%3x%1P_MMX +%else + mov r4d, %2/%3 +.loop: + SAD_INC_%3x%1P_MMX + dec r4d + jg .loop +%endif %if %1*%2 == 256 HADDUW m0, m1 %else @@ -120,7 +128,8 @@ ; SAD XMM ;============================================================================= -%macro SAD_INC_2x16P_XMM 0 +%macro SAD_INC_2ROW 1 +%if 2*%1 > mmsize movu m1, [r2+ 0] movu m2, [r2+16] movu m3, [r2+2*r3+ 0] @@ -137,9 +146,7 @@ paddw m3, m4 paddw m0, m1 paddw m0, m3 -%endmacro - -%macro SAD_INC_2x8P_XMM 0 +%else movu m1, [r2] movu m2, [r2+2*r3] psubw m1, [r0] @@ -149,44 +156,55 @@ lea r2, [r2+4*r3] paddw m0, m1 paddw m0, m2 +%endif %endmacro ;----------------------------------------------------------------------------- ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- -%macro SAD_XMM 2 -cglobal pixel_sad_%1x%2, 4,4,8 +%macro SAD 2 +cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize) pxor m0, m0 -%rep %2/2 - SAD_INC_2x%1P_XMM -%endrep +%if %2 == 4 + SAD_INC_2ROW %1 + SAD_INC_2ROW %1 +%else + mov r4d, %2/2 +.loop: + SAD_INC_2ROW %1 + dec r4d + jg .loop +%endif HADDW m0, m1 - movd eax, m0 + movd eax, xm0 RET %endmacro INIT_XMM sse2 -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 -SAD_XMM 8, 4 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +SAD 8, 4 INIT_XMM sse2, aligned -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 INIT_XMM ssse3 -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 -SAD_XMM 8, 4 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +SAD 8, 4 INIT_XMM ssse3, aligned -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +INIT_YMM avx2 +SAD 16, 16 +SAD 16, 8 ;============================================================================= ; SAD x3/x4 @@ -237,14 +255,14 @@ HADDW m2, m5 %endif %if UNIX64 - movd [r5+0], m0 - movd [r5+4], m1 - movd [r5+8], m2 + movd [r5+0], xm0 + movd [r5+4], xm1 + movd [r5+8], xm2 %else mov r0, r5mp - movd [r0+0], m0 - movd [r0+4], m1 - movd [r0+8], m2 + movd [r0+0], xm0 + movd [r0+4], xm1 + movd [r0+8], xm2 %endif RET %endmacro @@ -333,10 +351,10 @@ HADDW m3, m7 %endif mov r0, r6mp - movd [r0+ 0], m0 - movd [r0+ 4], m1 - movd [r0+ 8], m2 - movd [r0+12], m3 + movd [r0+ 0], xm0 + movd [r0+ 4], xm1 + movd [r0+ 8], xm2 + movd [r0+12], xm3 RET %endmacro @@ -400,8 +418,39 @@ INIT_XMM xop PIXEL_VSAD +INIT_YMM avx2 +cglobal pixel_vsad, 3,3 + mova m0, [r0] + mova m1, [r0+2*r1] + lea r0, [r0+4*r1] + psubw m0, m1 + pabsw m0, m0 + sub r2d, 2 + je .end +.loop: + mova m2, [r0] + mova m3, [r0+2*r1] + lea r0, [r0+4*r1] + psubw m1, m2 + psubw m2, m3 + pabsw m1, m1 + pabsw m2, m2 + paddw m0, m1 + paddw m0, m2 + mova m1, m3 + sub r2d, 2 + jg .loop +.end: +%if BIT_DEPTH == 9 + HADDW m0, m1 +%else + HADDUW m0, m1 +%endif + movd eax, xm0 + RET + 
;----------------------------------------------------------------------------- -; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1, +; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1, ; uint16_t *pix2, intptr_t i_stride, int scores[3] ) ;----------------------------------------------------------------------------- %macro SAD_X 3 @@ -445,29 +494,38 @@ SAD_X 4, 4, 8 SAD_X 4, 4, 4 INIT_XMM ssse3 -%define XMM_REGS 9 +%define XMM_REGS 7 SAD_X 3, 16, 16 SAD_X 3, 16, 8 SAD_X 3, 8, 16 SAD_X 3, 8, 8 SAD_X 3, 8, 4 +%define XMM_REGS 9 SAD_X 4, 16, 16 SAD_X 4, 16, 8 SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 INIT_XMM sse2 -%define XMM_REGS 11 +%define XMM_REGS 8 SAD_X 3, 16, 16 SAD_X 3, 16, 8 SAD_X 3, 8, 16 SAD_X 3, 8, 8 SAD_X 3, 8, 4 +%define XMM_REGS 11 SAD_X 4, 16, 16 SAD_X 4, 16, 8 SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 +INIT_YMM avx2 +%define XMM_REGS 7 +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +%define XMM_REGS 9 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 ;----------------------------------------------------------------------------- ; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] ); @@ -475,52 +533,57 @@ %macro INTRA_SAD_X3_4x4 0 cglobal intra_sad_x3_4x4, 3,3,7 - movq m0, [r1-1*FDEC_STRIDEB] + movddup m0, [r1-1*FDEC_STRIDEB] movq m1, [r0+0*FENC_STRIDEB] movq m2, [r0+2*FENC_STRIDEB] pshuflw m6, m0, q1032 paddw m6, m0 pshuflw m5, m6, q2301 paddw m6, m5 - punpcklqdq m6, m6 ;A+B+C+D 8 times - punpcklqdq m0, m0 + punpcklqdq m6, m6 ; A+B+C+D 8 times movhps m1, [r0+1*FENC_STRIDEB] movhps m2, [r0+3*FENC_STRIDEB] psubw m3, m1, m0 psubw m0, m2 - ABSW m3, m3, m5 - ABSW m0, m0, m5 + ABSW2 m3, m0, m3, m0, m4, m5 paddw m0, m3 - HADDW m0, m5 - movd [r2], m0 ;V prediction cost movd m3, [r1+0*FDEC_STRIDEB-4] - movhps m3, [r1+1*FDEC_STRIDEB-8] movd m4, [r1+2*FDEC_STRIDEB-4] + movhps m3, [r1+1*FDEC_STRIDEB-8] movhps m4, [r1+3*FDEC_STRIDEB-8] pshufhw m3, m3, q3333 pshufhw m4, m4, q3333 pshuflw m3, m3, q1111 ; FF FF EE EE pshuflw m4, m4, q1111 ; HH HH GG GG paddw m5, m3, m4 - pshufd m0, m5, q1032 + paddw m6, [pw_4] + paddw m6, m5 + pshufd m5, m5, q1032 paddw m5, m6 - paddw m5, m0 - paddw m5, [pw_4] psrlw m5, 3 psubw m6, m5, m2 psubw m5, m1 psubw m1, m3 psubw m2, m4 - ABSW m5, m5, m0 - ABSW m6, m6, m0 - ABSW m1, m1, m0 - ABSW m2, m2, m0 + ABSW2 m5, m6, m5, m6, m3, m4 + ABSW2 m1, m2, m1, m2, m3, m4 paddw m5, m6 paddw m1, m2 - HADDW m5, m0 - HADDW m1, m2 - movd [r2+8], m5 ;DC prediction cost - movd [r2+4], m1 ;H prediction cost +%if cpuflag(ssse3) + phaddw m0, m1 + movhlps m3, m5 + paddw m5, m3 + phaddw m0, m5 + pmaddwd m0, [pw_1] + mova [r2], m0 +%else + HADDW m0, m3 + HADDW m1, m3 + HADDW m5, m3 + movd [r2], m0 ; V prediction cost + movd [r2+4], m1 ; H prediction cost + movd [r2+8], m5 ; DC prediction cost +%endif RET %endmacro @@ -581,12 +644,21 @@ INTRA_SAD_HVDC_ITER 5, q2222 INTRA_SAD_HVDC_ITER 6, q1111 INTRA_SAD_HVDC_ITER 7, q0000 +%if cpuflag(ssse3) + phaddw m2, m3 ; 2 2 2 2 3 3 3 3 + movhlps m3, m1 + paddw m1, m3 ; 1 1 1 1 _ _ _ _ + phaddw m2, m1 ; 2 2 3 3 1 1 _ _ + pmaddwd m2, [pw_1] ; 2 3 1 _ + mova [r2], m2 +%else HADDW m2, m4 HADDW m3, m4 HADDW m1, m4 movd [r2+0], m2 movd [r2+4], m3 movd [r2+8], m1 +%endif RET %endmacro @@ -594,3 +666,44 @@ INTRA_SAD_X3_8x8 INIT_XMM ssse3 INTRA_SAD_X3_8x8 + +%macro INTRA_SAD_HVDC_ITER_YMM 2 + mova xm4, [r0+(%1-4)*FENC_STRIDEB] + vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1 + pshufd m5, m7, %2 + psubw m5, m4 + pabsw m5, m5 + ACCUM paddw, 2, 5, %1 ; H + psubw m5, m4, m6 + psubw m4, m0 + pabsw m5, m5 + pabsw m4, m4 + ACCUM paddw, 
1, 5, %1 ; V + ACCUM paddw, 3, 4, %1 ; DC +%endmacro + +INIT_YMM avx2 +cglobal intra_sad_x3_8x8, 3,3,8 + add r0, 4*FENC_STRIDEB + movu xm0, [r1+7*SIZEOF_PIXEL] + vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction + vpermq m7, m0, q0011 + paddw xm0, xm6 + paddw xm0, [pw_1] ; equal to +8 after HADDW + HADDW xm0, xm4 + psrld xm0, 4 + vpbroadcastw m0, xm0 + punpcklwd m7, m7 + INTRA_SAD_HVDC_ITER_YMM 0, q3333 + INTRA_SAD_HVDC_ITER_YMM 1, q2222 + INTRA_SAD_HVDC_ITER_YMM 2, q1111 + INTRA_SAD_HVDC_ITER_YMM 3, q0000 + phaddw m1, m2 ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2 + punpckhqdq m2, m3, m3 + paddw m3, m2 ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _ + phaddw m1, m3 ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _ + vextracti128 xm2, m1, 1 + paddw xm1, xm2 ; 1 1 2 2 3 3 _ _ + pmaddwd xm1, [pw_1] ; 1 2 3 _ + mova [r2], xm1 + RET
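The AVX2 intra_sad_x3_8x8 above is fairly opaque, so here is a scalar sketch of what the x3 intra SAD functions compute in the high-bit-depth build: the cost of the vertical, horizontal and DC predictions against the encode block, written to res[] in that order. The separate top/left pointers and strides are a simplification for illustration; the real code reads both borders from the fdec plane.

#include <stdint.h>
#include <stdlib.h>

static void intra_sad_x3_8x8_ref( const uint16_t *fenc, int fenc_stride,
                                  const uint16_t *top, const uint16_t *left,
                                  int left_stride, int res[3] )
{
    int dc = 8;                                  /* rounding term */
    for( int i = 0; i < 8; i++ )
        dc += top[i] + left[i*left_stride];
    dc >>= 4;

    int sad_v = 0, sad_h = 0, sad_dc = 0;
    for( int y = 0; y < 8; y++ )
        for( int x = 0; x < 8; x++ )
        {
            int p = fenc[y*fenc_stride + x];
            sad_v  += abs( p - top[x] );                 /* vertical prediction   */
            sad_h  += abs( p - left[y*left_stride] );    /* horizontal prediction */
            sad_dc += abs( p - dc );                     /* DC prediction         */
        }
    res[0] = sad_v;
    res[1] = sad_h;
    res[2] = sad_dc;
}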
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/trellis-64.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/trellis-64.asm
Changed
@@ -96,6 +96,15 @@ %endif %endmacro +%macro LOAD_DUP 2 ; dst, src +%if cpuflag(ssse3) + movddup %1, %2 +%else + movd %1, %2 + punpcklqdq %1, %1 +%endif +%endmacro + ;----------------------------------------------------------------------------- ; int trellis_cabac_4x4_psy( ; const int *unquant_mf, const uint8_t *zigzag, int lambda2, @@ -186,12 +195,11 @@ mov dword levelgt1_ctxm, 9 %endif %if psy - movd m6, psy_trellism + LOAD_DUP m6, psy_trellism %define psy_trellis m6 %elif dc - movd m6, [unquant_mfq] + LOAD_DUP m6, [unquant_mfq] paddd m6, m6 - punpcklqdq m6, m6 %define unquant_mf m6 %endif %ifdef PIC @@ -333,13 +341,12 @@ movd m0, abs_leveld mov r6, orig_coefsm %if HIGH_BIT_DEPTH - movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF] + LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF] %else - movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] - psrad m1, 16 + LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] + psrad m1, 16 ; sign_coef %endif punpcklqdq m0, m0 ; quant_coef - punpcklqdq m1, m1 ; sign_coef %if cpuflag(ssse3) pabsd m0, m0 pabsd m2, m1 ; abs_coef @@ -403,11 +410,10 @@ %else %ifdef PIC mov r10, unquant_mfm - movd m3, [r10 + zigzagiq*4] + LOAD_DUP m3, [r10 + zigzagiq*4] %else - movd m3, [unquant_mfq + zigzagiq*4] + LOAD_DUP m3, [unquant_mfq + zigzagiq*4] %endif - punpcklqdq m3, m3 pmuludq m0, m3 %endif paddd m0, [pq_128] @@ -420,8 +426,7 @@ %if dc psllq m0, 8 %else - movd m5, [dct_weight2_tab + zigzagiq*4 GLOBAL] - punpcklqdq m5, m5 + LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL] pmuludq m0, m5 %endif @@ -434,12 +439,11 @@ ; ssd1[k] -= psy_weight * psy_value; mov r6, fenc_dctm %if HIGH_BIT_DEPTH - movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF] + LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF] %else - movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] + LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] psrad m3, 16 ; orig_coef %endif - punpcklqdq m3, m3 %if cpuflag(ssse3) psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef) %else @@ -453,9 +457,8 @@ ABSD m3, m4 SWAP 4, 3 %endif - movd m1, [dct_weight1_tab + zigzagiq*4 GLOBAL] + LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL] pmuludq m1, psy_trellis - punpcklqdq m1, m1 pmuludq m4, m1 psubq m0, m4 %if %1
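Small illustration of the LOAD_DUP helper introduced above (the expansion below is a sketch, not part of the patch): it folds the movd + punpcklqdq pattern into a single movddup on SSSE3-and-up targets.

; LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
; expands under INIT_XMM ssse3 (or newer) to:
;     movddup    m3, [unquant_mfq + zigzagiq*4]
; and under plain sse2 to:
;     movd       m3, [unquant_mfq + zigzagiq*4]
;     punpcklqdq m3, m3
; movddup copies a full qword, so the high dword of each lane can differ from
; the zero-extended movd path; the callers appear to consume only the low
; dword of each qword (pmuludq) or shift the wanted bits into place (psrad),
; which is presumably why the substitution is safe.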
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/util.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/util.h
Changed
@@ -121,42 +121,132 @@ return amvd; } +#define x264_predictor_clip x264_predictor_clip_mmx2 +static int ALWAYS_INLINE x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) +{ + static const uint32_t pd_32 = 0x20; + intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; + + asm( + "movq (%2), %%mm5 \n" + "movd %6, %%mm3 \n" + "psllw $2, %%mm5 \n" // Convert to subpel + "pshufw $0xEE, %%mm5, %%mm6 \n" + "dec %k3 \n" + "jz 2f \n" // if( i_mvc == 1 ) {do the last iteration} + "punpckldq %%mm3, %%mm3 \n" + "punpckldq %%mm5, %%mm5 \n" + "movd %7, %%mm4 \n" + "lea (%0,%3,4), %3 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "add $8, %0 \n" + "movq %%mm3, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm1 \n" // mv == pmv + "pcmpeqd %%mm0, %%mm2 \n" // mv == 0 + "por %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1 + "pmovmskb %%mm2, %k2 \n" // (mv == pmv || mv == 0) * 0xf + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "pand %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32 + "psrlq %%mm2, %%mm0 \n" // drop mv0 if it's skipped + "movq %%mm0, (%5,%4,4) \n" + "and $24, %k2 \n" + "add $2, %4 \n" + "add $8, %k2 \n" + "shr $4, %k2 \n" // (4-val)>>1 + "sub %2, %4 \n" // +1 for each valid motion vector + "cmp %3, %0 \n" + "jl 1b \n" + "jg 3f \n" // if( i == i_mvc - 1 ) {do the last iteration} + + /* Do the last iteration */ + "2: \n" + "movd (%0), %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm3 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm3, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "movd %%mm0, (%5,%4,4) \n" + "inc %4 \n" + "and $1, %k2 \n" + "sub %2, %4 \n" // output += !(mv == pmv || mv == 0) + "3: \n" + :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i) + :"r"(dst), "g"(pmv), "m"(pd_32), "m"(M64( mvc )) + ); + return i; +} + +/* Same as the above, except we do (mv + 2) >> 2 on the input. 
*/ #define x264_predictor_roundclip x264_predictor_roundclip_mmx2 -static void ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max ) +static int ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) { - uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min ); - uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max ); static const uint64_t pw_2 = 0x0002000200020002ULL; - intptr_t i = i_mvc; + static const uint32_t pd_32 = 0x20; + intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; + asm( - "movd %2, %%mm5 \n" - "movd %3, %%mm6 \n" - "movq %4, %%mm7 \n" - "punpckldq %%mm5, %%mm5 \n" - "punpckldq %%mm6, %%mm6 \n" - "test $1, %0 \n" - "jz 1f \n" - "movd -4(%6,%0,4), %%mm0 \n" - "paddw %%mm7, %%mm0 \n" - "psraw $2, %%mm0 \n" - "pmaxsw %%mm5, %%mm0 \n" - "pminsw %%mm6, %%mm0 \n" - "movd %%mm0, -4(%5,%0,4) \n" - "dec %0 \n" - "jz 2f \n" - "1: \n" - "movq -8(%6,%0,4), %%mm0 \n" - "paddw %%mm7, %%mm0 \n" - "psraw $2, %%mm0 \n" - "pmaxsw %%mm5, %%mm0 \n" - "pminsw %%mm6, %%mm0 \n" - "movq %%mm0, -8(%5,%0,4) \n" - "sub $2, %0 \n" - "jnz 1b \n" - "2: \n" - :"+r"(i), "=m"(M64( dst )) - :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(dst), "r"(mvc), "m"(M64( mvc )) + "movq (%2), %%mm5 \n" + "movq %6, %%mm7 \n" + "movd %7, %%mm3 \n" + "pshufw $0xEE, %%mm5, %%mm6 \n" + "dec %k3 \n" + "jz 2f \n" + "punpckldq %%mm3, %%mm3 \n" + "punpckldq %%mm5, %%mm5 \n" + "movd %8, %%mm4 \n" + "lea (%0,%3,4), %3 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "add $8, %0 \n" + "paddw %%mm7, %%mm0 \n" + "psraw $2, %%mm0 \n" + "movq %%mm3, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm1 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm1, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "pand %%mm4, %%mm2 \n" + "psrlq %%mm2, %%mm0 \n" + "movq %%mm0, (%5,%4,4) \n" + "and $24, %k2 \n" + "add $2, %4 \n" + "add $8, %k2 \n" + "shr $4, %k2 \n" + "sub %2, %4 \n" + "cmp %3, %0 \n" + "jl 1b \n" + "jg 3f \n" + + /* Do the last iteration */ + "2: \n" + "movd (%0), %%mm0 \n" + "paddw %%mm7, %%mm0 \n" + "psraw $2, %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm3 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm3, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "movd %%mm0, (%5,%4,4) \n" + "inc %4 \n" + "and $1, %k2 \n" + "sub %2, %4 \n" + "3: \n" + :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i) + :"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(M64( mvc )) ); + return i; } #endif
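The rewritten predictor_clip/roundclip above now fold the duplicate check (candidate equal to the predicted MV or to zero) into the clamping pass and return the number of surviving candidates. Below is a scalar sketch of the clip variant as read off the inline assembly; the fullpel-to-subpel shift on mv_limit mirrors the psllw $2, and the compaction into dst is a simplification of the psrlq-based skip trick.

#include <stdint.h>

static inline int clamp_mv( int v, int lo, int hi )
{
    return v < lo ? lo : v > hi ? hi : v;
}

/* Hypothetical scalar equivalent of the new x264_predictor_clip_mmx2. */
static int predictor_clip_ref( int16_t (*dst)[2], const int16_t (*mvc)[2],
                               int i_mvc, const int16_t mv_limit[2][2],
                               uint32_t pmv )
{
    int n = 0;
    for( int i = 0; i < i_mvc; i++ )
    {
        uint32_t mv = (uint16_t)mvc[i][0] | ((uint32_t)(uint16_t)mvc[i][1] << 16);
        if( mv == pmv || mv == 0 )       /* already covered by pmv or the zero MV */
            continue;
        dst[n][0] = clamp_mv( mvc[i][0], mv_limit[0][0] << 2, mv_limit[1][0] << 2 );
        dst[n][1] = clamp_mv( mvc[i][1], mv_limit[0][1] << 2, mv_limit[1][1] << 2 );
        n++;
    }
    return n;
}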
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/x86inc.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/x86inc.asm
Changed
@@ -6,7 +6,7 @@ ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Anton Mitrofanov <BugMaster@narod.ru> ;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above @@ -34,8 +34,12 @@ ; as this feature might be useful for others as well. Send patches or ideas ; to x264-devel@videolan.org . -%ifndef program_name - %define program_name x264 +%ifndef private_prefix + %define private_prefix x264 +%endif + +%ifndef public_prefix + %define public_prefix private_prefix %endif %define WIN64 0 @@ -56,29 +60,12 @@ %define mangle(x) x %endif -; Name of the .rodata section. -; Kludge: Something on OS X fails to align .rodata even given an align attribute, -; so use a different read-only section. %macro SECTION_RODATA 0-1 16 - %ifidn __OUTPUT_FORMAT__,macho64 - SECTION .text align=%1 - %elifidn __OUTPUT_FORMAT__,macho - SECTION .text align=%1 - fakegot: - %elifidn __OUTPUT_FORMAT__,aout - section .text - %else - SECTION .rodata align=%1 - %endif + SECTION .rodata align=%1 %endmacro -; aout does not support align= %macro SECTION_TEXT 0-1 16 - %ifidn __OUTPUT_FORMAT__,aout - SECTION .text - %else - SECTION .text align=%1 - %endif + SECTION .text align=%1 %endmacro %if WIN64 @@ -323,14 +310,18 @@ %if stack_size < 0 %assign stack_size -stack_size %endif - %if mmsize != 8 - %assign xmm_regs_used %2 + %assign stack_size_padded stack_size + %if WIN64 + %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 + %endif + %endif %endif %if mmsize <= 16 && HAVE_ALIGNED_STACK - %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) - %if xmm_regs_used > 6 - %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16 - %endif + %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) SUB rsp, stack_size_padded %else %assign %%reg_num (regs_used - 1) @@ -340,14 +331,6 @@ ; stack in a single instruction (i.e. mov rsp, rstk or mov ; rsp, [rsp+stack_size_padded]) mov rstk, rsp - %assign stack_size_padded stack_size - %if xmm_regs_used > 6 - %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16 - %if mmsize == 32 && xmm_regs_used & 1 - ; re-align to 32 bytes - %assign stack_size_padded (stack_size_padded + 16) - %endif - %endif %if %1 < 0 ; need to store rsp on stack sub rsp, gprsize+stack_size_padded and rsp, ~(%%stack_alignment-1) @@ -359,9 +342,7 @@ %xdefine rstkm rstk %endif %endif - %if xmm_regs_used > 6 - WIN64_PUSH_XMM - %endif + WIN64_PUSH_XMM %endif %endif %endmacro @@ -422,40 +403,55 @@ %endmacro %macro WIN64_PUSH_XMM 0 - %assign %%i xmm_regs_used - %rep (xmm_regs_used-6) - %assign %%i %%i-1 - movdqa [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i - %endrep + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
+ %if xmm_regs_used > 6 + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + movaps [rstk + stack_offset + 24], xmm7 + %endif + %if xmm_regs_used > 8 + %assign %%i 8 + %rep xmm_regs_used-8 + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif %endmacro %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 6 - SUB rsp, (xmm_regs_used-6)*16+16 - WIN64_PUSH_XMM + %if xmm_regs_used > 8 + %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 + SUB rsp, stack_size_padded %endif + WIN64_PUSH_XMM %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 1 - %if xmm_regs_used > 6 + %assign %%pad_size 0 + %if xmm_regs_used > 8 %assign %%i xmm_regs_used - %rep (xmm_regs_used-6) + %rep xmm_regs_used-8 %assign %%i %%i-1 - movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)] + movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] %endrep - %if stack_size_padded == 0 - add %1, (xmm_regs_used-6)*16+16 - %endif %endif %if stack_size_padded > 0 %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) mov rsp, rstkm %else add %1, stack_size_padded + %assign %%pad_size stack_size_padded %endif %endif + %if xmm_regs_used > 7 + movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %endif %endmacro %macro WIN64_RESTORE_XMM 1 @@ -643,38 +639,48 @@ ; Applies any symbol mangling needed for C linkage, and sets up a define such that ; subsequent uses of the function name automatically refer to the mangled version. ; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
%macro cglobal 1-2+ "" ; name, [PROLOGUE args] - ; the "" is a workaround for nasm, which fails if SUFFIX is empty - ; and we call cglobal_internal with just %1 %+ SUFFIX (without %2) - cglobal_internal %1 %+ SUFFIX, %2 + cglobal_internal 1, %1 %+ SUFFIX, %2 %endmacro -%macro cglobal_internal 1-2+ - %ifndef cglobaled_%1 - %xdefine %1 mangle(program_name %+ _ %+ %1) - %xdefine %1.skip_prologue %1 %+ .skip_prologue - CAT_XDEFINE cglobaled_, %1, 1 +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix + %xdefine %%VISIBILITY hidden + %else + %xdefine %%FUNCTION_PREFIX public_prefix + %xdefine %%VISIBILITY + %endif + %ifndef cglobaled_%2 + %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 %endif - %xdefine current_function %1 + %xdefine current_function %2 %ifidn __OUTPUT_FORMAT__,elf - global %1:function hidden + global %2:function %%VISIBILITY %else - global %1 + global %2 %endif align function_align - %1: - RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer - %xdefine rstk rsp - %assign stack_offset 0 - %assign stack_size 0 - %assign stack_size_padded 0 - %assign xmm_regs_used 0 - %ifnidn %2, "" - PROLOGUE %2 + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %ifnidn %3, "" + PROLOGUE %3 %endif %endmacro %macro cextern 1 - %xdefine %1 mangle(program_name %+ _ %+ %1) + %xdefine %1 mangle(private_prefix %+ _ %+ %1) CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro @@ -686,9 +692,13 @@ extern %1 %endmacro -%macro const 2+ - %xdefine %1 mangle(program_name %+ _ %+ %1) - global %1 +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %ifidn __OUTPUT_FORMAT__,elf + global %1:data hidden + %else + global %1 + %endif %1: %2 %endmacro @@ -724,9 +734,8 @@ %assign cpuflags_misalign (1<<20) %assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant %assign cpuflags_atom (1<<22) -%assign cpuflags_bmi1 (1<<23) +%assign cpuflags_bmi1 (1<<23)|cpuflags_lzcnt %assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1 -%assign cpuflags_tbm (1<<25)|cpuflags_bmi1 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) @@ -735,6 +744,7 @@ ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
%macro INIT_CPUFLAGS 0-2 + CPU amdnop %if %0 >= 1 %xdefine cpuname %1 %assign cpuflags cpuflags_%1 @@ -756,6 +766,9 @@ %elifidn %1, sse3 %define movu lddqu %endif + %if ARCH_X86_64 == 0 && notcpuflag(sse2) + CPU basicnop + %endif %else %xdefine SUFFIX %undef cpuname @@ -763,7 +776,11 @@ %endif %endmacro -; merge mmx and sse* +; Merge mmx and sse* +; m# is a simd regsiter of the currently selected size +; xm# is the corresponding xmmreg (if selcted xmm or ymm size), or mmreg (if selected mmx) +; ym# is the corresponding ymmreg (if selcted xmm or ymm size), or mmreg (if selected mmx) +; (All 3 remain in sync through SWAP.) %macro CAT_XDEFINE 3 %xdefine %1%2 %3 @@ -840,6 +857,26 @@ INIT_XMM +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 ymm%1 + %define ymmymm%1 ymm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 +%endmacro + +%assign i 0 +%rep 16 + DECLARE_MMCAST i +%assign i i+1 +%endrep + ; I often want to use macros that permute their arguments. e.g. there's no ; efficient way to implement butterfly or transpose or dct without swapping some ; arguments. @@ -856,42 +893,42 @@ %macro PERMUTE 2-* ; takes a list of pairs to swap %rep %0/2 - %xdefine tmp%2 m%2 - %xdefine ntmp%2 nm%2 + %xdefine %%tmp%2 m%2 %rotate 2 %endrep %rep %0/2 - %xdefine m%1 tmp%2 - %xdefine nm%1 ntmp%2 - %undef tmp%2 - %undef ntmp%2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE n, m%1, %1 %rotate 2 %endrep %endmacro -%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) -%rep %0-1 -%ifdef m%1 - %xdefine tmp m%1 - %xdefine m%1 m%2 - %xdefine m%2 tmp - CAT_XDEFINE n, m%1, %1 - CAT_XDEFINE n, m%2, %2 -%else - ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. - ; Be careful using this mode in nested macros though, as in some cases there may be - ; other copies of m# that have already been dereferenced and don't get updated correctly. - %xdefine %%n1 n %+ %1 - %xdefine %%n2 n %+ %2 - %xdefine tmp m %+ %%n1 - CAT_XDEFINE m, %%n1, m %+ %%n2 - CAT_XDEFINE m, %%n2, tmp - CAT_XDEFINE n, m %+ %%n1, %%n1 - CAT_XDEFINE n, m %+ %%n2, %%n2 +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) +%ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 +%else ; SWAP m0, m1, ... 
+ SWAP_INTERNAL_NAME %1, %2 %endif - %undef tmp +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE n, m%1, %1 + CAT_XDEFINE n, m%2, %2 %rotate 1 -%endrep + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args n %+ %1 + %rep %0-1 + %xdefine %%args %%args, n %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args %endmacro ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later @@ -1094,10 +1131,10 @@ AVX_INSTR blendps, 1, 0, 0 AVX_INSTR blendvpd, 1, 0, 0 AVX_INSTR blendvps, 1, 0, 0 -AVX_INSTR cmppd, 1, 0, 0 -AVX_INSTR cmpps, 1, 0, 0 -AVX_INSTR cmpsd, 1, 0, 0 -AVX_INSTR cmpss, 1, 0, 0 +AVX_INSTR cmppd, 1, 1, 0 +AVX_INSTR cmpps, 1, 1, 0 +AVX_INSTR cmpsd, 1, 1, 0 +AVX_INSTR cmpss, 1, 1, 0 AVX_INSTR comisd AVX_INSTR comiss AVX_INSTR cvtdq2pd @@ -1399,3 +1436,14 @@ FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug +%if ARCH_X86_64 == 0 +%macro vpbroadcastq 2 +%if sizeof%1 == 16 + movddup %1, %2 +%else + vbroadcastsd %1, %2 +%endif +%endmacro +%endif
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/x86util.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/x86util.asm
Changed
@@ -30,10 +30,14 @@ %assign SIZEOF_PIXEL 1 %assign SIZEOF_DCTCOEF 2 %define pixel byte +%define vpbroadcastdct vpbroadcastw +%define vpbroadcastpix vpbroadcastb %if HIGH_BIT_DEPTH %assign SIZEOF_PIXEL 2 %assign SIZEOF_DCTCOEF 4 %define pixel word + %define vpbroadcastdct vpbroadcastd + %define vpbroadcastpix vpbroadcastw %endif %assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE @@ -52,7 +56,10 @@ %macro SBUTTERFLY 4 -%if avx_enabled && mmsize == 16 +%ifidn %1, dqqq + vperm2i128 m%4, m%2, m%3, q0301 ; punpckh + vinserti128 m%2, m%2, xm%3, 1 ; punpckl +%elif avx_enabled && mmsize >= 16 punpckh%1 m%4, m%2, m%3 punpckl%1 m%2, m%3 %else @@ -214,15 +221,20 @@ %endif %endmacro -%macro ABSD 2 +%macro ABSD 2-3 %if cpuflag(ssse3) pabsd %1, %2 %else - pxor %1, %1 - pcmpgtd %1, %2 - pxor %2, %1 - psubd %2, %1 - SWAP %1, %2 + %define %%s %2 +%if %0 == 3 + mova %3, %2 + %define %%s %3 +%endif + pxor %1, %1 + pcmpgtd %1, %%s + pxor %%s, %1 + psubd %%s, %1 + SWAP %1, %%s %endif %endmacro @@ -255,9 +267,13 @@ %endmacro %imacro SPLATW 2-3 0 - PSHUFLW %1, %2, (%3)*q1111 +%if cpuflag(avx2) && %3 == 0 + vpbroadcastw %1, %2 +%else + PSHUFLW %1, %2, (%3)*q1111 %if mmsize == 16 - punpcklqdq %1, %1 + punpcklqdq %1, %1 +%endif %endif %endmacro @@ -275,16 +291,24 @@ %endmacro %macro HADDD 2 ; sum junk -%if mmsize == 16 +%if sizeof%1 == 32 +%define %2 xmm%2 + vextracti128 %2, %1, 1 +%define %1 xmm%1 + paddd %1, %2 +%endif +%if mmsize >= 16 movhlps %2, %1 paddd %1, %2 %endif PSHUFLW %2, %1, q0032 paddd %1, %2 +%undef %1 +%undef %2 %endmacro %macro HADDW 2 ; reg, tmp -%if cpuflag(xop) && mmsize == 16 +%if cpuflag(xop) && sizeof%1 == 16 vphaddwq %1, %1 movhlps %2, %1 paddd %1, %2 @@ -294,22 +318,41 @@ %endif %endmacro -%macro HADDUW 2 -%if cpuflag(xop) && mmsize == 16 - vphadduwq %1, %1 - movhlps %2, %1 - paddd %1, %2 +%macro HADDUWD 2 +%if cpuflag(xop) && sizeof%1 == 16 + vphadduwd %1, %1 %else psrld %2, %1, 16 pslld %1, 16 psrld %1, 16 paddd %1, %2 - HADDD %1, %2 +%endif +%endmacro + +%macro HADDUW 2 +%if cpuflag(xop) && sizeof%1 == 16 + vphadduwq %1, %1 + movhlps %2, %1 + paddd %1, %2 +%else + HADDUWD %1, %2 + HADDD %1, %2 %endif %endmacro %macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp -%if cpuflag(ssse3) +; AVX2 version uses a precalculated extra input that +; can be re-used across calls +%if sizeof%1==32 + ; %3 = abcdefgh ijklmnop (lower address) + ; %2 = ABCDEFGH IJKLMNOP (higher address) +; vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH +%if %4 < 16 + palignr %1, %5, %3, %4 ; %1 = bcdefghi jklmnopA +%else + palignr %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO +%endif +%elif cpuflag(ssse3) %if %0==5 palignr %1, %2, %3, %4 %else @@ -475,7 +518,7 @@ %endif %elifidn %1, q shufps m%5, m%3, m%4, q3131 - shufps m%3, m%4, q2020 + shufps m%3, m%3, m%4, q2020 SWAP %4, %5 %endif %endmacro @@ -498,22 +541,24 @@ ; %5(%6): tmpregs %if %1!=0 ; have to reorder stuff for horizontal op %ifidn %2, sumsub - %define ORDER ord - ; sumsub needs order because a-b != b-a unless a=b + %define ORDER ord + ; sumsub needs order because a-b != b-a unless a=b %else - %define ORDER unord - ; if we just max, order doesn't matter (allows pblendw+or in sse4) + %define ORDER unord + ; if we just max, order doesn't matter (allows pblendw+or in sse4) %endif %if %1==1 - TRANS d, ORDER, %3, %4, %5, %6 + TRANS d, ORDER, %3, %4, %5, %6 %elif %1==2 - %if mmsize==8 - SBUTTERFLY dq, %3, %4, %5 - %else - TRANS q, ORDER, %3, %4, %5, %6 - %endif + %if mmsize==8 + SBUTTERFLY dq, %3, %4, %5 + %else + TRANS q, ORDER, %3, %4, %5, %6 + %endif %elif %1==4 - 
SBUTTERFLY qdq, %3, %4, %5 + SBUTTERFLY qdq, %3, %4, %5 + %elif %1==8 + SBUTTERFLY dqqq, %3, %4, %5 %endif %endif %ifidn %2, sumsub @@ -675,11 +720,18 @@ %endmacro -%macro LOAD_DIFF 5 +%macro LOAD_DIFF 5-6 1 %if HIGH_BIT_DEPTH +%if %6 ; %5 aligned? mova %1, %4 psubw %1, %5 -%elifidn %3, none +%else + movu %1, %4 + movu %2, %5 + psubw %1, %2 +%endif +%else ; !HIGH_BIT_DEPTH +%ifidn %3, none movh %1, %4 movh %2, %5 punpcklbw %1, %2 @@ -692,6 +744,7 @@ punpcklbw %2, %3 psubw %1, %2 %endif +%endif ; HIGH_BIT_DEPTH %endmacro %macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr @@ -742,17 +795,27 @@ movh [r0+3*FDEC_STRIDE], %4 %endmacro -%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment? - LOAD_DIFF m%1, m%5, m%7, [%8], [%9] - LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3] - LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3] - LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5] +%macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned? + LOAD_DIFF m%1, m%5, m%7, [%8], [%9], %11 + LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3], %11 + LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11 + LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5], %11 %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro +; 2xdst, 2xtmp, 2xsrcrow +%macro LOAD_DIFF16x2_AVX2 6 + pmovzxbw m%1, [r1+%5*FENC_STRIDE] + pmovzxbw m%2, [r1+%6*FENC_STRIDE] + pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE] + pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE] + psubw m%1, m%3 + psubw m%2, m%4 +%endmacro + %macro DIFFx2 6-7 movh %3, %5 punpcklbw %3, %4
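For readers following the x86util changes: HADDD (including the new AVX2 path that first folds the upper 128-bit lane down) is simply a horizontal reduction of packed 32-bit lanes into the low element. A trivial scalar model, assuming the lane values are already in an array:

#include <stdint.h>

/* Sum n packed 32-bit lanes (2 for MMX, 4 for XMM, 8 for YMM); the macro
 * leaves this sum in the low dword of the destination register. */
static int32_t haddd_model( const int32_t *lane, int n )
{
    uint32_t sum = 0;                 /* paddd wraps modulo 2^32, so accumulate unsigned */
    for( int i = 0; i < n; i++ )
        sum += (uint32_t)lane[i];
    return (int32_t)sum;
}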
View file
x264-snapshot-20130224-2245.tar.bz2/configure -> x264-snapshot-20130723-2245.tar.bz2/configure
Changed
@@ -25,6 +25,7 @@ --system-libx264 use system libx264 instead of internal --enable-shared build shared library --enable-static build static library + --disable-opencl disable OpenCL features --disable-gpl disable GPL-only features --disable-thread disable multithreaded encoding --enable-win32thread use win32threads (windows only) @@ -46,7 +47,7 @@ --sysroot=SYSROOT root of cross-build tree External library support: - --disable-avs disable avisynth support (windows only) + --disable-avs disable avisynth support --disable-swscale disable swscale support --disable-lavf disable libavformat support --disable-ffms disable ffmpegsource support @@ -80,6 +81,9 @@ [[ "$arg" = -falign-loops* ]] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Wshadow ] && arg= + [[ "$arg" = -mpreferred-stack-boundary* ]] && arg= + [[ "$arg" = -l* ]] && arg= + [[ "$arg" = -L* ]] && arg= if [ $compiler = ICL ]; then [ "$arg" = -Wall ] && arg=-W0 [ "$arg" = -g ] && arg=-Z7 @@ -133,7 +137,7 @@ [ -n "$1" ] && echo "#include <$1>" > conftest.c echo "int main () { $3 return 0; }" >> conftest.c if [ $compiler = ICL ]; then - cc_cmd="$CC conftest.c $CFLAGS $2 -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" + cc_cmd="$CC conftest.c $(intel_cflags $CFLAGS $2) -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" else cc_cmd="$CC conftest.c $CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest" fi @@ -273,6 +277,7 @@ bit_depth="8" chroma_format="all" compiler="GNU" +opencl="yes" CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)" LDFLAGS="$LDFLAGS" @@ -285,7 +290,7 @@ EXE="" # list of all preprocessor HAVE values we can define -CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT" +CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL" # parse options @@ -381,6 +386,9 @@ --host=*) host="$optarg" ;; + --disable-opencl) + opencl="no" + ;; --cross-prefix=*) cross_prefix="$optarg" ;; @@ -521,6 +529,13 @@ fi HAVE_GETOPT_LONG=0 ;; + *qnx*) + SYS="QNX" + define HAVE_MALLOC_H + libm="-lm" + HAVE_GETOPT_LONG=0 + CFLAGS="$CFLAGS -I\$(SRCPATH)/extras" + ;; *) die "Unknown system $host, edit the configure" ;; @@ -564,6 +579,7 @@ elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win32 -DPREFIX" LDFLAGS="$LDFLAGS -Wl,--large-address-aware" + [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-i386 $RCFLAGS" else ASFLAGS="$ASFLAGS -f elf" @@ -583,6 +599,7 @@ ASFLAGS="$ASFLAGS -f win32 -m amd64" # only the GNU toolchain is inconsistent in prefixing function names with _ [ $compiler = GNU ] && cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX" + [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-x86-64 $RCFLAGS" else ASFLAGS="$ASFLAGS -f elf -m amd64" @@ -703,6 +720,10 @@ exit 1 fi define HAVE_MMX + if cc_check '' -mpreferred-stack-boundary=5 ; then + CFLAGS="$CFLAGS -mpreferred-stack-boundary=5" + define HAVE_32B_STACK_ALIGNMENT + fi fi if [ $asm = auto -a $ARCH = ARM ] ; then @@ -770,6 +791,9 @@ thread="win32" fi ;; + QNX) + cc_check pthread.h -lc && thread="posix" && libpthread="-lc" + ;; *) cc_check pthread.h -lpthread && thread="posix" && libpthread="-lpthread" ;; @@ -917,8 +941,16 @@ avs="no" # 
cygwin can use avisynth if it can use LoadLibrary if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibrary(0);") ; then - avs="yes" + avs="avisynth" + define HAVE_AVS + define USE_AVXSYNTH 0 + elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then + # AvxSynth currently only supports Linux and OSX + avs="avxsynth" define HAVE_AVS + define USE_AVXSYNTH 1 + AVS_LIBS="-ldl" + LDFLAGSCLI="$AVS_LIBS $LDFLAGSCLI" fi fi @@ -978,6 +1010,7 @@ if [ "$bit_depth" -gt "8" ]; then define HIGH_BIT_DEPTH ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1" + opencl="no" else ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=0" fi @@ -992,6 +1025,30 @@ [ $interlaced = yes ] && define HAVE_INTERLACED && x264_interlaced=1 || x264_interlaced=0 +libdl="" +if [ "$opencl" = "yes" ]; then + opencl="no" + log_check "for perl" + output=$(perl -v) + if [ "$output" = "" ]; then + log_fail + echo 'OpenCL support requires perl to compile.' + echo 'use --disable-opencl to compile without OpenCL.' + exit 1 + fi + log_ok + # cygwin can use opencl if it can use LoadLibrary + if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibrary(0);") ; then + opencl="yes" + define HAVE_OPENCL + elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then + opencl="yes" + define HAVE_OPENCL + libdl="-ldl" + fi + LDFLAGS="$LDFLAGS $libdl" +fi + #define undefined vars as 0 for var in $CONFIG_HAVE; do grep -q "HAVE_$var 1" config.h || define HAVE_$var 0 @@ -1083,6 +1140,7 @@ PROF_GEN_LD=$PROF_GEN_LD PROF_USE_CC=$PROF_USE_CC PROF_USE_LD=$PROF_USE_LD +HAVE_OPENCL=$opencl EOF if [ $compiler = ICL ]; then @@ -1162,7 +1220,7 @@ Description: H.264 (MPEG4 AVC) encoder library Version: $(grep POINTVER < x264_config.h | sed -e 's/.* "//; s/".*//') Libs: -L$libdir -lx264 -Libs.private: $libpthread $libm +Libs.private: $libpthread $libm $libdl Cflags: -I$includedir EOF @@ -1186,6 +1244,7 @@ gpac: $gpac gpl: $gpl thread: $thread +opencl: $opencl filters: $filters debug: $debug gprof: $gprof
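The configure additions link libdl on Linux/OS X precisely so OpenCL can be opened at run time and the encoder can degrade gracefully when it is absent (see the x264_opencl_load_library failure path in the encoder.c hunk further down). A minimal sketch of that pattern, with an assumed library name and one probed symbol; this is not x264's actual loader:

#include <dlfcn.h>
#include <stddef.h>

typedef int (*clGetPlatformIDs_f)( unsigned num_entries, void *platforms, unsigned *num_platforms );

/* Returns a library handle on success, NULL otherwise; on failure the caller
 * just logs a warning and clears b_opencl. Build with -ldl. */
static void *try_load_opencl( void )
{
    void *lib = dlopen( "libOpenCL.so.1", RTLD_NOW ); /* assumed SONAME */
    if( !lib )
        return NULL;
    if( !(clGetPlatformIDs_f)dlsym( lib, "clGetPlatformIDs" ) )
    {
        dlclose( lib );
        return NULL;
    }
    return lib;
}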
View file
x264-snapshot-20130224-2245.tar.bz2/doc/regression_test.txt -> x264-snapshot-20130723-2245.tar.bz2/doc/regression_test.txt
Changed
@@ -4,7 +4,7 @@ inherently caused by compression. # Install and compile x264 : -svn co svn://svn.videolan.org/x264/trunk x264 +git clone git://git.videolan.org/x264.git x264 cd x264 ./configure make
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/analyse.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/analyse.c
Changed
@@ -467,8 +467,8 @@ if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col ) h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv ); } - h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; - h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; + h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; + h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) ) { int mb_y = h->mb.i_mb_y >> SLICE_MBAFF; @@ -516,8 +516,8 @@ h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range ); h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] ); h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 ); - h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; - h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; + h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; + h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; } } if( PARAM_INTERLACED ) @@ -527,8 +527,8 @@ h->mb.mv_max[1] = h->mb.mv_maxy_row[i]; h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i]; h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i]; - h->mb.mv_min_fpel[1] = h->mb.mv_miny_fpel_row[i]; - h->mb.mv_max_fpel[1] = h->mb.mv_maxy_fpel_row[i]; + h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i]; + h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i]; } #undef CLIP_FMV @@ -888,7 +888,7 @@ { if( !h->mb.b_lossless && predict_mode[5] >= 0 ) { - int satd[9]; + ALIGNED_ARRAY_16( int32_t, satd,[9] ); h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd ); int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; satd[i_pred_mode] -= 3 * lambda; @@ -1006,7 +1006,7 @@ { if( !h->mb.b_lossless && predict_mode[5] >= 0 ) { - int satd[9]; + ALIGNED_ARRAY_16( int32_t, satd,[9] ); h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd ); int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; satd[i_pred_mode] -= 3 * lambda; @@ -1706,7 +1706,7 @@ static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size, int chroma ) { - ALIGNED_ARRAY_16( pixel, pix1,[16*16] ); + ALIGNED_ARRAY_N( pixel, pix1,[16*16] ); pixel *pix2 = pix1+8; int i_stride = h->mb.pic.i_stride[1]; int chroma_h_shift = chroma <= CHROMA_422; @@ -1890,8 +1890,8 @@ static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel ) { - ALIGNED_ARRAY_16( pixel, pix, [4],[16*16] ); - ALIGNED_ARRAY_16( pixel, bi, [2],[16*16] ); + ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] ); + ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] ); int i_chroma_cost = 0; int chromapix = h->luma2chroma_pixel[i_pixel]; @@ -1984,8 +1984,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) { - ALIGNED_ARRAY_16( pixel, pix0,[16*16] ); - ALIGNED_ARRAY_16( pixel, pix1,[16*16] ); + ALIGNED_ARRAY_N( pixel, pix0,[16*16] ); + ALIGNED_ARRAY_N( pixel, pix1,[16*16] ); pixel *src0, *src1; intptr_t stride0 = 16, stride1 = 16; int i_ref, i_mvc; @@ -2454,7 +2454,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd ) { - ALIGNED_ARRAY_16( pixel, pix,[2],[16*8] ); + ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] ); ALIGNED_4( int16_t mvc[3][2] ); h->mb.i_partition = D_16x8; @@ -2836,12 +2836,28 @@ int plane_count = CHROMA444 && h->mb.b_chroma_me ? 
3 : 1; int i_cost8 = 0, i_cost4 = 0; - for( int p = 0; p < plane_count; p++ ) + /* Not all platforms have a merged SATD function */ + if( h->pixf.sa8d_satd[PIXEL_16x16] ) { - i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, - h->mb.pic.p_fdec[p], FDEC_STRIDE ); - i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, - h->mb.pic.p_fdec[p], FDEC_STRIDE ); + uint64_t cost = 0; + for( int p = 0; p < plane_count; p++ ) + { + cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, + h->mb.pic.p_fdec[p], FDEC_STRIDE ); + + } + i_cost8 = (uint32_t)cost; + i_cost4 = (uint32_t)(cost >> 32); + } + else + { + for( int p = 0; p < plane_count; p++ ) + { + i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, + h->mb.pic.p_fdec[p], FDEC_STRIDE ); + i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, + h->mb.pic.p_fdec[p], FDEC_STRIDE ); + } } h->mb.b_transform_8x8 = i_cost8 < i_cost4; @@ -3002,8 +3018,8 @@ h->mb.i_qp = x264_ratecontrol_mb_qp( h ); /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB, * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */ - if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ) - h->mb.i_qp = h->mb.i_last_qp; + if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 ) + h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp; if( h->param.analyse.b_mb_info ) h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
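The analyse.c change above relies on the merged sa8d_satd primitive returning both metrics in one 64-bit value: sa8d in the low 32 bits, satd in the high 32 bits, which is exactly how the caller unpacks it. A hedged sketch of that packing convention (the packing helper is illustrative, not the SIMD kernel):

#include <stdint.h>

static uint64_t pack_sa8d_satd( uint32_t sa8d, uint32_t satd )
{
    return (uint64_t)satd << 32 | sa8d;
}

/* Caller side, mirroring the hunk above:
 *   uint64_t cost  = pack_sa8d_satd( sa8d, satd );
 *   uint32_t cost8 = (uint32_t)cost;         // sa8d sum
 *   uint32_t cost4 = (uint32_t)(cost >> 32); // satd sum
 */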
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/analyse.h -> x264-snapshot-20130723-2245.tar.bz2/encoder/analyse.h
Changed
@@ -34,7 +34,7 @@ void x264_macroblock_analyse( x264_t *h ); void x264_slicetype_decide( x264_t *h ); -void x264_slicetype_analyse( x264_t *h, int keyframe ); +void x264_slicetype_analyse( x264_t *h, int intra_minigop ); int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/cabac.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/cabac.c
Changed
@@ -152,8 +152,10 @@ int i_dqp = h->mb.i_qp - h->mb.i_last_qp; int ctx; - /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */ - if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] ) + /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely + * flat background area. Don't do this if it would raise the quantizer, since that could + * cause unexpected deblocking artifacts. */ + if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] && h->mb.i_qp > h->mb.i_last_qp ) { #if !RDO_SKIP_BS h->mb.i_qp = h->mb.i_last_qp; @@ -161,9 +163,7 @@ i_dqp = 0; } - /* Since, per the above, empty-CBP I16x16 blocks never have delta quants, - * we don't have to check for them. */ - ctx = h->mb.i_last_dqp && h->mb.cbp[h->mb.i_mb_prev_xy]; + ctx = h->mb.i_last_dqp && (h->mb.type[h->mb.i_mb_prev_xy] == I_16x16 || (h->mb.cbp[h->mb.i_mb_prev_xy]&0x3f)); if( i_dqp != 0 ) { @@ -644,26 +644,17 @@ } } -static const uint16_t significant_coeff_flag_offset[2][14] = -{ - { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 }, - { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 } -}; -static const uint16_t last_coeff_flag_offset[2][14] = -{ - { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748 }, - { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757 } -}; -static const uint16_t coeff_abs_level_m1_offset[14] = -{ - 227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766 -}; -#if RDO_SKIP_BS -extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][63]; +#if !RDO_SKIP_BS +extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][64]; extern const uint8_t x264_last_coeff_flag_offset_8x8[63]; extern const uint8_t x264_coeff_flag_offset_chroma_422_dc[7]; +extern const uint16_t x264_significant_coeff_flag_offset[2][16]; +extern const uint16_t x264_last_coeff_flag_offset[2][16]; +extern const uint16_t x264_coeff_abs_level_m1_offset[16]; +extern const uint8_t x264_count_cat_m1[14]; #else -const uint8_t x264_significant_coeff_flag_offset_8x8[2][63] = +/* Padded to [64] for easier addressing */ +const uint8_t x264_significant_coeff_flag_offset_8x8[2][64] = {{ 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, @@ -683,6 +674,21 @@ 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 }; const uint8_t x264_coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */ +const uint16_t x264_significant_coeff_flag_offset[2][16] = +{ + { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718, 0, 0 }, + { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733, 0, 0 } +}; +const uint16_t x264_last_coeff_flag_offset[2][16] = +{ + { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748, 0, 0 }, + { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757, 0, 0 } +}; +const uint16_t x264_coeff_abs_level_m1_offset[16] = +{ + 227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766 +}; +const uint8_t x264_count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63}; #endif // node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). 
@@ -694,20 +700,20 @@ /* 4:2:2 chroma dc uses a slightly different state machine for some reason, also note that * 4:2:0 chroma dc doesn't use the last state so it has identical output with both arrays. */ static const uint8_t coeff_abs_levelgt1_ctx_chroma_dc[8] = { 5, 5, 5, 5, 6, 7, 8, 8 }; + static const uint8_t coeff_abs_level_transition[2][8] = { /* update node ctx after coding a level=1 */ { 1, 2, 3, 3, 4, 5, 6, 7 }, /* update node ctx after coding a level>1 */ { 4, 4, 4, 4, 5, 6, 7, 7 } }; -static const uint8_t count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63}; #if !RDO_SKIP_BS static ALWAYS_INLINE void x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int chroma422dc ) { - int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat]; + int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat]; int coeff_idx = -1, node_ctx = 0; int last = h->quantf.coeff_last[ctx_block_cat]( l ); const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; @@ -747,7 +753,7 @@ } else { - int count_m1 = count_cat_m1[ctx_block_cat]; + int count_m1 = x264_count_cat_m1[ctx_block_cat]; if( count_m1 == 63 ) { const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; @@ -787,10 +793,20 @@ x264_cabac_encode_bypass( cb, coeff_sign ); } while( --coeff_idx >= 0 ); } -static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) + +void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 ); } + +static void ALWAYS_INLINE x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_c( h, cb, ctx_block_cat, l ); +#endif +} static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { /* Template a version specifically for chroma 4:2:2 DC in order to avoid @@ -806,16 +822,16 @@ static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc ) { const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; - int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat]; + int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat]; int last = h->quantf.coeff_last[ctx_block_cat]( l ); int coeff_abs = abs(l[last]); int ctx = coeff_abs_level1_ctx[0] + ctx_level; int node_ctx; const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; - if( last != (b_8x8 ? 63 : chroma422dc ? 7 : count_cat_m1[ctx_block_cat]) ) + if( last != (b_8x8 ? 63 : chroma422dc ? 
7 : x264_count_cat_m1[ctx_block_cat]) ) { x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] : chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 ); @@ -888,17 +904,35 @@ } } -static void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 1, 0 ); } -static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 ); + x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0, 0 ); } -static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) + +static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0, 0 ); +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l ); +#endif +} +static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l ); +#endif +} + +static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 ); } #endif @@ -1051,25 +1085,23 @@ MUNGE_8x8_NNZ( BACKUP ) for( int p = 0; p < 3; p++ ) - for( int i = 0; i < 4; i++ ) - if( h->mb.i_cbp_luma & ( 1 << i ) ) - x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i*4+p*16, h->dct.luma8x8[i+p*4], b_intra ); + FOREACH_BIT( i, 0, h->mb.i_cbp_luma ) + x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i*4+p*16, h->dct.luma8x8[i+p*4], b_intra ); MUNGE_8x8_NNZ( RESTORE ) } else { - for( int i = 0; i < 4; i++ ) - if( h->mb.i_cbp_luma & ( 1 << i ) ) - x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i] ); + FOREACH_BIT( i, 0, h->mb.i_cbp_luma ) + x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i] ); } } else { for( int p = 0; p < plane_count; p++ ) - for( int i = 0; i < 16; i++ ) - if( h->mb.i_cbp_luma & ( 1 << ( i >> 2 ) ) ) - x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i+p*16, h->dct.luma4x4[i+p*16], b_intra ); + FOREACH_BIT( i8x8, 0, h->mb.i_cbp_luma ) + for( int i = 0; i < 4; i++ ) + x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i+i8x8*4+p*16, h->dct.luma4x4[i+i8x8*4+p*16], b_intra ); } if( chroma && h->mb.i_cbp_chroma ) /* Chroma DC residual present */
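The CABAC residual loops above now iterate only over the set bits of the coded block pattern via FOREACH_BIT (presumably a macro defined elsewhere in the tree) instead of testing every 8x8 index. A plain-C equivalent of that bit-iteration pattern, for reference only:

#include <stdint.h>

static void visit_set_bits( uint32_t mask, void (*visit)( int idx, void *ctx ), void *ctx )
{
    while( mask )
    {
        int idx = __builtin_ctz( mask ); /* lowest set bit; GCC/Clang builtin, mask != 0 here */
        visit( idx, ctx );
        mask &= mask - 1;                /* clear that bit and continue */
    }
}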
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/cavlc.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/cavlc.c
Changed
@@ -128,13 +128,13 @@ unsigned int i_sign; /* level and run and total */ - /* set these to 2 to allow branchless i_trailing calculation */ - runlevel.level[1] = 2; - runlevel.level[2] = 2; i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel ); x264_prefetch( &x264_run_before[runlevel.mask] ); i_total_zero = runlevel.last + 1 - i_total; + /* branchless i_trailing calculation */ + runlevel.level[i_total+0] = 2; + runlevel.level[i_total+1] = 2; i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1 | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2) | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4); @@ -213,11 +213,14 @@ bs_t *s = &h->out.bs; int i_dqp = h->mb.i_qp - h->mb.i_last_qp; - /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */ + /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely + * flat background area. Don't do this if it would raise the quantizer, since that could + * cause unexpected deblocking artifacts. */ if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) && !h->mb.cache.non_zero_count[x264_scan8[LUMA_DC]] && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] - && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] ) + && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] + && h->mb.i_qp > h->mb.i_last_qp ) { #if !RDO_SKIP_BS h->mb.i_qp = h->mb.i_last_qp; @@ -268,20 +271,33 @@ } } -static inline void x264_cavlc_macroblock_luma_residual( x264_t *h, int i8start, int i8end ) +static ALWAYS_INLINE void x264_cavlc_macroblock_luma_residual( x264_t *h, int plane_count ) { if( h->mb.b_transform_8x8 ) { /* shuffle 8x8 dct coeffs into 4x4 lists */ - for( int i8 = i8start; i8 <= i8end; i8++ ) - if( h->mb.cache.non_zero_count[x264_scan8[i8*4]] ) - h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] ); + for( int p = 0; p < plane_count; p++ ) + for( int i8 = 0; i8 < 4; i8++ ) + if( h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] ) + h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[p*16+i8*4], h->dct.luma8x8[p*4+i8], + &h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] ); } - for( int i8 = i8start; i8 <= i8end; i8++ ) - if( h->mb.i_cbp_luma & (1 << (i8&3)) ) + for( int p = 0; p < plane_count; p++ ) + FOREACH_BIT( i8, 0, h->mb.i_cbp_luma ) for( int i4 = 0; i4 < 4; i4++ ) - x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4] ); + x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] ); +} + +static ALWAYS_INLINE void x264_cavlc_partition_luma_residual( x264_t *h, int i8, int p ) +{ + if( h->mb.b_transform_8x8 && h->mb.cache.non_zero_count[x264_scan8[i8*4]] ) + h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4+p*16], h->dct.luma8x8[i8+p*4], + &h->mb.cache.non_zero_count[x264_scan8[i8*4+p*16]] ); + + if( h->mb.i_cbp_luma & (1 << i8) ) + for( int i4 = 0; i4 < 4; i4++ ) + x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] ); } static void x264_cavlc_mb_header_i( x264_t *h, int i_mb_type, int i_mb_i_offset, int chroma ) @@ -552,7 +568,7 @@ else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma ) { x264_cavlc_qp_delta( h ); - x264_cavlc_macroblock_luma_residual( h, 0, plane_count*4-1 ); + x264_cavlc_macroblock_luma_residual( h, plane_count ); } if( h->mb.i_cbp_chroma ) { @@ -612,7 +628,7 @@ for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- ) { 
for( int p = 0; p < plane_count; p++ ) - x264_cavlc_macroblock_luma_residual( h, p*4+i8, p*4+i8 ); + x264_cavlc_partition_luma_residual( h, i8, p ); if( h->mb.i_cbp_chroma ) { if( CHROMA_FORMAT == CHROMA_422 ) @@ -665,7 +681,7 @@ h->out.bs.i_bits_encoded = x264_cavlc_intra4x4_pred_size( h, 4*i8, i_mode ); bs_write_ue( &h->out.bs, cbp_to_golomb[!CHROMA444][1][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] ); for( int p = 0; p < plane_count; p++ ) - x264_cavlc_macroblock_luma_residual( h, p*4+i8, p*4+i8 ); + x264_cavlc_partition_luma_residual( h, i8, p ); return h->out.bs.i_bits_encoded; }
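The cavlc.c change moves the two sentinel levels to just past the real coefficients so the branchless trailing-ones test keeps working for any i_total. That test relies on the identity that ((x+1) | (1-x)) has its sign bit set exactly when |x| > 1, plus arithmetic right shift of negative integers, which x264 assumes throughout. A small self-check, purely illustrative:

#include <assert.h>
#include <stdint.h>

static int abs_gt1( int32_t x )
{
    return (int)((((x + 1) | (1 - x)) >> 31) & 1);
}

int main( void )
{
    for( int32_t x = -64; x <= 64; x++ )
        assert( abs_gt1( x ) == (x < -1 || x > 1) );
    return 0;
}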
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/encoder.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/encoder.c
Changed
@@ -353,34 +353,49 @@ /* If we are within a reasonable distance of the end of the memory allocated for the bitstream, */ /* reallocate, adding an arbitrary amount of space. */ -static int x264_bitstream_check_buffer( x264_t *h ) +static int x264_bitstream_check_buffer_internal( x264_t *h, int size, int b_cabac, int i_nal ) { - uint8_t *bs_bak = h->out.p_bitstream; - int max_row_size = (2500 << SLICE_MBAFF) * h->mb.i_mb_width; - if( (h->param.b_cabac && (h->cabac.p_end - h->cabac.p < max_row_size)) || - (h->out.bs.p_end - h->out.bs.p < max_row_size) ) + if( (b_cabac && (h->cabac.p_end - h->cabac.p < size)) || + (h->out.bs.p_end - h->out.bs.p < size) ) { - h->out.i_bitstream += max_row_size; - CHECKED_MALLOC( h->out.p_bitstream, h->out.i_bitstream ); - h->mc.memcpy_aligned( h->out.p_bitstream, bs_bak, (h->out.i_bitstream - max_row_size) & ~15 ); - intptr_t delta = h->out.p_bitstream - bs_bak; + int buf_size = h->out.i_bitstream + size; + uint8_t *buf = x264_malloc( buf_size ); + if( !buf ) + return -1; + int aligned_size = h->out.i_bitstream & ~15; + h->mc.memcpy_aligned( buf, h->out.p_bitstream, aligned_size ); + memcpy( buf + aligned_size, h->out.p_bitstream + aligned_size, h->out.i_bitstream - aligned_size ); + + intptr_t delta = buf - h->out.p_bitstream; h->out.bs.p_start += delta; h->out.bs.p += delta; - h->out.bs.p_end = h->out.p_bitstream + h->out.i_bitstream; + h->out.bs.p_end = buf + buf_size; h->cabac.p_start += delta; h->cabac.p += delta; - h->cabac.p_end = h->out.p_bitstream + h->out.i_bitstream; + h->cabac.p_end = buf + buf_size; - for( int i = 0; i <= h->out.i_nal; i++ ) + for( int i = 0; i <= i_nal; i++ ) h->out.nal[i].p_payload += delta; - x264_free( bs_bak ); + + x264_free( h->out.p_bitstream ); + h->out.p_bitstream = buf; + h->out.i_bitstream = buf_size; } return 0; -fail: - x264_free( bs_bak ); - return -1; +} + +static int x264_bitstream_check_buffer( x264_t *h ) +{ + int max_row_size = (2500 << SLICE_MBAFF) * h->mb.i_mb_width; + return x264_bitstream_check_buffer_internal( h, max_row_size, h->param.b_cabac, h->out.i_nal ); +} + +static int x264_bitstream_check_buffer_filler( x264_t *h, int filler ) +{ + filler += 32; // add padding for safety + return x264_bitstream_check_buffer_internal( h, filler, 0, -1 ); } #if HAVE_THREAD @@ -417,17 +432,33 @@ static int x264_validate_parameters( x264_t *h, int b_open ) { #if HAVE_MMX -#ifdef __SSE__ - if( b_open && !(x264_cpu_detect() & X264_CPU_SSE) ) + if( b_open ) { - x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n"); + int cpuflags = x264_cpu_detect(); + int fail = 0; +#ifdef __SSE__ + if( !(cpuflags & X264_CPU_SSE) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm\n"); + fail = 1; + } #else - if( b_open && !(x264_cpu_detect() & X264_CPU_MMX2) ) - { - x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n"); + if( !(cpuflags & X264_CPU_MMX2) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm\n"); + fail = 1; + } #endif - x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n"); - return -1; + if( !fail && !(cpuflags & X264_CPU_CMOV) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n"); + fail = 1; + } + if( fail ) + { + x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n"); + return -1; + 
} } #endif @@ -503,8 +534,6 @@ if( h->param.i_threads == X264_THREADS_AUTO ) h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2; - if( h->param.i_lookahead_threads == X264_THREADS_AUTO ) - h->param.i_lookahead_threads = h->param.i_threads / (h->param.b_sliced_threads?1:6); int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 ); if( h->param.i_threads > 1 ) { @@ -518,7 +547,6 @@ h->param.i_threads = X264_MIN( h->param.i_threads, max_sliced_threads ); } h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX ); - h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) ); if( h->param.i_threads == 1 ) { h->param.b_sliced_threads = 0; @@ -528,6 +556,28 @@ if( h->i_thread_frames > 1 ) h->param.nalu_process = NULL; + if( h->param.b_opencl ) + { +#if !HAVE_OPENCL + x264_log( h, X264_LOG_WARNING, "OpenCL: not compiled with OpenCL support, disabling\n" ); + h->param.b_opencl = 0; +#elif BIT_DEPTH > 8 + x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth, disabling opencl\n" ); + h->param.b_opencl = 0; +#else + if( h->param.i_width < 32 || h->param.i_height < 32 ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: frame size is too small, disabling opencl\n" ); + h->param.b_opencl = 0; + } +#endif + if( h->param.opencl_device_id && h->param.i_opencl_device ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: device id and device skip count configured; dropping skip\n" ); + h->param.i_opencl_device = 0; + } + } + h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE ); if( h->param.i_keyint_max == 1 ) { @@ -646,7 +696,7 @@ h->param.rc.i_rc_method == X264_RC_ABR ) { x264_log( h, X264_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR\n" ); - h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate; + h->param.rc.i_bitrate = h->param.rc.i_vbv_max_bitrate; } } else if( h->param.rc.i_vbv_max_bitrate ) @@ -657,6 +707,22 @@ h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 ); h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 ); + h->param.i_slice_min_mbs = X264_MAX( h->param.i_slice_min_mbs, 0 ); + if( h->param.i_slice_max_mbs ) + h->param.i_slice_min_mbs = X264_MIN( h->param.i_slice_min_mbs, h->param.i_slice_max_mbs/2 ); + else if( !h->param.i_slice_max_size ) + h->param.i_slice_min_mbs = 0; + if( PARAM_INTERLACED && h->param.i_slice_min_mbs ) + { + x264_log( h, X264_LOG_WARNING, "interlace + slice-min-mbs is not implemented\n" ); + h->param.i_slice_min_mbs = 0; + } + int mb_width = (h->param.i_width+15)/16; + if( h->param.i_slice_min_mbs > mb_width ) + { + x264_log( h, X264_LOG_WARNING, "slice-min-mbs > row mb size (%d) not implemented\n", mb_width ); + h->param.i_slice_min_mbs = mb_width; + } int max_slices = (h->param.i_height+((16<<PARAM_INTERLACED)-1))/(16<<PARAM_INTERLACED); if( h->param.b_sliced_threads ) @@ -667,6 +733,8 @@ if( h->param.i_slice_max_mbs || h->param.i_slice_max_size ) h->param.i_slice_count = 0; } + if( h->param.i_slice_count_max > 0 ) + h->param.i_slice_count_max = X264_MAX( h->param.i_slice_count, h->param.i_slice_count_max ); if( h->param.b_bluray_compat ) { @@ -895,6 +963,35 @@ h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART ); + if( h->param.i_lookahead_threads == X264_THREADS_AUTO ) + { + if( h->param.b_sliced_threads ) + h->param.i_lookahead_threads = h->param.i_threads; + 
else + { + /* If we're using much slower lookahead settings than encoding settings, it helps a lot to use + * more lookahead threads. This typically happens in the first pass of a two-pass encode, so + * try to guess at this sort of case. + * + * Tuned by a little bit of real encoding with the various presets. */ + int badapt = h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS; + int subme = X264_MIN( h->param.analyse.i_subpel_refine / 3, 3 ) + (h->param.analyse.i_subpel_refine > 1); + int bframes = X264_MIN( (h->param.i_bframe - 1) / 3, 3 ); + + /* [b-adapt 0/1 vs 2][quantized subme][quantized bframes] */ + static const uint8_t lookahead_thread_div[2][5][4] = + {{{6,6,6,6}, {3,3,3,3}, {4,4,4,4}, {6,6,6,6}, {12,12,12,12}}, + {{3,2,1,1}, {2,1,1,1}, {4,3,2,1}, {6,4,3,2}, {12, 9, 6, 4}}}; + + h->param.i_lookahead_threads = h->param.i_threads / lookahead_thread_div[badapt][subme][bframes]; + /* Since too many lookahead threads significantly degrades lookahead accuracy, limit auto + * lookahead threads to about 8 macroblock rows high each at worst. This number is chosen + * pretty much arbitrarily. */ + h->param.i_lookahead_threads = X264_MIN( h->param.i_lookahead_threads, h->param.i_height / 128 ); + } + } + h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) ); + if( PARAM_INTERLACED ) { if( h->param.analyse.i_me_method >= X264_ME_ESA ) @@ -982,7 +1079,9 @@ BOOLIFY( b_fake_interlaced ); BOOLIFY( b_open_gop ); BOOLIFY( b_bluray_compat ); + BOOLIFY( b_stitchable ); BOOLIFY( b_full_recon ); + BOOLIFY( b_opencl ); BOOLIFY( analyse.b_transform_8x8 ); BOOLIFY( analyse.b_weighted_bipred ); BOOLIFY( analyse.b_chroma_me ); @@ -1221,7 +1320,7 @@ x264_dct_init( h->param.cpu, &h->dctf ); x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced ); memcpy( &h->zigzagf, PARAM_INTERLACED ? &h->zigzagf_interlaced : &h->zigzagf_progressive, sizeof(h->zigzagf) ); - x264_mc_init( h->param.cpu, &h->mc ); + x264_mc_init( h->param.cpu, &h->mc, h->param.b_cpu_independent ); x264_quant_init( h, h->param.cpu, &h->quantf ); x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED ); x264_bitstream_init( h->param.cpu, &h->bsf ); @@ -1236,6 +1335,9 @@ p = buf + sprintf( buf, "using cpu capabilities:" ); for( int i = 0; x264_cpu_names[i].flags; i++ ) { + if( !strcmp(x264_cpu_names[i].name, "SSE") + && h->param.cpu & (X264_CPU_SSE2) ) + continue; if( !strcmp(x264_cpu_names[i].name, "SSE2") && h->param.cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) ) continue; @@ -1245,6 +1347,9 @@ if( !strcmp(x264_cpu_names[i].name, "SSE4.1") && (h->param.cpu & X264_CPU_SSE42) ) continue; + if( !strcmp(x264_cpu_names[i].name, "BMI1") + && (h->param.cpu & X264_CPU_BMI2) ) + continue; if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) ) p += sprintf( p, " %s", x264_cpu_names[i].name ); @@ -1277,7 +1382,7 @@ { x264_log( h, X264_LOG_ERROR, "CLZ test failed: x264 has been miscompiled!\n" ); #if ARCH_X86 || ARCH_X86_64 - x264_log( h, X264_LOG_ERROR, "Are you attempting to run an SSE4a-targeted build on a CPU that\n" ); + x264_log( h, X264_LOG_ERROR, "Are you attempting to run an SSE4a/LZCNT-targeted build on a CPU that\n" ); x264_log( h, X264_LOG_ERROR, "doesn't support it?\n" ); #endif goto fail; @@ -1288,7 +1393,7 @@ * ( h->param.rc.i_rc_method == X264_RC_ABR ? 
pow( 0.95, h->param.rc.i_qp_min ) : pow( 0.95, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor ))); - h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4; + h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4 + 64; /* +4 for startcode, +64 for nal_escape assembly padding */ CHECKED_MALLOC( h->nal_buffer, h->nal_buffer_size ); if( h->param.i_threads > 1 && @@ -1298,6 +1403,18 @@ x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads, (void*)x264_lookahead_thread_init, h ) ) goto fail; +#if HAVE_OPENCL + if( h->param.b_opencl ) + { + h->opencl.ocl = x264_opencl_load_library(); + if( !h->opencl.ocl ) + { + x264_log( h, X264_LOG_WARNING, "failed to load OpenCL\n" ); + h->param.b_opencl = 0; + } + } +#endif + h->thread[0] = h; for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ ) CHECKED_MALLOC( h->thread[i], sizeof(x264_t) ); @@ -1338,6 +1455,11 @@ goto fail; } +#if HAVE_OPENCL + if( h->param.b_opencl && x264_opencl_lookahead_init( h ) < 0 ) + h->param.b_opencl = 0; +#endif + if( x264_lookahead_init( h, i_slicetype_length ) ) goto fail; @@ -1452,7 +1574,9 @@ COPY( i_bframe_pyramid ); COPY( i_slice_max_size ); COPY( i_slice_max_mbs ); + COPY( i_slice_min_mbs ); COPY( i_slice_count ); + COPY( i_slice_count_max ); COPY( b_tff ); /* VBV can't be turned on if it wasn't on to begin with */ @@ -1529,9 +1653,9 @@ x264_nal_t *nal = &h->out.nal[h->out.i_nal]; uint8_t *end = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8]; nal->i_payload = end - nal->p_payload; - /* nal_escape_mmx reads past the end of the input. + /* Assembly implementation of nal_escape reads past the end of the input. * While undefined padding wouldn't actually affect the output, it makes valgrind unhappy. */ - memset( end, 0xff, 32 ); + memset( end, 0xff, 64 ); if( h->param.nalu_process ) h->param.nalu_process( h, nal, h->fenc->opaque ); h->out.i_nal++; @@ -1541,6 +1665,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start ) { + x264_t *h0 = h->thread[0]; int nal_size = 0, previous_nal_size = 0; if( h->param.nalu_process ) @@ -1557,20 +1682,26 @@ nal_size += h->out.nal[i].i_payload; /* Worst-case NAL unit escaping: reallocate the buffer if it's too small. 
*/ - int necessary_size = nal_size * 3/2 + h->out.i_nal * 4; - if( h->nal_buffer_size < necessary_size ) + int necessary_size = previous_nal_size + nal_size * 3/2 + h->out.i_nal * 4 + 4 + 64; + if( h0->nal_buffer_size < necessary_size ) { - h->nal_buffer_size = necessary_size * 2; - uint8_t *buf = x264_malloc( h->nal_buffer_size ); + necessary_size *= 2; + uint8_t *buf = x264_malloc( necessary_size ); if( !buf ) return -1; if( previous_nal_size ) - memcpy( buf, h->nal_buffer, previous_nal_size ); - x264_free( h->nal_buffer ); - h->nal_buffer = buf; + memcpy( buf, h0->nal_buffer, previous_nal_size ); + + intptr_t delta = buf - h0->nal_buffer; + for( int i = 0; i < start; i++ ) + h->out.nal[i].p_payload += delta; + + x264_free( h0->nal_buffer ); + h0->nal_buffer = buf; + h0->nal_buffer_size = necessary_size; } - uint8_t *nal_buffer = h->nal_buffer + previous_nal_size; + uint8_t *nal_buffer = h0->nal_buffer + previous_nal_size; for( int i = start; i < h->out.i_nal; i++ ) { @@ -1581,7 +1712,7 @@ x264_emms(); - return nal_buffer - (h->nal_buffer + previous_nal_size); + return nal_buffer - (h0->nal_buffer + previous_nal_size); } /**************************************************************************** @@ -2224,8 +2355,12 @@ int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1; int b_hpel = h->fdec->b_kept_as_ref; int orig_last_mb = h->sh.i_last_mb; + int thread_last_mb = h->i_threadslice_end * h->mb.i_mb_width - 1; uint8_t *last_emu_check; - x264_bs_bak_t bs_bak[2]; +#define BS_BAK_SLICE_MAX_SIZE 0 +#define BS_BAK_SLICE_MIN_MBS 1 +#define BS_BAK_ROW_VBV 2 + x264_bs_bak_t bs_bak[3]; b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv; bs_realign( &h->out.bs ); @@ -2273,13 +2408,17 @@ if( x264_bitstream_check_buffer( h ) ) return -1; if( !(i_mb_y & SLICE_MBAFF) && h->param.rc.i_vbv_buffer_size ) - x264_bitstream_backup( h, &bs_bak[1], i_skip, 1 ); + x264_bitstream_backup( h, &bs_bak[BS_BAK_ROW_VBV], i_skip, 1 ); if( !h->mb.b_reencode_mb ) x264_fdec_filter_row( h, i_mb_y, 0 ); } if( !(i_mb_y & SLICE_MBAFF) && back_up_bitstream ) - x264_bitstream_backup( h, &bs_bak[0], i_skip, 0 ); + { + x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], i_skip, 0 ); + if( slice_max_size && (thread_last_mb+1-mb_xy) == h->param.i_slice_min_mbs ) + x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MIN_MBS], i_skip, 0 ); + } if( PARAM_INTERLACED ) { @@ -2342,7 +2481,7 @@ h->mb.i_skip_intra = 0; h->mb.b_skip_mc = 0; h->mb.b_overflow = 0; - x264_bitstream_restore( h, &bs_bak[0], &i_skip, 0 ); + x264_bitstream_restore( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], &i_skip, 0 ); goto reencode; } } @@ -2367,26 +2506,50 @@ /* We'll just re-encode this last macroblock if we go over the max slice size. */ if( total_bits - starting_bits > slice_max_size && !h->mb.b_reencode_mb ) { - if( mb_xy-SLICE_MBAFF*h->mb.i_mb_stride != h->sh.i_first_mb ) + if( !x264_frame_new_slice( h, h->fdec ) ) { - x264_bitstream_restore( h, &bs_bak[0], &i_skip, 0 ); - h->mb.b_reencode_mb = 1; - if( SLICE_MBAFF ) + /* Handle the most obnoxious slice-min-mbs edge case: we need to end the slice + * because it's gone over the maximum size, but doing so would violate slice-min-mbs. + * If possible, roll back to the last checkpoint and try again. + * We could try raising QP, but that would break in the case where a slice spans multiple + * rows, which the re-encoding infrastructure can't currently handle. 
*/ + if( mb_xy <= thread_last_mb && (thread_last_mb+1-mb_xy) < h->param.i_slice_min_mbs ) { - // set to bottom of previous mbpair - if( i_mb_x ) - h->sh.i_last_mb = mb_xy-1+h->mb.i_mb_stride*(!(i_mb_y&1)); + if( thread_last_mb-h->param.i_slice_min_mbs < h->sh.i_first_mb+h->param.i_slice_min_mbs ) + { + x264_log( h, X264_LOG_WARNING, "slice-max-size violated (frame %d, cause: slice-min-mbs)\n", h->i_frame ); + slice_max_size = 0; + goto cont; + } + x264_bitstream_restore( h, &bs_bak[BS_BAK_SLICE_MIN_MBS], &i_skip, 0 ); + h->mb.b_reencode_mb = 1; + h->sh.i_last_mb = thread_last_mb-h->param.i_slice_min_mbs; + break; + } + if( mb_xy-SLICE_MBAFF*h->mb.i_mb_stride != h->sh.i_first_mb ) + { + x264_bitstream_restore( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], &i_skip, 0 ); + h->mb.b_reencode_mb = 1; + if( SLICE_MBAFF ) + { + // set to bottom of previous mbpair + if( i_mb_x ) + h->sh.i_last_mb = mb_xy-1+h->mb.i_mb_stride*(!(i_mb_y&1)); + else + h->sh.i_last_mb = (i_mb_y-2+!(i_mb_y&1))*h->mb.i_mb_stride + h->mb.i_mb_width - 1; + } else - h->sh.i_last_mb = (i_mb_y-2+!(i_mb_y&1))*h->mb.i_mb_stride + h->mb.i_mb_width - 1; + h->sh.i_last_mb = mb_xy-1; + break; } else - h->sh.i_last_mb = mb_xy-1; - break; + h->sh.i_last_mb = mb_xy; } else - h->sh.i_last_mb = mb_xy; + slice_max_size = 0; } } +cont: h->mb.b_reencode_mb = 0; #if HAVE_VISUALIZE @@ -2399,7 +2562,7 @@ if( x264_ratecontrol_mb( h, mb_size ) < 0 ) { - x264_bitstream_restore( h, &bs_bak[1], &i_skip, 1 ); + x264_bitstream_restore( h, &bs_bak[BS_BAK_ROW_VBV], &i_skip, 1 ); h->mb.b_reencode_mb = 1; i_mb_x = 0; i_mb_y = i_mb_y - SLICE_MBAFF; @@ -2498,6 +2661,9 @@ i_mb_x = 0; } } + if( h->sh.i_last_mb < h->sh.i_first_mb ) + return 0; + h->out.nal[h->out.i_nal].i_last_mb = h->sh.i_last_mb; if( h->param.b_cabac ) @@ -2596,27 +2762,35 @@ while( h->sh.i_first_mb + SLICE_MBAFF*h->mb.i_mb_stride <= last_thread_mb ) { h->sh.i_last_mb = last_thread_mb; - if( h->param.i_slice_max_mbs ) + if( !i_slice_num || !x264_frame_new_slice( h, h->fdec ) ) { - if( SLICE_MBAFF ) + if( h->param.i_slice_max_mbs ) { - // convert first to mbaff form, add slice-max-mbs, then convert back to normal form - int last_mbaff = 2*(h->sh.i_first_mb % h->mb.i_mb_width) - + h->mb.i_mb_width*(h->sh.i_first_mb / h->mb.i_mb_width) - + h->param.i_slice_max_mbs - 1; - int last_x = (last_mbaff % (2*h->mb.i_mb_width))/2; - int last_y = (last_mbaff / (2*h->mb.i_mb_width))*2 + 1; - h->sh.i_last_mb = last_x + h->mb.i_mb_stride*last_y; + if( SLICE_MBAFF ) + { + // convert first to mbaff form, add slice-max-mbs, then convert back to normal form + int last_mbaff = 2*(h->sh.i_first_mb % h->mb.i_mb_width) + + h->mb.i_mb_width*(h->sh.i_first_mb / h->mb.i_mb_width) + + h->param.i_slice_max_mbs - 1; + int last_x = (last_mbaff % (2*h->mb.i_mb_width))/2; + int last_y = (last_mbaff / (2*h->mb.i_mb_width))*2 + 1; + h->sh.i_last_mb = last_x + h->mb.i_mb_stride*last_y; + } + else + { + h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1; + if( h->sh.i_last_mb < last_thread_mb && last_thread_mb - h->sh.i_last_mb < h->param.i_slice_min_mbs ) + h->sh.i_last_mb = last_thread_mb - h->param.i_slice_min_mbs; + } + i_slice_num++; + } + else if( h->param.i_slice_count && !h->param.b_sliced_threads ) + { + int height = h->mb.i_mb_height >> PARAM_INTERLACED; + int width = h->mb.i_mb_width << PARAM_INTERLACED; + i_slice_num++; + h->sh.i_last_mb = (height * i_slice_num + h->param.i_slice_count/2) / h->param.i_slice_count * width - 1; } - else - h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1; - } - 
else if( h->param.i_slice_count && !h->param.b_sliced_threads ) - { - int height = h->mb.i_mb_height >> PARAM_INTERLACED; - int width = h->mb.i_mb_width << PARAM_INTERLACED; - i_slice_num++; - h->sh.i_last_mb = (height * i_slice_num + h->param.i_slice_count/2) / h->param.i_slice_count * width - 1; } h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb ); if( x264_stack_align( x264_slice_write, h ) ) @@ -2755,6 +2929,11 @@ int i_nal_type, i_nal_ref_idc, i_global_qp; int overhead = NALU_OVERHEAD; +#if HAVE_OPENCL + if( h->opencl.b_fatal_error ) + return -1; +#endif + if( h->i_thread_frames > 1 ) { thread_prev = h->thread[ h->i_thread_phase ]; @@ -3324,6 +3503,8 @@ else f = X264_MAX( 0, filler - overhead ); + if( x264_bitstream_check_buffer_filler( h, f ) ) + return -1; x264_nal_start( h, NAL_FILLER, NAL_PRIORITY_DISPOSABLE ); x264_filler_write( h, &h->out.bs, f ); if( x264_nal_end( h ) ) @@ -3503,6 +3684,11 @@ x264_lookahead_delete( h ); +#if HAVE_OPENCL + x264_opencl_lookahead_delete( h ); + x264_opencl_function_t *ocl = h->opencl.ocl; +#endif + if( h->param.b_sliced_threads ) x264_threadpool_wait_all( h ); if( h->param.i_threads > 1 ) @@ -3852,6 +4038,9 @@ x264_pthread_cond_destroy( &h->thread[i]->cv ); x264_free( h->thread[i] ); } +#if HAVE_OPENCL + x264_opencl_close_library( ocl ); +#endif } int x264_encoder_delayed_frames( x264_t *h )
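A note on the buffer sizing in the encoder.c hunk above: H.264 emulation prevention can expand every pair of zero bytes into three (00 00 becomes 00 00 03 when the next byte is small), so escaped NAL data can grow by at most a 3/2 factor; the extra 4 bytes cover a start code and the 64 bytes of padding keep the assembly nal_escape from reading past the end of the buffer, as the updated comment says. A minimal sketch of that sizing rule, with hypothetical names (the real code also adds previous_nal_size for NALs already written, omitted here):

#include <stdio.h>
#include <stddef.h>

/* Illustrative only: worst-case buffer size after NAL escaping.
 * 3/2  - emulation prevention worst case (00 00 -> 00 00 03)
 * *4   - a 4-byte start code per NAL, +4 for the current one
 * +64  - padding so the SIMD nal_escape may safely over-read */
static size_t nal_worst_case_size( size_t payload_bytes, int nal_count )
{
    return payload_bytes * 3 / 2 + (size_t)nal_count * 4 + 4 + 64;
}

int main( void )
{
    printf( "worst case for 1000 payload bytes in 2 NALs: %zu bytes\n",
            nal_worst_case_size( 1000, 2 ) );
    return 0;
}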
x264-snapshot-20130224-2245.tar.bz2/encoder/lookahead.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/lookahead.c
Changed
@@ -70,18 +70,19 @@ x264_stack_align( x264_slicetype_decide, h ); x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] ); + int shift_frames = h->lookahead->next.list[0]->i_bframes + 1; x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex ); while( h->lookahead->ofbuf.i_size == h->lookahead->ofbuf.i_max_size ) x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_empty, &h->lookahead->ofbuf.mutex ); x264_pthread_mutex_lock( &h->lookahead->next.mutex ); - x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 ); + x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames ); x264_pthread_mutex_unlock( &h->lookahead->next.mutex ); /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) - x264_stack_align( x264_slicetype_analyse, h, 1 ); + x264_stack_align( x264_slicetype_analyse, h, shift_frames ); x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); } @@ -236,11 +237,12 @@ x264_stack_align( x264_slicetype_decide, h ); x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] ); - x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 ); + int shift_frames = h->lookahead->next.list[0]->i_bframes + 1; + x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames ); /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) - x264_stack_align( x264_slicetype_analyse, h, 1 ); + x264_stack_align( x264_slicetype_analyse, h, shift_frames ); x264_lookahead_encoder_shift( h ); }
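The lookahead.c change caches shift_frames before x264_lookahead_shift rearranges the next list, then reuses it both for the shift and for x264_slicetype_analyse (which previously received a hard-coded 1). A toy model of why the value has to be captured first, using hypothetical stand-in types:

#include <stdio.h>

/* After the shift, list[0] is a different frame, so evaluating
 * "list[0]->i_bframes + 1" afterwards would describe the wrong frame. */
typedef struct { int i_bframes; } toy_frame_t;

static void toy_shift( toy_frame_t **list, int *size, int count )
{
    for( int i = count; i < *size; i++ )
        list[i - count] = list[i];
    *size -= count;
}

int main( void )
{
    toy_frame_t a = { 2 }, b = { 0 }, c = { 1 }, d = { 0 };
    toy_frame_t *next[4] = { &a, &b, &c, &d };
    int size = 4;

    int shift_frames = next[0]->i_bframes + 1;   /* capture first: 3 */
    toy_shift( next, &size, shift_frames );

    printf( "cached value: %d, value read after the shift: %d\n",
            shift_frames, next[0]->i_bframes + 1 );   /* 3 vs 1 */
    return 0;
}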
x264-snapshot-20130224-2245.tar.bz2/encoder/macroblock.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/macroblock.c
Changed
@@ -128,8 +128,8 @@ pixel *p_src = h->mb.pic.p_fenc[p]; pixel *p_dst = h->mb.pic.p_fdec[p]; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] ); - ALIGNED_ARRAY_16( dctcoef, dct_dc4x4,[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] ); int nz, block_cbp = 0; int decimate_score = h->mb.b_dct_decimate ? 0 : 9; @@ -157,28 +157,51 @@ return; } + M32( &h->mb.cache.non_zero_count[x264_scan8[ 0+p*16]] ) = 0; + M32( &h->mb.cache.non_zero_count[x264_scan8[ 2+p*16]] ) = 0; + M32( &h->mb.cache.non_zero_count[x264_scan8[ 8+p*16]] ) = 0; + M32( &h->mb.cache.non_zero_count[x264_scan8[10+p*16]] ) = 0; + h->dctf.sub16x16_dct( dct4x4, p_src, p_dst ); - for( int i = 0; i < 16; i++ ) + if( h->mb.b_noise_reduction ) + for( int idx = 0; idx < 16; idx++ ) + h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 ); + + for( int idx = 0; idx < 16; idx++ ) { - /* copy dc coeff */ - if( h->mb.b_noise_reduction ) - h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[0], h->nr_offset[0], 16 ); - dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0]; - dct4x4[i][0] = 0; + dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0]; + dct4x4[idx][0] = 0; + } - /* quant/scan/dequant */ - if( h->mb.b_trellis ) - nz = x264_quant_4x4_trellis( h, dct4x4[i], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, i ); - else - nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); - h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz; - if( nz ) + if( h->mb.b_trellis ) + { + for( int idx = 0; idx < 16; idx++ ) + if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) ) + { + block_cbp = 0xf; + h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp ); + if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] ); + h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1; + } + } + else + { + for( int i8x8 = 0; i8x8 < 4; i8x8++ ) { - h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+i], dct4x4[i] ); - h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[i_quant_cat], i_qp ); - if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+i] ); - block_cbp = 0xf; + nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); + if( nz ) + { + block_cbp = 0xf; + FOREACH_BIT( idx, i8x8*4, nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp ); + if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] ); + h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1; + } + } } } @@ -245,6 +268,18 @@ h->mb.i_cbp_chroma = 0; h->nr_count[2] += h->mb.b_noise_reduction * 4; + M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0; + if( chroma422 ) + { + M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0; + } + /* Early termination: check variance of chroma residual before encoding. * Don't bother trying early termination at low QPs. 
* Values are experimentally derived. */ @@ -259,17 +294,6 @@ score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] ); if( score < thresh*4 ) { - M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0; - if( chroma422 ) - { - M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0; - } h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0; h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0; @@ -326,10 +350,10 @@ { pixel *p_src = h->mb.pic.p_fenc[1+ch]; pixel *p_dst = h->mb.pic.p_fdec[1+ch]; - int i_decimate_score = 0; + int i_decimate_score = b_decimate ? 0 : 7; int nz_ac = 0; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] ); if( h->mb.b_lossless ) { @@ -361,20 +385,40 @@ dct2x2dc( dct_dc, dct4x4 ); /* calculate dct coeffs */ - for( int i = 0; i < (chroma422?8:4); i++ ) + for( int i8x8 = 0; i8x8 < (chroma422?2:1); i8x8++ ) { if( h->mb.b_trellis ) - nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ); + { + for( int i4x4 = 0; i4x4 < 4; i4x4++ ) + { + if( x264_quant_4x4_trellis( h, dct4x4[i8x8*4+i4x4], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ) ) + { + int idx = 16+ch*16+i8x8*8+i4x4; + h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] ); + h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp ); + if( i_decimate_score < 7 ) + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = 1; + nz_ac = 1; + } + } + } else - nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] ); - h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz; - if( nz ) { - nz_ac = 1; - h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] ); - h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp ); - if( b_decimate ) - i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] ); + nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4IC+b_inter][i_qp], + h->quant4_bias[CQM_4IC+b_inter][i_qp] ); + nz_ac |= nz; + + FOREACH_BIT( i4x4, 0, nz ) + { + int idx = 16+ch*16+i8x8*8+i4x4; + + h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] ); + h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp ); + if( i_decimate_score < 7 ) + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = 1; + } } } @@ -390,7 +434,7 @@ h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc; - if( (b_decimate && i_decimate_score < 7) || !nz_ac ) + if( i_decimate_score < 7 || !nz_ac ) { /* Decimate the block */ M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0; @@ -646,11 +690,8 @@ { h->mb.b_transform_8x8 = 0; - for( int p = 0; p < plane_count; p++ ) - { + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) x264_mb_encode_i16x16( h, p, i_qp ); - i_qp = h->mb.i_chroma_qp; - } } else if( h->mb.i_type == I_8x8 ) { @@ -668,14 +709,13 @@ if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->dct.luma8x8, 
h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) ); } - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0 ; i < 4; i++ ) { int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]]; x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 ); } - i_qp = h->mb.i_chroma_qp; } } else if( h->mb.i_type == I_4x4 ) @@ -694,7 +734,7 @@ if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) ); } - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0 ; i < 16; i++ ) { @@ -707,7 +747,6 @@ x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 ); } - i_qp = h->mb.i_chroma_qp; } } else /* Inter MB */ @@ -744,11 +783,12 @@ } else if( h->mb.b_transform_8x8 ) { - ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] ); + ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] ); b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { + CLEAR_16x16_NNZ( p ); h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] ); h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4; @@ -772,99 +812,92 @@ } } - if( i_decimate_mb < 6 && b_decimate ) - { - plane_cbp = 0; - CLEAR_16x16_NNZ( p ); - } - else + if( i_decimate_mb >= 6 || !b_decimate ) { - for( int idx = 0; idx < 4; idx++ ) + h->mb.i_cbp_luma |= plane_cbp; + FOREACH_BIT( idx, 0, plane_cbp ) { - int x = idx&1; - int y = idx>>1; - - if( plane_cbp&(1<<idx) ) - { - h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp ); - h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] ); - STORE_8x8_NNZ( p, idx, 1 ); - } - else - STORE_8x8_NNZ( p, idx, 0 ); + h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp ); + h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*(idx&1) + 8*(idx>>1)*FDEC_STRIDE], dct8x8[idx] ); + STORE_8x8_NNZ( p, idx, 1 ); } } - h->mb.i_cbp_luma |= plane_cbp; - i_qp = h->mb.i_chroma_qp; } } else { - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] ); - for( int p = 0; p < plane_count; p++ ) + ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] ); + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { + CLEAR_16x16_NNZ( p ); h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] ); - h->nr_count[0+!!p*2] += h->mb.b_noise_reduction * 16; + + if( h->mb.b_noise_reduction ) + { + h->nr_count[0+!!p*2] += 16; + for( int idx = 0; idx < 16; idx++ ) + h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 ); + } int plane_cbp = 0; for( int i8x8 = 0; i8x8 < 4; i8x8++ ) { - int i_decimate_8x8 = 0; - int cbp = 0; - - /* encode one 4x4 block */ - for( int i4x4 = 0; i4x4 < 4; i4x4++ ) + int i_decimate_8x8 = b_decimate ? 
0 : 6; + int nnz8x8 = 0; + if( h->mb.b_trellis ) { - int idx = i8x8 * 4 + i4x4; - - nz = x264_quant_4x4( h, dct4x4[idx], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, idx ); - h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz; - + for( int i4x4 = 0; i4x4 < 4; i4x4++ ) + { + int idx = i8x8*4+i4x4; + if( x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, p*16+idx ) ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp ); + if( i_decimate_8x8 < 6 ) + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] ); + h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1; + nnz8x8 = 1; + } + } + } + else + { + nnz8x8 = nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); if( nz ) { - h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] ); - h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp ); - if( b_decimate && i_decimate_8x8 < 6 ) - i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] ); - cbp = 1; + FOREACH_BIT( idx, i8x8*4, nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp ); + if( i_decimate_8x8 < 6 ) + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] ); + h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1; + } } } - - int x = i8x8&1; - int y = i8x8>>1; - - /* decimate this 8x8 block */ - i_decimate_mb += i_decimate_8x8; - if( b_decimate ) + if( nnz8x8 ) { + i_decimate_mb += i_decimate_8x8; if( i_decimate_8x8 < 4 ) STORE_8x8_NNZ( p, i8x8, 0 ); else plane_cbp |= 1<<i8x8; } - else if( cbp ) - { - h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] ); - plane_cbp |= 1<<i8x8; - } } - if( b_decimate ) + if( i_decimate_mb < 6 ) { - if( i_decimate_mb < 6 ) - { - plane_cbp = 0; - CLEAR_16x16_NNZ( p ); - } - else + plane_cbp = 0; + CLEAR_16x16_NNZ( p ); + } + else + { + h->mb.i_cbp_luma |= plane_cbp; + FOREACH_BIT( i8x8, 0, plane_cbp ) { - for( int i8x8 = 0; i8x8 < 4; i8x8++ ) - if( plane_cbp&(1<<i8x8) ) - h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] ); + h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] ); } } - h->mb.i_cbp_luma |= plane_cbp; - i_qp = h->mb.i_chroma_qp; } } } @@ -933,12 +966,12 @@ *****************************************************************************/ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma ) { - ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] ); ALIGNED_ARRAY_16( dctcoef, dctscan,[16] ); ALIGNED_4( int16_t mvp[2] ); int i_qp = h->mb.i_qp; - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? 
CQM_4PC : CQM_4PY; if( !b_bidir ) @@ -957,23 +990,23 @@ { int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8; int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8; - /* get luma diff */ + h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset, h->mb.pic.p_fdec[p] + fdec_offset ); - /* encode one 4x4 block */ - for( int i4x4 = 0; i4x4 < 4; i4x4++ ) - { - if( h->mb.b_noise_reduction ) + + if( h->mb.b_noise_reduction ) + for( int i4x4 = 0; i4x4 < 4; i4x4++ ) h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 ); - if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ) ) - continue; - h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); + + int nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ); + FOREACH_BIT( idx, 0, nz ) + { + h->zigzagf.scan_4x4( dctscan, dct4x4[idx] ); i_decimate_mb += h->quantf.decimate_score16( dctscan ); if( i_decimate_mb >= 6 ) return 0; } } - i_qp = h->mb.i_chroma_qp; } if( chroma == CHROMA_420 || chroma == CHROMA_422 ) @@ -1023,6 +1056,7 @@ { h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 ); dct_dc[i4x4] = dct4x4[i4x4][0]; + dct4x4[i4x4][0] = 0; } } else @@ -1043,21 +1077,26 @@ continue; if( !h->mb.b_noise_reduction ) - for( int i = 0; i <= chroma422; i++ ) + for( int i = 0; i <= chroma422; i++ ) + { h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE ); + dct4x4[i*4+0][0] = 0; + dct4x4[i*4+1][0] = 0; + dct4x4[i*4+2][0] = 0; + dct4x4[i*4+3][0] = 0; + } /* calculate dct coeffs */ - for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < (chroma422?8:4); i4x4++ ) + for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < (chroma422?2:1); i8x8++ ) { - dct4x4[i4x4][0] = 0; - if( h->mb.b_noise_reduction ) - h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 ); - if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) ) - continue; - h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); - i_decimate_mb += h->quantf.decimate_score15( dctscan ); - if( i_decimate_mb >= 7 ) - return 0; + int nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); + FOREACH_BIT( idx, i8x8*4, nz ) + { + h->zigzagf.scan_4x4( dctscan, dct4x4[idx] ); + i_decimate_mb += h->quantf.decimate_score15( dctscan ); + if( i_decimate_mb >= 7 ) + return 0; + } } } } @@ -1176,12 +1215,13 @@ { if( h->mb.b_transform_8x8 ) { - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? CQM_8PC : CQM_8PY; pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE; - ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] ); + ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] ); + h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec ); int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 ); if( nnz8x8 ) @@ -1196,50 +1236,74 @@ h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp ); h->dctf.add8x8_idct8( p_fdec, dct8x8 ); STORE_8x8_NNZ( p, i8, 1 ); + h->mb.i_cbp_luma |= 1 << i8; } else STORE_8x8_NNZ( p, i8, 0 ); } else STORE_8x8_NNZ( p, i8, 0 ); - h->mb.i_cbp_luma |= nnz8x8 << i8; - i_qp = h->mb.i_chroma_qp; } } else { - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? 
CQM_4PC : CQM_4PY; pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE; - int i_decimate_8x8 = 0, nnz8x8 = 0; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] ); + int i_decimate_8x8 = b_decimate ? 0 : 4; + ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] ); + int nnz8x8 = 0; + h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); - for( int i4 = 0; i4 < 4; i4++ ) + STORE_8x8_NNZ( p, i8, 0 ); + + if( h->mb.b_noise_reduction ) + for( int idx = 0; idx < 4; idx++ ) + h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 ); + + if( h->mb.b_trellis ) { - nz = x264_quant_4x4( h, dct4x4[i4], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i8*4+i4 ); - h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4]] = nz; + for( int i4x4 = 0; i4x4 < 4; i4x4++ ) + { + if( x264_quant_4x4_trellis( h, dct4x4[i4x4], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, i8*4+i4x4+p*16 ) ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] ); + h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp ); + if( i_decimate_8x8 < 4 ) + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] ); + h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1; + nnz8x8 = 1; + } + } + } + else + { + nnz8x8 = nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ); if( nz ) { - h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4], dct4x4[i4] ); - h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[quant_cat], i_qp ); - if( b_decimate ) - i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4] ); - nnz8x8 = 1; + FOREACH_BIT( i4x4, 0, nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] ); + h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp ); + if( i_decimate_8x8 < 4 ) + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] ); + h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1; + } } } - - if( b_decimate && i_decimate_8x8 < 4 ) - nnz8x8 = 0; - if( nnz8x8 ) - h->dctf.add8x8_idct( p_fdec, dct4x4 ); - else - STORE_8x8_NNZ( p, i8, 0 ); - - h->mb.i_cbp_luma |= nnz8x8 << i8; - i_qp = h->mb.i_chroma_qp; + { + /* decimate this 8x8 block */ + if( i_decimate_8x8 < 4 ) + STORE_8x8_NNZ( p, i8, 0 ); + else + { + h->dctf.add8x8_idct( p_fdec, dct4x4 ); + h->mb.i_cbp_luma |= 1 << i8; + } + } } } @@ -1248,7 +1312,7 @@ i_qp = h->mb.i_chroma_qp; for( int ch = 0; ch < 2; ch++ ) { - ALIGNED_ARRAY_16( dctcoef, dct4x4,[2],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] ); pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE; @@ -1297,7 +1361,7 @@ { int i_qp = h->mb.i_qp; - for( int p = 0; p < plane_count; p++ ) + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? CQM_4PC : CQM_4PY; pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]]; @@ -1313,7 +1377,7 @@ } else { - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] ); h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 ); h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz; @@ -1324,7 +1388,6 @@ h->dctf.add4x4_idct( p_fdec, dct4x4 ); } } - i_qp = h->mb.i_chroma_qp; } }
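Several of the macroblock.c hunks above replace per-block quant_4x4 calls with quant_4x4x4, which quantizes four 4x4 sub-blocks in one call and reports which of them kept nonzero coefficients as a 4-bit mask; the callers then scan and dequantize only the surviving blocks. A toy sketch of that return convention (the quantizer here is a plain threshold, not x264's real quant):

#include <stdio.h>

/* Zero out small coefficients in four 4x4 blocks and return a 4-bit mask
 * with bit i set if block i still has any nonzero coefficient. */
static int toy_quant_4x4x4( int dct[4][16], int threshold )
{
    int nz_mask = 0;
    for( int b = 0; b < 4; b++ )
    {
        int nz = 0;
        for( int i = 0; i < 16; i++ )
        {
            dct[b][i] = ( dct[b][i] >= threshold || dct[b][i] <= -threshold ) ? dct[b][i] : 0;
            nz |= dct[b][i];
        }
        nz_mask |= (!!nz) << b;
    }
    return nz_mask;
}

int main( void )
{
    int dct[4][16] = { { 40 }, { 1 }, { 0 }, { -35 } };   /* remaining coefficients are zero */
    int nz = toy_quant_4x4x4( dct, 8 );
    printf( "nonzero sub-block mask: 0x%X\n", (unsigned)nz );   /* 0x9: blocks 0 and 3 survive */
    return 0;
}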
x264-snapshot-20130224-2245.tar.bz2/encoder/macroblock.h -> x264-snapshot-20130723-2245.tar.bz2/encoder/macroblock.h
Changed
@@ -104,12 +104,16 @@ M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+10]] ) = 0;\ } while(0) +/* A special for loop that iterates branchlessly over each set + * bit in a 4-bit input. */ +#define FOREACH_BIT(idx,start,mask) for( int idx = start, msk = mask, skip; msk && (skip = x264_ctz_4bit(msk), idx += skip, msk >>= skip+1, 1); idx++ ) + static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode, int b_predict ) { int nz; pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]]; pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]]; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] ); if( b_predict ) { @@ -147,7 +151,7 @@ int nz; pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE]; pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE]; - ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] ); + ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] ); ALIGNED_ARRAY_32( pixel, edge_buf,[36] ); if( b_predict )
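The FOREACH_BIT macro added above walks exactly the set bits of such a 4-bit mask, advancing by the count of trailing zeros of the remaining mask at each step. A standalone illustration, with ctz4 as a stand-in for x264's x264_ctz_4bit helper (whose implementation is not shown in this diff):

#include <stdio.h>

/* Trailing-zero count for a 4-bit value via a small table. */
static int ctz4( int mask )
{
    static const int tab[16] = { 0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
    return tab[mask & 15];
}

/* Same shape as the macro in the hunk above: skip ahead to the next set bit,
 * run the body, then consume that bit and continue. */
#define FOREACH_BIT(idx,start,mask) \
    for( int idx = start, msk = mask, skip; msk && (skip = ctz4(msk), idx += skip, msk >>= skip+1, 1); idx++ )

int main( void )
{
    int nz = 0xA;   /* e.g. a quant_4x4x4 result: sub-blocks 1 and 3 are nonzero */
    FOREACH_BIT( idx, 0, nz )
        printf( "process sub-block %d\n", idx );   /* prints 1, then 3 */
    return 0;
}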
x264-snapshot-20130224-2245.tar.bz2/encoder/me.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/me.c
Changed
@@ -61,21 +61,22 @@ (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2]) #define COST_MV( mx, my )\ +do\ {\ int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\ &p_fref_w[(my)*stride+(mx)], stride )\ + BITS_MVD(mx,my);\ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\ -} +} while(0) -#define COST_MV_HPEL( mx, my ) \ -{ \ - intptr_t stride2 = 16; \ - pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \ - int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \ - + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ - COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \ -} +#define COST_MV_HPEL( mx, my, cost )\ +do\ +{\ + intptr_t stride2 = 16;\ + pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] );\ + cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 )\ + + p_cost_mvx[ mx ] + p_cost_mvy[ my ];\ +} while(0) #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\ {\ @@ -174,6 +175,10 @@ }\ } +#define FPEL(mv) (((mv)+2)>>2) /* Convert subpel MV to fullpel with rounding... */ +#define SPEL(mv) ((mv)<<2) /* ... and the reverse. */ +#define SPELx2(mv) (SPEL(mv)&0xFFFCFFFC) /* for two packed MVs */ + void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh ) { const int bw = x264_pixel_size[m->i_pixel].w; @@ -181,97 +186,136 @@ const int i_pixel = m->i_pixel; const int stride = m->i_stride[0]; int i_me_range = h->param.analyse.i_me_range; - int bmx, bmy, bcost; - int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX; + int bmx, bmy, bcost = COST_MAX; + int bpred_cost = COST_MAX; int omx, omy, pmx, pmy; pixel *p_fenc = m->p_fenc[0]; pixel *p_fref_w = m->p_fref_w; - ALIGNED_ARRAY_16( pixel, pix,[16*16] ); - - int costs[16]; - - int mv_x_min = h->mb.mv_min_fpel[0]; - int mv_y_min = h->mb.mv_min_fpel[1]; - int mv_x_max = h->mb.mv_max_fpel[0]; - int mv_y_max = h->mb.mv_max_fpel[1]; - int mv_x_min_qpel = mv_x_min << 2; - int mv_y_min_qpel = mv_y_min << 2; - int mv_x_max_qpel = mv_x_max << 2; - int mv_y_max_qpel = mv_y_max << 2; + ALIGNED_ARRAY_N( pixel, pix,[16*16] ); + ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] ); + + ALIGNED_ARRAY_16( int, costs,[16] ); + + int mv_x_min = h->mb.mv_limit_fpel[0][0]; + int mv_y_min = h->mb.mv_limit_fpel[0][1]; + int mv_x_max = h->mb.mv_limit_fpel[1][0]; + int mv_y_max = h->mb.mv_limit_fpel[1][1]; /* Special version of pack to allow shortcuts in CHECK_MVRANGE */ #define pack16to32_mask2(mx,my) ((mx<<16)|(my&0x7FFF)) uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min ); uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000; + uint32_t pmv, bpred_mv = 0; #define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000)) const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; - uint32_t pmv; - bmx = x264_clip3( m->mvp[0], mv_x_min_qpel, mv_x_max_qpel ); - bmy = x264_clip3( m->mvp[1], mv_y_min_qpel, mv_y_max_qpel ); - pmx = ( bmx + 2 ) >> 2; - pmy = ( bmy + 2 ) >> 2; - bcost = COST_MAX; - - /* try extra predictors if provided */ + /* Try extra predictors if provided. If subme >= 3, check subpel predictors, + * otherwise round them to fullpel. 
*/ if( h->mb.i_subpel_refine >= 3 ) { - pmv = pack16to32_mask(bmx,bmy); - if( i_mvc ) - COST_MV_HPEL( bmx, bmy ); - for( int i = 0; i < i_mvc; i++ ) + /* Calculate and check the MVP first */ + int bpred_mx = x264_clip3( m->mvp[0], SPEL(mv_x_min), SPEL(mv_x_max) ); + int bpred_my = x264_clip3( m->mvp[1], SPEL(mv_y_min), SPEL(mv_y_max) ); + pmv = pack16to32_mask( bpred_mx, bpred_my ); + pmx = FPEL( bpred_mx ); + pmy = FPEL( bpred_my ); + + COST_MV_HPEL( bpred_mx, bpred_my, bpred_cost ); + int pmv_cost = bpred_cost; + + if( i_mvc > 0 ) { - if( M32( mvc[i] ) && (pmv != M32( mvc[i] )) ) + /* Clip MV candidates and eliminate those equal to zero and pmv. */ + int valid_mvcs = x264_predictor_clip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv ); + if( valid_mvcs > 0 ) { - int mx = x264_clip3( mvc[i][0], mv_x_min_qpel, mv_x_max_qpel ); - int my = x264_clip3( mvc[i][1], mv_y_min_qpel, mv_y_max_qpel ); - COST_MV_HPEL( mx, my ); + int i = 1, cost; + /* We stuff pmv here to branchlessly pick between pmv and the various + * MV candidates. [0] gets skipped in order to maintain alignment for + * x264_predictor_clip. */ + M32( mvc_temp[1] ) = pmv; + bpred_cost <<= 4; + do + { + int mx = mvc_temp[i+1][0]; + int my = mvc_temp[i+1][1]; + COST_MV_HPEL( mx, my, cost ); + COPY1_IF_LT( bpred_cost, (cost << 4) + i ); + } while( ++i <= valid_mvcs ); + bpred_mx = mvc_temp[(bpred_cost&15)+1][0]; + bpred_my = mvc_temp[(bpred_cost&15)+1][1]; + bpred_cost >>= 4; } } - bmx = ( bpred_mx + 2 ) >> 2; - bmy = ( bpred_my + 2 ) >> 2; - COST_MV( bmx, bmy ); + + /* Round the best predictor back to fullpel and get the cost, since this is where + * we'll be starting the fullpel motion search. */ + bmx = FPEL( bpred_mx ); + bmy = FPEL( bpred_my ); + bpred_mv = pack16to32_mask(bpred_mx, bpred_my); + if( bpred_mv&0x00030003 ) /* Only test if the tested predictor is actually subpel... */ + COST_MV( bmx, bmy ); + else /* Otherwise just copy the cost (we already know it) */ + bcost = bpred_cost; + + /* Test the zero vector if it hasn't been tested yet. */ + if( pmv ) + { + if( bmx|bmy ) COST_MV( 0, 0 ); + } + /* If a subpel mv candidate was better than the zero vector, the previous + * fullpel check won't have gotten it even if the pmv was zero. So handle + * that possibility here. */ + else + { + COPY3_IF_LT( bcost, pmv_cost, bmx, 0, bmy, 0 ); + } } else { - /* check the MVP */ - bmx = pmx; - bmy = pmy; + /* Calculate and check the fullpel MVP first */ + bmx = pmx = x264_clip3( FPEL(m->mvp[0]), mv_x_min, mv_x_max ); + bmy = pmy = x264_clip3( FPEL(m->mvp[1]), mv_y_min, mv_y_max ); + pmv = pack16to32_mask( bmx, bmy ); + /* Because we are rounding the predicted motion vector to fullpel, there will be * an extra MV cost in 15 out of 16 cases. However, when the predicted MV is * chosen as the best predictor, it is often the case that the subpel search will - * result in a vector at or next to the predicted motion vector. Therefore, it is - * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly - * biasing against use of the predicted motion vector. */ + * result in a vector at or next to the predicted motion vector. Therefore, we omit + * the cost of the MV from the rounded MVP to avoid unfairly biasing against use of + * the predicted motion vector. + * + * Disclaimer: this is a post-hoc rationalization for why this hack works. 
*/ bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride ); - pmv = pack16to32_mask( bmx, bmy ); + if( i_mvc > 0 ) { - ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16],[2] ); - x264_predictor_roundclip( mvc_fpel+2, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max ); - M32( mvc_fpel[1] ) = pmv; - bcost <<= 4; - for( int i = 1; i <= i_mvc; i++ ) + /* Like in subme>=3, except we also round the candidates to fullpel. */ + int valid_mvcs = x264_predictor_roundclip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv ); + if( valid_mvcs > 0 ) { - if( M32( mvc_fpel[i+1] ) && (pmv != M32( mvc_fpel[i+1] )) ) + int i = 1, cost; + M32( mvc_temp[1] ) = pmv; + bcost <<= 4; + do { - int mx = mvc_fpel[i+1][0]; - int my = mvc_fpel[i+1][1]; - int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my ); - cost = (cost << 4) + i; - COPY1_IF_LT( bcost, cost ); - } + int mx = mvc_temp[i+1][0]; + int my = mvc_temp[i+1][1]; + cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my ); + COPY1_IF_LT( bcost, (cost << 4) + i ); + } while( ++i <= valid_mvcs ); + bmx = mvc_temp[(bcost&15)+1][0]; + bmy = mvc_temp[(bcost&15)+1][1]; + bcost >>= 4; } - bmx = mvc_fpel[(bcost&15)+1][0]; - bmy = mvc_fpel[(bcost&15)+1][1]; - bcost >>= 4; } - } - if( pmv ) - COST_MV( 0, 0 ); + /* Same as above, except the condition is simpler. */ + if( pmv ) + COST_MV( 0, 0 ); + } switch( h->mb.i_me_method ) { @@ -358,19 +402,20 @@ bcost >>= 3; #endif /* square refine */ - int dir = 0; + bcost <<= 4; COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs ); - COPY2_IF_LT( bcost, costs[0], dir, 1 ); - COPY2_IF_LT( bcost, costs[1], dir, 2 ); - COPY2_IF_LT( bcost, costs[2], dir, 3 ); - COPY2_IF_LT( bcost, costs[3], dir, 4 ); + COPY1_IF_LT( bcost, (costs[0]<<4)+1 ); + COPY1_IF_LT( bcost, (costs[1]<<4)+2 ); + COPY1_IF_LT( bcost, (costs[2]<<4)+3 ); + COPY1_IF_LT( bcost, (costs[3]<<4)+4 ); COST_MV_X4_DIR( -1,-1, -1,1, 1,-1, 1,1, costs ); - COPY2_IF_LT( bcost, costs[0], dir, 5 ); - COPY2_IF_LT( bcost, costs[1], dir, 6 ); - COPY2_IF_LT( bcost, costs[2], dir, 7 ); - COPY2_IF_LT( bcost, costs[3], dir, 8 ); - bmx += square1[dir][0]; - bmy += square1[dir][1]; + COPY1_IF_LT( bcost, (costs[0]<<4)+5 ); + COPY1_IF_LT( bcost, (costs[1]<<4)+6 ); + COPY1_IF_LT( bcost, (costs[2]<<4)+7 ); + COPY1_IF_LT( bcost, (costs[3]<<4)+8 ); + bmx += square1[bcost&15][0]; + bmy += square1[bcost&15][1]; + bcost >>= 4; break; } @@ -609,7 +654,7 @@ if( h->mb.i_me_method == X264_ME_TESA ) { // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD - mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15) + 4); + mvsad_t *mvsads = (mvsad_t *)(xs + ((width+31)&~31) + 4); int nmvsad = 0, limit; int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12; int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride ) @@ -727,24 +772,22 @@ } /* -> qpel mv */ - if( bpred_cost < bcost ) + uint32_t bmv = pack16to32_mask(bmx,bmy); + uint32_t bmv_spel = SPELx2(bmv); + if( h->mb.i_subpel_refine < 3 ) { - m->mv[0] = bpred_mx; - m->mv[1] = bpred_my; - m->cost = bpred_cost; + m->cost_mv = p_cost_mvx[bmx<<2] + p_cost_mvy[bmy<<2]; + m->cost = bcost; + /* compute the real cost */ + if( bmv == pmv ) m->cost += m->cost_mv; + M32( m->mv ) = bmv_spel; } else { - m->mv[0] = bmx << 2; - m->mv[1] = bmy << 2; - m->cost = bcost; + M32(m->mv) = bpred_cost < bcost ? 
bpred_mv : bmv_spel; + m->cost = X264_MIN( bpred_cost, bcost ); } - /* compute the real cost */ - m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ]; - if( bmx == pmx && bmy == pmy && h->mb.i_subpel_refine < 3 ) - m->cost += m->cost_mv; - /* subpel refine */ if( h->mb.i_subpel_refine >= 2 ) { @@ -831,40 +874,52 @@ int chroma_v_shift = CHROMA_V_SHIFT; int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; - ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment + ALIGNED_ARRAY_N( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment + ALIGNED_ARRAY_16( int, costs,[4] ); int bmx = m->mv[0]; int bmy = m->mv[1]; int bcost = m->cost; int odir = -1, bdir; - /* try the subpel component of the predicted mv */ - if( hpel_iters && h->mb.i_subpel_refine < 3 ) - { - int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 ); - int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 ); - if( (mx-bmx)|(my-bmy) ) - COST_MV_SAD( mx, my ); - } - /* halfpel diamond search */ - for( int i = hpel_iters; i > 0; i-- ) + if( hpel_iters ) { - int omx = bmx, omy = bmy; - int costs[4]; - intptr_t stride = 64; // candidates are either all hpel or all qpel, so one stride is enough - pixel *src0, *src1, *src2, *src3; - src0 = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] ); - src2 = h->mc.get_ref( pix+32, &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] ); - src1 = src0 + stride; - src3 = src2 + 1; - h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs ); - COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx ] + p_cost_mvy[omy-2], bmy, omy-2 ); - COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx ] + p_cost_mvy[omy+2], bmy, omy+2 ); - COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-2] + p_cost_mvy[omy ], bmx, omx-2, bmy, omy ); - COPY3_IF_LT( bcost, costs[3] + p_cost_mvx[omx+2] + p_cost_mvy[omy ], bmx, omx+2, bmy, omy ); - if( (bmx == omx) & (bmy == omy) ) - break; + /* try the subpel component of the predicted mv */ + if( h->mb.i_subpel_refine < 3 ) + { + int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 ); + int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 ); + if( (mx-bmx)|(my-bmy) ) + COST_MV_SAD( mx, my ); + } + + bcost <<= 6; + for( int i = hpel_iters; i > 0; i-- ) + { + int omx = bmx, omy = bmy; + intptr_t stride = 64; // candidates are either all hpel or all qpel, so one stride is enough + pixel *src0, *src1, *src2, *src3; + src0 = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] ); + src2 = h->mc.get_ref( pix+32, &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] ); + src1 = src0 + stride; + src3 = src2 + 1; + h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs ); + costs[0] += p_cost_mvx[omx ] + p_cost_mvy[omy-2]; + costs[1] += p_cost_mvx[omx ] + p_cost_mvy[omy+2]; + costs[2] += p_cost_mvx[omx-2] + p_cost_mvy[omy ]; + costs[3] += p_cost_mvx[omx+2] + p_cost_mvy[omy ]; + COPY1_IF_LT( bcost, (costs[0]<<6)+2 ); + COPY1_IF_LT( bcost, (costs[1]<<6)+6 ); + COPY1_IF_LT( bcost, (costs[2]<<6)+16 ); + COPY1_IF_LT( bcost, (costs[3]<<6)+48 ); + if( !(bcost&63) ) + break; + bmx -= (bcost<<26)>>29; + bmy -= (bcost<<29)>>29; + bcost &= ~63; + } + bcost >>= 6; } if( !b_refine_qpel && (h->pixf.mbcmp_unaligned[0] != h->pixf.fpelcmp[0] || b_chroma_me) ) @@ -909,7 
+964,6 @@ /* Special simplified case for subme=1 */ else if( bmy > h->mb.mv_min_spel[1] && bmy < h->mb.mv_max_spel[1] && bmx > h->mb.mv_min_spel[0] && bmx < h->mb.mv_max_spel[0] ) { - int costs[4]; int omx = bmx, omy = bmy; /* We have to use mc_luma because all strides must be the same to use fpelcmp_x4 */ h->mc.mc_luma( pix , 64, m->p_fref, m->i_stride[0], omx, omy-1, bw, bh, &m->weight[0] ); @@ -917,10 +971,18 @@ h->mc.mc_luma( pix+32, 64, m->p_fref, m->i_stride[0], omx-1, omy, bw, bh, &m->weight[0] ); h->mc.mc_luma( pix+48, 64, m->p_fref, m->i_stride[0], omx+1, omy, bw, bh, &m->weight[0] ); h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], pix, pix+16, pix+32, pix+48, 64, costs ); - COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx ] + p_cost_mvy[omy-1], bmy, omy-1 ); - COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx ] + p_cost_mvy[omy+1], bmy, omy+1 ); - COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-1] + p_cost_mvy[omy ], bmx, omx-1, bmy, omy ); - COPY3_IF_LT( bcost, costs[3] + p_cost_mvx[omx+1] + p_cost_mvy[omy ], bmx, omx+1, bmy, omy ); + costs[0] += p_cost_mvx[omx ] + p_cost_mvy[omy-1]; + costs[1] += p_cost_mvx[omx ] + p_cost_mvy[omy+1]; + costs[2] += p_cost_mvx[omx-1] + p_cost_mvy[omy ]; + costs[3] += p_cost_mvx[omx+1] + p_cost_mvy[omy ]; + bcost <<= 4; + COPY1_IF_LT( bcost, (costs[0]<<4)+1 ); + COPY1_IF_LT( bcost, (costs[1]<<4)+3 ); + COPY1_IF_LT( bcost, (costs[2]<<4)+4 ); + COPY1_IF_LT( bcost, (costs[3]<<4)+12 ); + bmx -= (bcost<<28)>>30; + bmy -= (bcost<<30)>>30; + bcost >>= 4; } m->cost = bcost; @@ -971,9 +1033,9 @@ const int i_pixel = m0->i_pixel; const int bw = x264_pixel_size[i_pixel].w; const int bh = x264_pixel_size[i_pixel].h; - ALIGNED_ARRAY_16( pixel, pixy_buf,[2],[9][16*16] ); - ALIGNED_ARRAY_16( pixel, pixu_buf,[2],[9][16*16] ); - ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][16*16] ); + ALIGNED_ARRAY_N( pixel, pixy_buf,[2],[9][16*16] ); + ALIGNED_ARRAY_N( pixel, pixu_buf,[2],[9][16*16] ); + ALIGNED_ARRAY_N( pixel, pixv_buf,[2],[9][16*16] ); pixel *src[3][2][9]; int chromapix = h->luma2chroma_pixel[i_pixel]; int chroma_v_shift = CHROMA_V_SHIFT; @@ -996,7 +1058,7 @@ uint64_t bcostrd = COST_MAX64; uint16_t amvd; /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */ - ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] ); + ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] ); /* all permutations of an offset in up to 2 of the dimensions */ ALIGNED_4( static const int8_t dia4d[33][4] ) = {
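Much of the me.c rework above relies on packing a candidate index into the low bits of its cost so that a single COPY1_IF_LT comparison tracks both the best cost and which candidate produced it. A minimal sketch of the idiom using a plain comparison; it assumes costs are small enough that cost<<4 cannot overflow, which matches how x264 uses it:

#include <stdio.h>

int main( void )
{
    int costs[4] = { 180, 95, 210, 95 };
    int best = 1 << 28;                     /* "current best", already in packed form */

    for( int i = 0; i < 4; i++ )
    {
        int packed = (costs[i] << 4) + i;   /* low 4 bits carry the candidate index */
        if( packed < best )
            best = packed;                  /* one compare updates cost and index together */
    }

    /* Ties resolve to the earlier candidate, like COPY1_IF_LT's strict "<". */
    printf( "best cost %d from candidate %d\n", best >> 4, best & 15 );
    return 0;
}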
x264-snapshot-20130224-2245.tar.bz2/encoder/ratecontrol.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/ratecontrol.c
Changed
@@ -1118,7 +1118,8 @@ total_qp_aq += qp_aq; p = next; } - h->pps->i_pic_init_qp = SPEC_QP( (int)(total_qp_aq / rc->num_entries + 0.5) ); + if( !h->param.b_stitchable ) + h->pps->i_pic_init_qp = SPEC_QP( (int)(total_qp_aq / rc->num_entries + 0.5) ); x264_free( stats_buf ); @@ -1667,7 +1668,8 @@ rc->qpm = x264_clip3f( (prev_row_qp + rc->qpm)*0.5f, prev_row_qp + 1.0f, qp_max ); rc->qpa_rc = rc->qpa_rc_prev; rc->qpa_aq = rc->qpa_aq_prev; - h->fdec->i_row_bits[y] = h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; + h->fdec->i_row_bits[y] = 0; + h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; return -1; } } @@ -1683,7 +1685,8 @@ rc->qpm = qp_max; rc->qpa_rc = rc->qpa_rc_prev; rc->qpa_aq = rc->qpa_aq_prev; - h->fdec->i_row_bits[y] = h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; + h->fdec->i_row_bits[y] = 0; + h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; return -1; } } @@ -2591,14 +2594,16 @@ if( h->i_frame == 0 ) for( int i = 0; i < h->param.i_threads; i++ ) { - x264_ratecontrol_t *t = h->thread[i]->rc; - memcpy( t->row_preds, rc->row_preds, sizeof(rc->row_preds) ); + x264_t *t = h->thread[i]; + if( t != h ) + memcpy( t->rc->row_preds, rc->row_preds, sizeof(rc->row_preds) ); } for( int i = 0; i < h->param.i_threads; i++ ) { x264_t *t = h->thread[i]; - memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) ); + if( t != h ) + memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) ); t->rc->row_pred = &t->rc->row_preds[h->sh.i_type]; /* Calculate the planned slice size. */ if( rc->b_vbv && rc->frame_size_planned )
x264-snapshot-20130224-2245.tar.bz2/encoder/rdo.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/rdo.c
Changed
@@ -634,13 +634,13 @@ const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac, int b_chroma, int dc, int num_coefs, int idx ) { - ALIGNED_ARRAY_16( dctcoef, orig_coefs, [64] ); - ALIGNED_ARRAY_16( dctcoef, quant_coefs, [64] ); + ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] ); + ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] ); const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab; const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; const int b_interlaced = MB_INTERLACED; - uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; - uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; + uint8_t *cabac_state_sig = &h->cabac.state[ x264_significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; + uint8_t *cabac_state_last = &h->cabac.state[ x264_last_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; int levelgt1_ctx = b_chroma && dc ? 8 : 9; if( dc ) @@ -683,7 +683,7 @@ } int last_nnz = h->quantf.coeff_last[ctx_block_cat]( quant_coefs+b_ac )+b_ac; - uint8_t *cabac_state = &h->cabac.state[ coeff_abs_level_m1_offset[ctx_block_cat] ]; + uint8_t *cabac_state = &h->cabac.state[ x264_coeff_abs_level_m1_offset[ctx_block_cat] ]; /* shortcut for dc-only blocks. * this doesn't affect the output, but saves some unnecessary computation. */ @@ -1161,5 +1161,6 @@ h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz; nzaccum |= nz; } + STORE_8x8_NNZ( 0, idx, 0 ); return nzaccum; }
x264-snapshot-20130224-2245.tar.bz2/encoder/set.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/set.c
Changed
@@ -208,9 +208,9 @@ ( csp >= X264_CSP_BGR ? 1 : 0 ) ); sps->vui.b_color_description_present = 0; - sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 8 ? param->vui.i_colorprim : 2 ); - sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 10 ? param->vui.i_transfer : 2 ); - sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 8 ? param->vui.i_colmatrix : + sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 9 ? param->vui.i_colorprim : 2 ); + sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 15 ? param->vui.i_transfer : 2 ); + sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 10 ? param->vui.i_colmatrix : ( csp >= X264_CSP_BGR ? 0 : 2 ) ); if( sps->vui.i_colorprim != 2 || sps->vui.i_transfer != 2 || @@ -430,7 +430,7 @@ pps->b_weighted_pred = param->analyse.i_weighted_pred > 0; pps->b_weighted_bipred = param->analyse.b_weighted_bipred ? 2 : 0; - pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR ? 26 + QP_BD_OFFSET : SPEC_QP( param->rc.i_qp_constant ); + pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR || param->b_stitchable ? 26 + QP_BD_OFFSET : SPEC_QP( param->rc.i_qp_constant ); pps->i_pic_init_qs = 26 + QP_BD_OFFSET; pps->i_chroma_qp_index_offset = param->analyse.i_chroma_qp_offset;
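The set.c hunk widens the accepted VUI ranges (colorprim up to 9, transfer up to 15, colmatrix up to 10) while keeping the same fallback: anything outside the range is replaced by 2, the code for "unspecified". A small sketch of that pattern with a hypothetical helper name:

#include <stdio.h>

/* Accept a user-supplied VUI code only if it falls inside the allowed range,
 * otherwise fall back to 2 ("unspecified"). The bounds used below are the
 * ones this diff raises the limits to. */
static int vui_or_unspecified( int value, int max_code )
{
    return ( value >= 0 && value <= max_code ) ? value : 2;
}

int main( void )
{
    printf( "colorprim 9  -> %d\n", vui_or_unspecified( 9, 9 ) );    /* kept (BT.2020 primaries) */
    printf( "colmatrix 12 -> %d\n", vui_or_unspecified( 12, 10 ) );  /* out of range -> unspecified */
    return 0;
}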
x264-snapshot-20130723-2245.tar.bz2/encoder/slicetype-cl.c
Added
@@ -0,0 +1,780 @@ +/***************************************************************************** + * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead) + ***************************************************************************** + * Copyright (C) 2012-2013 x264 project + * + * Authors: Steve Borho <sborho@multicorewareinc.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macroblock.h" +#include "me.h" + +#if HAVE_OPENCL +#ifdef _WIN32 +#include <windows.h> +#endif + +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ); + +/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined + * in the OpenCL headers shipped with NVIDIA drivers. We need to be + * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */ +#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E + +#define OCLCHECK( method, ... 
)\ +do\ +{\ + if( h->opencl.b_fatal_error )\ + return -1;\ + status = ocl->method( __VA_ARGS__ );\ + if( status != CL_SUCCESS ) {\ + h->param.b_opencl = 0;\ + h->opencl.b_fatal_error = 1;\ + x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\ + return -1;\ + }\ +} while( 0 ) + +void x264_opencl_flush( x264_t *h ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + + ocl->clFinish( h->opencl.queue ); + + /* Finish copies from the GPU by copying from the page-locked buffer to + * their final destination */ + for( int i = 0; i < h->opencl.num_copies; i++ ) + memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes ); + h->opencl.num_copies = 0; + h->opencl.pl_occupancy = 0; +} + +static void *x264_opencl_alloc_locked( x264_t *h, int bytes ) +{ + if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE ) + x264_opencl_flush( h ); + assert( bytes < PAGE_LOCKED_BUF_SIZE ); + char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy; + h->opencl.pl_occupancy += bytes; + return ptr; +} + +int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda ) +{ + if( fenc->b_intra_calculated ) + return 0; + fenc->b_intra_calculated = 1; + + x264_opencl_function_t *ocl = h->opencl.ocl; + int luma_length = fenc->i_stride[0] * fenc->i_lines[0]; + +#define CREATEBUF( out, flags, size )\ + out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\ + if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; } +#define CREATEIMAGE( out, flags, pf, width, height )\ + out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\ + if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; } + + int mb_count = h->mb.i_mb_count; + cl_int status; + + if( !h->opencl.lowres_mv_costs ) + { + /* Allocate shared memory buffers */ + int width = h->mb.i_mb_width * 8 * sizeof(pixel); + int height = h->mb.i_mb_height * 8 * sizeof(pixel); + + cl_image_format pixel_format; + pixel_format.image_channel_order = CL_R; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT32; + CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height ); + + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + { + pixel_format.image_channel_order = CL_RGBA; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT8; + CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height ); + width >>= 1; + height >>= 1; + } + + CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); + CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); + CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); + CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); + CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); + CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); + CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) ); + CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) ); + CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) ); + CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) ); + CREATEBUF( h->opencl.luma_16x16_image[0], 
CL_MEM_READ_ONLY, luma_length ); + CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length ); + } + + if( !fenc->opencl.intra_cost ) + { + /* Allocate per-frame buffers */ + int width = h->mb.i_mb_width * 8 * sizeof(pixel); + int height = h->mb.i_mb_height * 8 * sizeof(pixel); + + cl_image_format pixel_format; + pixel_format.image_channel_order = CL_R; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT32; + CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height ); + + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + { + pixel_format.image_channel_order = CL_RGBA; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT8; + CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height ); + width >>= 1; + height >>= 1; + } + CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) ); + CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) ); + CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) ); + CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) ); + CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) ); + CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) ); + } +#undef CREATEBUF +#undef CREATEIMAGE + + /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */ + + char *locked = x264_opencl_alloc_locked( h, luma_length ); + memcpy( locked, fenc->plane[0], luma_length ); + OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL ); + + size_t gdim[2]; + if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor ) + { + int size = h->mb.i_mb_count * sizeof(int16_t); + locked = x264_opencl_alloc_locked( h, size ); + memcpy( locked, fenc->i_inv_qscale_factor, size ); + OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL ); + } + else + { + /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */ + cl_uint arg = 0; + int16_t value = 256; + OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor ); + OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value ); + gdim[0] = h->mb.i_mb_count; + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL ); + } + + int stride = fenc->i_stride[0]; + cl_uint arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride ); + gdim[0] = 8 * h->mb.i_mb_width; + gdim[1] = 8 * h->mb.i_mb_height; + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL ); + + for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ ) + { + /* Workaround for AMD Southern Island: + * + * Alternate kernel instances. 
No perf impact to this, so we do it for + * all GPUs. It prevents the same kernel from being enqueued + * back-to-back, avoiding a dependency calculation bug in the driver. + */ + cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2; + + arg = 0; + OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] ); + OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] ); + gdim[0] >>= 1; + gdim[1] >>= 1; + if( gdim[0] < 16 || gdim[1] < 16 ) + break; + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL ); + } + + size_t ldim[2]; + gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5; + gdim[1] = 8*h->mb.i_mb_height; + ldim[0] = 32; + ldim[1] = 8; + arg = 0; + + /* For presets slow, slower, and placebo, check all 10 intra modes that the + * C lookahead supports. For faster presets, only check the most frequent 8 + * modes + */ + int slow = h->param.analyse.i_subpel_refine > 7; + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost ); + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda ); + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width ); + OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow ); + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL ); + + gdim[0] = 256; + gdim[1] = h->mb.i_mb_height; + ldim[0] = 256; + ldim[1] = 1; + arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width ); + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL ); + + if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 ) + x264_opencl_flush( h ); + + int size = h->mb.i_mb_count * sizeof(int16_t); + locked = x264_opencl_alloc_locked( h, size ); + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL ); + h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0]; + h->opencl.copies[h->opencl.num_copies].src = locked; + h->opencl.copies[h->opencl.num_copies].bytes = size; + h->opencl.num_copies++; + + size = h->mb.i_mb_height * sizeof(int); + locked = x264_opencl_alloc_locked( h, size ); + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); + h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0]; + h->opencl.copies[h->opencl.num_copies].src = locked; + h->opencl.copies[h->opencl.num_copies].bytes = size; + h->opencl.num_copies++; + + size = sizeof(int) * 4; + locked = x264_opencl_alloc_locked( h, size ); + OCLCHECK( 
clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0]; + h->opencl.copies[h->opencl.num_copies].src = locked; + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); + h->opencl.num_copies++; + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0]; + h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int); + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); + h->opencl.num_copies++; + + h->opencl.last_buf = !h->opencl.last_buf; + return 0; +} + +/* This function was tested emprically on a number of AMD and NV GPUs. Making a + * function which returns perfect launch dimensions is impossible; some + * applications will have self-tuning code to try many possible variables and + * measure the runtime. Here we simply make an educated guess based on what we + * know GPUs typically prefer. */ +static void x264_optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + size_t max_work_group = 256; /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */ + size_t preferred_multiple = 64; + cl_uint num_cus = 6; + + ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL ); + ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL ); + ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL ); + + ldims[0] = preferred_multiple; + ldims[1] = 8; + + /* make ldims[1] an even divisor of gdims[1] */ + while( gdims[1] & (ldims[1] - 1) ) + { + ldims[0] <<= 1; + ldims[1] >>= 1; + } + /* make total ldims fit under the max work-group dimensions for the device */ + while( ldims[0] * ldims[1] > max_work_group ) + { + if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) ) + ldims[1] >>= 1; + else + ldims[0] >>= 1; + } + + if( ldims[0] > gdims[0] ) + { + /* remove preferred multiples until we're close to gdims[0] */ + while( gdims[0] + preferred_multiple < ldims[0] ) + ldims[0] -= preferred_multiple; + gdims[0] = ldims[0]; + } + else + { + /* make gdims an even multiple of ldims */ + gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0]; + gdims[0] *= ldims[0]; + } + + /* make ldims smaller to spread work across compute units */ + while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus ) + { + if( ldims[0] > preferred_multiple ) + ldims[0] >>= 1; + else if( ldims[1] > 1 ) + ldims[1] >>= 1; + else + break; + } + /* for smaller GPUs, try not to abuse their texture cache */ + if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 ) + ldims[0] = 32; +} + +int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + x264_frame_t *fenc = frames[b]; + x264_frame_t *fref = frames[ref]; + + cl_mem ref_scaled_images[NUM_IMAGE_SCALES]; + cl_mem ref_luma_hpel; + cl_int status; + + if( w && w->weightfn ) + { + size_t gdims[2]; + + gdims[0] = 8 * h->mb.i_mb_width; + gdims[1] = 8 * h->mb.i_mb_height; + + /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */ + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + { + cl_uint arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, 
sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom ); + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL ); + + gdims[0] >>= 1; + gdims[1] >>= 1; + if( gdims[0] < 16 || gdims[1] < 16 ) + break; + } + + cl_uint arg = 0; + gdims[0] = 8 * h->mb.i_mb_width; + gdims[1] = 8 * h->mb.i_mb_height; + + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale ); + OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom ); + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL ); + + /* Use weighted reference planes for motion search */ + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + ref_scaled_images[i] = h->opencl.weighted_scaled_images[i]; + ref_luma_hpel = h->opencl.weighted_luma_hpel; + } + else + { + /* Use unweighted reference planes for motion search */ + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i]; + ref_luma_hpel = fref->opencl.luma_hpel; + } + + const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 }; + int b_first_iteration = 1; + int b_reverse_references = 1; + int A = 1; + + + int mb_per_group = 0; + int cost_local_size = 0; + int mvc_local_size = 0; + int mb_width; + + size_t gdims[2]; + size_t ldims[2]; + + /* scale 0 is 8x8 */ + for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- ) + { + mb_width = h->mb.i_mb_width >> scale; + gdims[0] = mb_width; + gdims[1] = h->mb.i_mb_height >> scale; + if( gdims[0] < 2 || gdims[1] < 2 ) + continue; + gdims[0] <<= 2; + x264_optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device ); + + mb_per_group = (ldims[0] >> 2) * ldims[1]; + cost_local_size = 4 * mb_per_group * sizeof(int16_t); + mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2; + int scaled_me_range = h->param.analyse.i_me_range >> scale; + int b_shift_index = 1; + + cl_uint arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 
arg++, mvc_local_size, NULL ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references ); + + for( int iter = 0; iter < num_iterations[scale]; iter++ ) + { + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL ); + + b_shift_index = 0; + b_first_iteration = 0; + + /* alternate top-left vs bot-right MB references at lower scales, so + * motion field smooths more quickly. */ + if( scale > 2 ) + b_reverse_references ^= 1; + else + b_reverse_references = 0; + A = !A; + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration ); + OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references ); + } + } + + int satd_local_size = mb_per_group * sizeof(uint32_t) * 16; + cl_uint arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL ); + + if( b_islist1 ) + { + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 ); + } + else + { + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 ); + } + + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref ); + OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 ); + + if( h->opencl.b_device_AMD_SI ) + { + /* workaround for AMD Southern Island driver scheduling bug (fixed in + * July 2012), perform meaningless small copy to add a data dependency */ + OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, 
h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL ); + } + + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL ); + + int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count; + + if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 ) + x264_opencl_flush( h ); + + char *locked = x264_opencl_alloc_locked( h, mvlen ); + h->opencl.copies[h->opencl.num_copies].src = locked; + h->opencl.copies[h->opencl.num_copies].bytes = mvlen; + + if( b_islist1 ) + { + int mvs_offset = mvlen * (ref - b - 1); + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL ); + h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1]; + } + else + { + int mvs_offset = mvlen * (b - ref - 1); + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL ); + h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1]; + } + + h->opencl.num_copies++; + + return 0; +} + +int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_int status; + x264_frame_t *fenc = frames[b]; + x264_frame_t *fref0 = frames[p0]; + x264_frame_t *fref1 = frames[p1]; + + int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32; + + /* Tasks for this kernel: + * 1. Select least cost mode (intra, ref0, ref1) + * list_used 0, 1, 2, or 3. if B frame, do not allow intra + * 2. if B frame, try bidir predictions. + * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */ + size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height }; + size_t ldim_bidir[2]; + size_t *ldims = NULL; + int cost_local_size = 4; + int satd_local_size = 4; + if( b < p1 ) + { + /* For B frames, use 4 threads per MB for BIDIR checks */ + ldims = ldim_bidir; + gdims[0] <<= 2; + x264_optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device ); + int mb_per_group = (ldims[0] >> 2) * ldims[1]; + cost_local_size = 4 * mb_per_group * sizeof(int16_t); + satd_local_size = 16 * mb_per_group * sizeof(uint32_t); + } + + cl_uint arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, 
arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 ); + OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda ); + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL ); + + /* Sum costs across rows, atomicAdd down frame */ + size_t gdim[2] = { 256, h->mb.i_mb_height }; + size_t ldim[2] = { 256, 1 }; + + arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 ); + OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 ); + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL ); + + if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 ) + x264_opencl_flush( h ); + + int size = h->mb.i_mb_count * sizeof(int16_t); + char *locked = x264_opencl_alloc_locked( h, size ); + h->opencl.copies[h->opencl.num_copies].src = locked; + h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b]; + h->opencl.copies[h->opencl.num_copies].bytes = size; + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); + h->opencl.num_copies++; + + size = h->mb.i_mb_height * sizeof(int); + locked = x264_opencl_alloc_locked( h, size ); + h->opencl.copies[h->opencl.num_copies].src = locked; + h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b]; + h->opencl.copies[h->opencl.num_copies].bytes = size; + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); + h->opencl.num_copies++; + + size = 4 * sizeof(int); + locked = x264_opencl_alloc_locked( h, size ); + OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); + h->opencl.last_buf = !h->opencl.last_buf; + + h->opencl.copies[h->opencl.num_copies].src = locked; + 
h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b]; + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); + h->opencl.num_copies++; + h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int); + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b]; + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); + h->opencl.num_copies++; + + if( b == p1 ) // P frames only + { + h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int); + h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0]; + h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); + h->opencl.num_copies++; + } + return 0; +} + +void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda ) +{ + if( h->param.b_opencl ) + { +#ifdef _WIN32 + /* Temporarily boost priority of this lookahead thread and the OpenCL + * driver's thread until the end of this function. On AMD GPUs this + * greatly reduces the latency of enqueuing kernels and getting results + * on Windows. */ + HANDLE id = GetCurrentThread(); + h->opencl.lookahead_thread_pri = GetThreadPriority( id ); + SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL ); + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL ); + if( status == CL_SUCCESS ) + { + h->opencl.opencl_thread_pri = GetThreadPriority( id ); + SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL ); + } +#endif + + /* precalculate intra and I frames */ + for( int i = 0; i <= num_frames; i++ ) + x264_opencl_lowres_init( h, frames[i], lambda ); + x264_opencl_flush( h ); + + if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe ) + { + /* For trellis B-Adapt, precompute exhaustive motion searches */ + for( int b = 0; b <= num_frames; b++ ) + { + for( int j = 1; j < h->param.i_bframe; j++ ) + { + int p0 = b - j; + if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF ) + { + const x264_weight_t *w = x264_weight_none; + + if( h->param.analyse.i_weighted_pred ) + { + x264_emms(); + x264_weights_analyse( h, frames[b], frames[p0], 1 ); + w = frames[b]->weight[0]; + } + frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0; + x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w ); + } + int p1 = b + j; + if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF ) + { + frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0; + x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL ); + } + } + } + + x264_opencl_flush( h ); + } + } +} + + +void x264_opencl_slicetype_end( x264_t *h ) +{ +#ifdef _WIN32 + if( h->param.b_opencl ) + { + HANDLE id = GetCurrentThread(); + SetThreadPriority( id, h->opencl.lookahead_thread_pri ); + x264_opencl_function_t *ocl = h->opencl.ocl; + cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL ); + if( status == CL_SUCCESS ) + SetThreadPriority( id, h->opencl.opencl_thread_pri ); + } +#endif +} + +int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b ) +{ + if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) ) + return 0; + else + { + int do_search[2]; + int dist_scale_factor = 128; + const x264_weight_t *w = x264_weight_none; + + // avoid duplicating work + frames[b]->i_cost_est[b-p0][p1-b] = 0; + + do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF; + 
do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF; + if( do_search[0] ) + { + if( h->param.analyse.i_weighted_pred && b == p1 ) + { + x264_emms(); + x264_weights_analyse( h, frames[b], frames[p0], 1 ); + w = frames[b]->weight[0]; + } + frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0; + } + if( do_search[1] ) + frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0; + if( b == p1 ) + frames[b]->i_intra_mbs[b-p0] = 0; + if( p1 != p0 ) + dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0); + + frames[b]->i_cost_est[b-p0][p1-b] = 0; + frames[b]->i_cost_est_aq[b-p0][p1-b] = 0; + + x264_opencl_lowres_init( h, frames[b], lambda ); + + if( do_search[0] ) + { + x264_opencl_lowres_init( h, frames[p0], lambda ); + x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w ); + } + if( do_search[1] ) + { + x264_opencl_lowres_init( h, frames[p1], lambda ); + x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL ); + } + x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor ); + return 1; + } +} + +#endif
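The OpenCL lookahead code above follows one consistent pattern for getting results back from the GPU: x264_opencl_alloc_locked() carves a slice out of a page-locked staging buffer (flushing first if it would overflow), the asynchronous clEnqueueReadBuffer() calls target that slice, and a (dest, src, bytes) triple is queued so that x264_opencl_flush() can clFinish() the command queue and only then memcpy the results to their final destinations. The standalone C sketch below shows just that bookkeeping; the copy_queue_* names, the buffer size and the pending-copy limit are illustrative stand-ins rather than x264 API, and the clFinish() call is reduced to a comment.

/* Sketch of the deferred-copy bookkeeping used by the lookahead above.
 * Allocate copy_queue_t statically or on the heap; the staging array stands
 * in for the page-locked buffer the real code gets from the OpenCL runtime. */
#include <assert.h>
#include <string.h>

#define STAGING_SIZE (4 * 1024 * 1024)
#define MAX_PENDING  32

typedef struct { void *dest; const void *src; size_t bytes; } pending_copy_t;

typedef struct
{
    char           staging[STAGING_SIZE];
    size_t         occupancy;
    pending_copy_t copies[MAX_PENDING];
    int            num_copies;
} copy_queue_t;

/* Analogue of x264_opencl_flush(): wait for the GPU, then replay the copies. */
static void copy_queue_flush( copy_queue_t *q )
{
    /* ocl->clFinish( queue ) would go here in the real code */
    for( int i = 0; i < q->num_copies; i++ )
        memcpy( q->copies[i].dest, q->copies[i].src, q->copies[i].bytes );
    q->num_copies = 0;
    q->occupancy  = 0;
}

/* Analogue of x264_opencl_alloc_locked(): hand out a slice of the staging
 * buffer, flushing first if the request would not fit. */
static void *copy_queue_alloc( copy_queue_t *q, size_t bytes )
{
    if( q->occupancy + bytes >= STAGING_SIZE )
        copy_queue_flush( q );
    assert( bytes < STAGING_SIZE );
    void *ptr = q->staging + q->occupancy;
    q->occupancy += bytes;
    return ptr;
}

/* Record where the result of a just-enqueued async read must finally land. */
static void copy_queue_push( copy_queue_t *q, void *dest, const void *src, size_t bytes )
{
    if( q->num_copies >= MAX_PENDING )
        copy_queue_flush( q );
    q->copies[q->num_copies++] = (pending_copy_t){ dest, src, bytes };
}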
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/slicetype.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/slicetype.c
Changed
@@ -36,6 +36,18 @@ x264_frame_t **frames, int p0, int p1, int b, int b_intra_penalty ); +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ); + +#if HAVE_OPENCL +int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda ); +int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w ); +int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor ); +int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b ); +void x264_opencl_flush( x264_t *h ); +void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda ); +void x264_opencl_slicetype_end( x264_t *h ); +#endif + static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a ) { a->i_qp = X264_LOOKAHEAD_QP; @@ -60,7 +72,7 @@ w->i_offset = offset; w->i_denom = 7; w->i_scale = weight_nonh264; - while( w->i_denom > 0 && (w->i_scale > 127 || !(w->i_scale & 1)) ) + while( w->i_denom > 0 && (w->i_scale > 127) ) { w->i_denom--; w->i_scale >>= 1; @@ -276,7 +288,7 @@ return cost; } -static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ) +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ) { int i_delta_index = fenc->i_frame - ref->i_frame - 1; /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ @@ -286,21 +298,40 @@ SET_WEIGHT( weights[1], 0, 1, 0, 0 ); SET_WEIGHT( weights[2], 0, 1, 0, 0 ); int chroma_initted = 0; + float guess_scale[3]; + float fenc_mean[3]; + float ref_mean[3]; + for( int plane = 0; plane <= 2*!b_lookahead; plane++ ) + { + float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; + float ref_var = ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; + guess_scale[plane] = sqrtf( fenc_var / ref_var ); + fenc_mean[plane] = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); + ref_mean[plane] = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); + } + + int chroma_denom = 7; + if( !b_lookahead ) + { + /* make sure both our scale factors fit */ + while( chroma_denom > 0 ) + { + float thresh = 127.f / (1<<chroma_denom); + if( guess_scale[1] < thresh && guess_scale[2] < thresh ) + break; + chroma_denom--; + } + } + /* Don't check chroma in lookahead, or if there wasn't a luma weight. 
*/ for( int plane = 0; plane <= 2 && !( plane && ( !weights[0].weightfn || b_lookahead ) ); plane++ ) { - int cur_offset, start_offset, end_offset; int minoff, minscale, mindenom; unsigned int minscore, origscore; int found; - float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; - float ref_var = ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; - float guess_scale = sqrtf( fenc_var / ref_var ); - float fenc_mean = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); - float ref_mean = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); //early termination - if( fabsf( ref_mean - fenc_mean ) < 0.5f && fabsf( 1.f - guess_scale ) < epsilon ) + if( fabsf( ref_mean[plane] - fenc_mean[plane] ) < 0.5f && fabsf( 1.f - guess_scale[plane] ) < epsilon ) { SET_WEIGHT( weights[plane], 0, 1, 0, 0 ); continue; @@ -308,8 +339,8 @@ if( plane ) { - weights[plane].i_denom = 6; - weights[plane].i_scale = x264_clip3( round( guess_scale * 64 ), 0, 255 ); + weights[plane].i_denom = chroma_denom; + weights[plane].i_scale = x264_clip3( round( guess_scale[plane] * (1<<chroma_denom) ), 0, 255 ); if( weights[plane].i_scale > 127 ) { weights[1].weightfn = weights[2].weightfn = NULL; @@ -317,7 +348,7 @@ } } else - x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[plane] ); + x264_weight_get_h264( round( guess_scale[plane] * 128 ), 0, &weights[plane] ); found = 0; mindenom = weights[plane].i_denom; @@ -357,33 +388,65 @@ if( !minscore ) continue; - // This gives a slight improvement due to rounding errors but only tests one offset in lookahead. - // Currently only searches within +/- 1 of the best offset found so far. - // TODO: Try other offsets/multipliers/combinations thereof? - cur_offset = fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f * b_lookahead; - start_offset = x264_clip3( cur_offset - !b_lookahead, -128, 127 ); - end_offset = x264_clip3( cur_offset + !b_lookahead, -128, 127 ); - for( int i_off = start_offset; i_off <= end_offset; i_off++ ) + /* Picked somewhat arbitrarily */ + static const uint8_t weight_check_distance[][2] = + { + {0,0},{0,0},{0,1},{0,1}, + {0,1},{0,1},{0,1},{1,1}, + {1,1},{2,1},{2,1},{4,2} + }; + int scale_dist = b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][0]; + int offset_dist = b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][1]; + + int start_scale = x264_clip3( minscale - scale_dist, 0, 127 ); + int end_scale = x264_clip3( minscale + scale_dist, 0, 127 ); + for( int i_scale = start_scale; i_scale <= end_scale; i_scale++ ) { - SET_WEIGHT( weights[plane], 1, minscale, mindenom, i_off ); - unsigned int s; - if( plane ) + int cur_scale = i_scale; + int cur_offset = fenc_mean[plane] - ref_mean[plane] * cur_scale / (1 << mindenom) + 0.5f * b_lookahead; + if( cur_offset < - 128 || cur_offset > 127 ) { - if( CHROMA444 ) - s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane ); - else - s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] ); + /* Rescale considering the constraints on cur_offset. We do it in this order + * because scale has a much wider range than offset (because of denom), so + * it should almost never need to be clamped. 
*/ + cur_offset = x264_clip3( cur_offset, -128, 127 ); + cur_scale = (1 << mindenom) * (fenc_mean[plane] - cur_offset) / ref_mean[plane] + 0.5f; + cur_scale = x264_clip3( cur_scale, 0, 127 ); } - else - s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] ); - COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 ); + int start_offset = x264_clip3( cur_offset - offset_dist, -128, 127 ); + int end_offset = x264_clip3( cur_offset + offset_dist, -128, 127 ); + for( int i_off = start_offset; i_off <= end_offset; i_off++ ) + { + SET_WEIGHT( weights[plane], 1, cur_scale, mindenom, i_off ); + unsigned int s; + if( plane ) + { + if( CHROMA444 ) + s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane ); + else + s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] ); + } + else + s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] ); + COPY4_IF_LT( minscore, s, minscale, cur_scale, minoff, i_off, found, 1 ); - // Don't check any more offsets if the previous one had a lower cost than the current one - if( minoff == start_offset && i_off != start_offset ) - break; + // Don't check any more offsets if the previous one had a lower cost than the current one + if( minoff == start_offset && i_off != start_offset ) + break; + } } x264_emms(); + /* Use a smaller denominator if possible */ + if( !plane ) + { + while( mindenom > 0 && !(minscale&1) ) + { + mindenom--; + minscale >>= 1; + } + } + /* FIXME: More analysis can be done here on SAD vs. SATD termination. */ /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */ if( !found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f ) @@ -398,18 +461,29 @@ fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore; } - //FIXME, what is the correct way to deal with this? - if( weights[1].weightfn && weights[2].weightfn && weights[1].i_denom != weights[2].i_denom ) + /* Optimize and unify denominator */ + if( weights[1].weightfn || weights[2].weightfn ) { - int denom = X264_MIN( weights[1].i_denom, weights[2].i_denom ); - int i; - for( i = 1; i <= 2; i++ ) + int denom = weights[1].weightfn ? weights[1].i_denom : weights[2].i_denom; + int both_weighted = weights[1].weightfn && weights[2].weightfn; + /* If only one plane is weighted, the other has an implicit scale of 1<<denom. + * With denom==7, this comes out to 128, which is invalid, so don't allow that. 
*/ + while( (!both_weighted && denom==7) || + (denom > 0 && !(weights[1].weightfn && (weights[1].i_scale&1)) + && !(weights[2].weightfn && (weights[2].i_scale&1))) ) { - weights[i].i_scale = x264_clip3( weights[i].i_scale >> ( weights[i].i_denom - denom ), 0, 255 ); - weights[i].i_denom = denom; - h->mc.weight_cache( h, &weights[i] ); + denom--; + for( int i = 1; i <= 2; i++ ) + if( weights[i].weightfn ) + { + weights[i].i_scale >>= 1; + weights[i].i_denom = denom; + } } } + for( int i = 1; i <= 2; i++ ) + if( weights[i].weightfn ) + h->mc.weight_cache( h, &weights[i] ); if( weights[0].weightfn && b_lookahead ) { @@ -472,16 +546,16 @@ goto lowres_intra_mb; // no need for h->mb.mv_min[] - h->mb.mv_min_fpel[0] = -8*h->mb.i_mb_x - 4; - h->mb.mv_max_fpel[0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4; - h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 8 ); - h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 8 ); + h->mb.mv_limit_fpel[0][0] = -8*h->mb.i_mb_x - 4; + h->mb.mv_limit_fpel[1][0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4; + h->mb.mv_min_spel[0] = 4*( h->mb.mv_limit_fpel[0][0] - 8 ); + h->mb.mv_max_spel[0] = 4*( h->mb.mv_limit_fpel[1][0] + 8 ); if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 ) { - h->mb.mv_min_fpel[1] = -8*h->mb.i_mb_y - 4; - h->mb.mv_max_fpel[1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4; - h->mb.mv_min_spel[1] = 4*( h->mb.mv_min_fpel[1] - 8 ); - h->mb.mv_max_spel[1] = 4*( h->mb.mv_max_fpel[1] + 8 ); + h->mb.mv_limit_fpel[0][1] = -8*h->mb.i_mb_y - 4; + h->mb.mv_limit_fpel[1][1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4; + h->mb.mv_min_spel[1] = 4*( h->mb.mv_limit_fpel[0][1] - 8 ); + h->mb.mv_max_spel[1] = 4*( h->mb.mv_limit_fpel[1][1] + 8 ); } #define LOAD_HPELS_LUMA(dst, src) \ @@ -633,15 +707,16 @@ if( !fenc->b_intra_calculated ) { ALIGNED_ARRAY_16( pixel, edge,[36] ); - pixel *pix = &pix1[8+FDEC_STRIDE - 1]; - pixel *src = &fenc->lowres[0][i_pel_offset - 1]; + pixel *pix = &pix1[8+FDEC_STRIDE]; + pixel *src = &fenc->lowres[0][i_pel_offset]; const int intra_penalty = 5 * a->i_lambda; int satds[3]; + int pixoff = 4 / sizeof(pixel); - memcpy( pix-FDEC_STRIDE, src-i_stride, 17 * sizeof(pixel) ); - for( int i = 0; i < 8; i++ ) - pix[i*FDEC_STRIDE] = src[i*i_stride]; - pix++; + /* Avoid store forwarding stalls by writing larger chunks */ + memcpy( pix-FDEC_STRIDE, src-i_stride, 16 * sizeof(pixel) ); + for( int i = -1; i < 8; i++ ) + M32( &pix[i*FDEC_STRIDE-pixoff] ) = M32( &src[i*i_stride-pixoff] ); h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds ); int i_icost = X264_MIN3( satds[0], satds[1], satds[2] ); @@ -793,96 +868,120 @@ output_inter[0] = h->scratch_buffer2; output_intra[0] = output_inter[0] + output_buf_size; - if( h->param.i_lookahead_threads > 1 ) +#if HAVE_OPENCL + if( h->param.b_opencl ) { - x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX]; + x264_opencl_lowres_init(h, fenc, a->i_lambda ); + if( do_search[0] ) + { + x264_opencl_lowres_init( h, frames[p0], a->i_lambda ); + x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w ); + } + if( do_search[1] ) + { + x264_opencl_lowres_init( h, frames[p1], a->i_lambda ); + x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, NULL ); + } + if( b != p0 ) + x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor ); + x264_opencl_flush( h ); - for( int i = 0; i < h->param.i_lookahead_threads; i++ ) + i_score = fenc->i_cost_est[b-p0][p1-b]; + } + else +#endif + { + if( h->param.i_lookahead_threads > 1 ) { - x264_t *t = h->lookahead_thread[i]; + 
x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX]; - /* FIXME move this somewhere else */ - t->mb.i_me_method = h->mb.i_me_method; - t->mb.i_subpel_refine = h->mb.i_subpel_refine; - t->mb.b_chroma_me = h->mb.b_chroma_me; + for( int i = 0; i < h->param.i_lookahead_threads; i++ ) + { + x264_t *t = h->lookahead_thread[i]; - s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w, - output_inter[i], output_intra[i] }; + /* FIXME move this somewhere else */ + t->mb.i_me_method = h->mb.i_me_method; + t->mb.i_subpel_refine = h->mb.i_subpel_refine; + t->mb.b_chroma_me = h->mb.b_chroma_me; - t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads); - t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads); + s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w, + output_inter[i], output_intra[i] }; - int thread_height = t->i_threadslice_end - t->i_threadslice_start; - int thread_output_size = thread_height + NUM_INTS; - memset( output_inter[i], 0, thread_output_size * sizeof(int) ); - memset( output_intra[i], 0, thread_output_size * sizeof(int) ); - output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height; + t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads); + t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads); - output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE; - output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE; + int thread_height = t->i_threadslice_end - t->i_threadslice_start; + int thread_output_size = thread_height + NUM_INTS; + memset( output_inter[i], 0, thread_output_size * sizeof(int) ); + memset( output_intra[i], 0, thread_output_size * sizeof(int) ); + output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height; - x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] ); - } - for( int i = 0; i < h->param.i_lookahead_threads; i++ ) - x264_threadpool_wait( h->lookaheadpool, &s[i] ); - } - else - { - h->i_threadslice_start = 0; - h->i_threadslice_end = h->mb.i_mb_height; - memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) ); - memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) ); - output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height; - x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w, - output_inter[0], output_intra[0] }; - x264_slicetype_slice_cost( &s ); - } + output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE; + output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE; - /* Sum up accumulators */ - if( b == p1 ) - fenc->i_intra_mbs[b-p0] = 0; - if( !fenc->b_intra_calculated ) - { - fenc->i_cost_est[0][0] = 0; - fenc->i_cost_est_aq[0][0] = 0; - } - fenc->i_cost_est[b-p0][p1-b] = 0; - fenc->i_cost_est_aq[b-p0][p1-b] = 0; + x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] ); + } + for( int i = 0; i < h->param.i_lookahead_threads; i++ ) + x264_threadpool_wait( h->lookaheadpool, &s[i] ); + } + else + { + h->i_threadslice_start = 0; + h->i_threadslice_end = h->mb.i_mb_height; + memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) ); + memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * 
sizeof(int) ); + output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height; + x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w, + output_inter[0], output_intra[0] }; + x264_slicetype_slice_cost( &s ); + } - int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b]; - int *row_satd_intra = fenc->i_row_satds[0][0]; - for( int i = 0; i < h->param.i_lookahead_threads; i++ ) - { + /* Sum up accumulators */ if( b == p1 ) - fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS]; + fenc->i_intra_mbs[b-p0] = 0; if( !fenc->b_intra_calculated ) { - fenc->i_cost_est[0][0] += output_intra[i][COST_EST]; - fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ]; + fenc->i_cost_est[0][0] = 0; + fenc->i_cost_est_aq[0][0] = 0; } + fenc->i_cost_est[b-p0][p1-b] = 0; + fenc->i_cost_est_aq[b-p0][p1-b] = 0; - fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST]; - fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ]; - - if( h->param.rc.i_vbv_buffer_size ) + int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b]; + int *row_satd_intra = fenc->i_row_satds[0][0]; + for( int i = 0; i < h->param.i_lookahead_threads; i++ ) { - int row_count = output_inter[i][NUM_ROWS]; - memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) ); + if( b == p1 ) + fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS]; if( !fenc->b_intra_calculated ) - memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) ); - row_satd_inter += row_count; - row_satd_intra += row_count; + { + fenc->i_cost_est[0][0] += output_intra[i][COST_EST]; + fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ]; + } + + fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST]; + fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ]; + + if( h->param.rc.i_vbv_buffer_size ) + { + int row_count = output_inter[i][NUM_ROWS]; + memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) ); + if( !fenc->b_intra_calculated ) + memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) ); + row_satd_inter += row_count; + row_satd_intra += row_count; + } } - } - i_score = fenc->i_cost_est[b-p0][p1-b]; - if( b != p1 ) - i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias); - else - fenc->b_intra_calculated = 1; + i_score = fenc->i_cost_est[b-p0][p1-b]; + if( b != p1 ) + i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias); + else + fenc->b_intra_calculated = 1; - fenc->i_cost_est[b-p0][p1-b] = i_score; - x264_emms(); + fenc->i_cost_est[b-p0][p1-b] = i_score; + x264_emms(); + } } if( b_intra_penalty ) @@ -1393,7 +1492,7 @@ return scenecut_internal( h, a, frames, p0, p1, real_scenecut ); } -void x264_slicetype_analyse( x264_t *h, int keyframe ) +void x264_slicetype_analyse( x264_t *h, int intra_minigop ) { x264_mb_analysis_t a; x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, }; @@ -1402,8 +1501,13 @@ int cost1p0, cost2p0, cost1b1, cost2p1; int i_max_search = X264_MIN( h->lookahead->next.i_size, X264_LOOKAHEAD_MAX ); int vbv_lookahead = h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead; + /* For determinism we should limit the search to the number of frames lookahead has for sure + * in h->lookahead->next.list buffer, except at the end of stream. + * For normal calls with (intra_minigop == 0) that is h->lookahead->i_slicetype_length + 1 frames. + * And for I-frame calls (intra_minigop != 0) we already removed intra_minigop frames from there. 
*/ if( h->param.b_deterministic ) - i_max_search = X264_MIN( i_max_search, h->lookahead->i_slicetype_length + !keyframe ); + i_max_search = X264_MIN( i_max_search, h->lookahead->i_slicetype_length + 1 - intra_minigop ); + int keyframe = !!intra_minigop; assert( h->frames.b_have_lowres ); @@ -1448,6 +1552,10 @@ return; } +#if HAVE_OPENCL + x264_opencl_slicetype_prep( h, frames, num_frames, a.i_lambda ); +#endif + if( h->param.i_bframe ) { if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS ) @@ -1481,6 +1589,18 @@ continue; } +#if HAVE_OPENCL + if( h->param.b_opencl ) + { + int b_work_done = 0; + b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+2, i+1 ); + b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+1, i+1 ); + b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+1, i+2, i+2 ); + if( b_work_done ) + x264_opencl_flush( h ); + } +#endif + cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 ); cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 ); cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 ); @@ -1563,6 +1683,10 @@ /* Restore frametypes for all frames that haven't actually been decided yet. */ for( int j = reset_start; j <= num_frames; j++ ) frames[j]->i_type = X264_TYPE_AUTO; + +#if HAVE_OPENCL + x264_opencl_slicetype_end( h ); +#endif } void x264_slicetype_decide( x264_t *h )
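The reworked x264_weights_analyse() above derives a per-plane scale guess from the SSD ratio, back-computes the offset from the plane means, and, because the H.264 offset range [-128, 127] is much tighter than the scale range, clamps the offset first and only then re-derives the scale. The short standalone C illustration below mirrors that order of operations; guess_weight() and clip3() are hypothetical helpers for illustration only, it assumes ref_mean > 0, and it ignores the refinement search over nearby scales and offsets that the patch adds.

/* Illustration of the (scale, offset) derivation and clamping order used in
 * the weighted-prediction analysis above. Compile with -lm. */
#include <math.h>

static int clip3( int v, int lo, int hi ) { return v < lo ? lo : v > hi ? hi : v; }

typedef struct { int scale, offset, denom; } weight_guess_t;

static weight_guess_t guess_weight( double fenc_ssd, double ref_ssd,
                                    double fenc_mean, double ref_mean, int denom )
{
    weight_guess_t w = { 0, 0, denom };
    /* scale ~ sqrt of the SSD ratio; the "+ !ref_ssd" term avoids a division
     * by zero, as in the patch */
    double guess_scale = sqrt( (fenc_ssd + !ref_ssd) / (ref_ssd + !ref_ssd) );
    w.scale  = clip3( (int)( guess_scale * (1 << denom) + 0.5 ), 0, 127 );
    w.offset = (int)( fenc_mean - ref_mean * w.scale / (1 << denom) + 0.5 );
    if( w.offset < -128 || w.offset > 127 )
    {
        /* Offset has far less headroom than scale, so clamp it first and
         * re-derive the scale from the clamped offset. */
        w.offset = clip3( w.offset, -128, 127 );
        w.scale  = clip3( (int)( (1 << denom) * (fenc_mean - w.offset) / ref_mean + 0.5 ), 0, 127 );
    }
    return w;
}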
View file
x264-snapshot-20130723-2245.tar.bz2/extras/avxsynth_c.h
Added
@@ -0,0 +1,727 @@ +// Avisynth C Interface Version 0.20 +// Copyright 2003 Kevin Atkinson + +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit +// http://www.gnu.org/copyleft/gpl.html . +// +// As a special exception, I give you permission to link to the +// Avisynth C interface with independent modules that communicate with +// the Avisynth C interface solely through the interfaces defined in +// avisynth_c.h, regardless of the license terms of these independent +// modules, and to copy and distribute the resulting combined work +// under terms of your choice, provided that every copy of the +// combined work is accompanied by a complete copy of the source code +// of the Avisynth C interface and Avisynth itself (with the version +// used to produce the combined work), being distributed under the +// terms of the GNU General Public License plus this exception. An +// independent module is a module which is not derived from or based +// on Avisynth C Interface, such as 3rd-party filters, import and +// export plugins, or graphical user interfaces. + +#ifndef __AVXSYNTH_C__ +#define __AVXSYNTH_C__ + +#include "windowsPorts/windows2linux.h" +#include <stdarg.h> + +#ifdef __cplusplus +# define EXTERN_C extern "C" +#else +# define EXTERN_C +#endif + +#define AVSC_USE_STDCALL 1 + +#ifndef AVSC_USE_STDCALL +# define AVSC_CC __cdecl +#else +# define AVSC_CC __stdcall +#endif + +#define AVSC_INLINE static __inline + +#ifdef AVISYNTH_C_EXPORTS +# define AVSC_EXPORT EXTERN_C +# define AVSC_API(ret, name) EXTERN_C __declspec(dllexport) ret AVSC_CC name +#else +# define AVSC_EXPORT EXTERN_C __declspec(dllexport) +# ifndef AVSC_NO_DECLSPEC +# define AVSC_API(ret, name) EXTERN_C __declspec(dllimport) ret AVSC_CC name +# else +# define AVSC_API(ret, name) typedef ret (AVSC_CC *name##_func) +# endif +#endif + +#ifdef __GNUC__ +typedef long long int INT64; +#else +typedef __int64 INT64; +#endif + + +///////////////////////////////////////////////////////////////////// +// +// Constants +// + +#ifndef __AVXSYNTH_H__ +enum { AVISYNTH_INTERFACE_VERSION = 3 }; +#endif + +enum {AVS_SAMPLE_INT8 = 1<<0, + AVS_SAMPLE_INT16 = 1<<1, + AVS_SAMPLE_INT24 = 1<<2, + AVS_SAMPLE_INT32 = 1<<3, + AVS_SAMPLE_FLOAT = 1<<4}; + +enum {AVS_PLANAR_Y=1<<0, + AVS_PLANAR_U=1<<1, + AVS_PLANAR_V=1<<2, + AVS_PLANAR_ALIGNED=1<<3, + AVS_PLANAR_Y_ALIGNED=AVS_PLANAR_Y|AVS_PLANAR_ALIGNED, + AVS_PLANAR_U_ALIGNED=AVS_PLANAR_U|AVS_PLANAR_ALIGNED, + AVS_PLANAR_V_ALIGNED=AVS_PLANAR_V|AVS_PLANAR_ALIGNED}; + + // Colorspace properties. 
+enum {AVS_CS_BGR = 1<<28, + AVS_CS_YUV = 1<<29, + AVS_CS_INTERLEAVED = 1<<30, + AVS_CS_PLANAR = 1<<31}; + + // Specific colorformats +enum { + AVS_CS_UNKNOWN = 0, + AVS_CS_BGR24 = 1<<0 | AVS_CS_BGR | AVS_CS_INTERLEAVED, + AVS_CS_BGR32 = 1<<1 | AVS_CS_BGR | AVS_CS_INTERLEAVED, + AVS_CS_YUY2 = 1<<2 | AVS_CS_YUV | AVS_CS_INTERLEAVED, + AVS_CS_YV12 = 1<<3 | AVS_CS_YUV | AVS_CS_PLANAR, // y-v-u, planar + AVS_CS_I420 = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR, // y-u-v, planar + AVS_CS_IYUV = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR // same as above +}; + +enum { + AVS_IT_BFF = 1<<0, + AVS_IT_TFF = 1<<1, + AVS_IT_FIELDBASED = 1<<2}; + +enum { + AVS_FILTER_TYPE=1, + AVS_FILTER_INPUT_COLORSPACE=2, + AVS_FILTER_OUTPUT_TYPE=9, + AVS_FILTER_NAME=4, + AVS_FILTER_AUTHOR=5, + AVS_FILTER_VERSION=6, + AVS_FILTER_ARGS=7, + AVS_FILTER_ARGS_INFO=8, + AVS_FILTER_ARGS_DESCRIPTION=10, + AVS_FILTER_DESCRIPTION=11}; + +enum { //SUBTYPES + AVS_FILTER_TYPE_AUDIO=1, + AVS_FILTER_TYPE_VIDEO=2, + AVS_FILTER_OUTPUT_TYPE_SAME=3, + AVS_FILTER_OUTPUT_TYPE_DIFFERENT=4}; + +enum { + AVS_CACHE_NOTHING=0, + AVS_CACHE_RANGE=1, + AVS_CACHE_ALL=2, + AVS_CACHE_AUDIO=3, + AVS_CACHE_AUDIO_NONE=4, + AVS_CACHE_AUDIO_AUTO=5 +}; + +#define AVS_FRAME_ALIGN 16 + +typedef struct AVS_Clip AVS_Clip; +typedef struct AVS_ScriptEnvironment AVS_ScriptEnvironment; + +///////////////////////////////////////////////////////////////////// +// +// AVS_VideoInfo +// + +// AVS_VideoInfo is layed out identicly to VideoInfo +typedef struct AVS_VideoInfo { + int width, height; // width=0 means no video + unsigned fps_numerator, fps_denominator; + int num_frames; + + int pixel_type; + + int audio_samples_per_second; // 0 means no audio + int sample_type; + INT64 num_audio_samples; + int nchannels; + + // Imagetype properties + + int image_type; +} AVS_VideoInfo; + +// useful functions of the above +AVSC_INLINE int avs_has_video(const AVS_VideoInfo * p) + { return (p->width!=0); } + +AVSC_INLINE int avs_has_audio(const AVS_VideoInfo * p) + { return (p->audio_samples_per_second!=0); } + +AVSC_INLINE int avs_is_rgb(const AVS_VideoInfo * p) + { return !!(p->pixel_type&AVS_CS_BGR); } + +AVSC_INLINE int avs_is_rgb24(const AVS_VideoInfo * p) + { return (p->pixel_type&AVS_CS_BGR24)==AVS_CS_BGR24; } // Clear out additional properties + +AVSC_INLINE int avs_is_rgb32(const AVS_VideoInfo * p) + { return (p->pixel_type & AVS_CS_BGR32) == AVS_CS_BGR32 ; } + +AVSC_INLINE int avs_is_yuv(const AVS_VideoInfo * p) + { return !!(p->pixel_type&AVS_CS_YUV ); } + +AVSC_INLINE int avs_is_yuy2(const AVS_VideoInfo * p) + { return (p->pixel_type & AVS_CS_YUY2) == AVS_CS_YUY2; } + +AVSC_INLINE int avs_is_yv12(const AVS_VideoInfo * p) + { return ((p->pixel_type & AVS_CS_YV12) == AVS_CS_YV12)||((p->pixel_type & AVS_CS_I420) == AVS_CS_I420); } + +AVSC_INLINE int avs_is_color_space(const AVS_VideoInfo * p, int c_space) + { return ((p->pixel_type & c_space) == c_space); } + +AVSC_INLINE int avs_is_property(const AVS_VideoInfo * p, int property) + { return ((p->pixel_type & property)==property ); } + +AVSC_INLINE int avs_is_planar(const AVS_VideoInfo * p) + { return !!(p->pixel_type & AVS_CS_PLANAR); } + +AVSC_INLINE int avs_is_field_based(const AVS_VideoInfo * p) + { return !!(p->image_type & AVS_IT_FIELDBASED); } + +AVSC_INLINE int avs_is_parity_known(const AVS_VideoInfo * p) + { return ((p->image_type & AVS_IT_FIELDBASED)&&(p->image_type & (AVS_IT_BFF | AVS_IT_TFF))); } + +AVSC_INLINE int avs_is_bff(const AVS_VideoInfo * p) + { return !!(p->image_type & AVS_IT_BFF); } + +AVSC_INLINE int 
avs_is_tff(const AVS_VideoInfo * p) + { return !!(p->image_type & AVS_IT_TFF); } + +AVSC_INLINE int avs_bits_per_pixel(const AVS_VideoInfo * p) +{ + switch (p->pixel_type) { + case AVS_CS_BGR24: return 24; + case AVS_CS_BGR32: return 32; + case AVS_CS_YUY2: return 16; + case AVS_CS_YV12: + case AVS_CS_I420: return 12; + default: return 0; + } +} +AVSC_INLINE int avs_bytes_from_pixels(const AVS_VideoInfo * p, int pixels) + { return pixels * (avs_bits_per_pixel(p)>>3); } // Will work on planar images, but will return only luma planes + +AVSC_INLINE int avs_row_size(const AVS_VideoInfo * p) + { return avs_bytes_from_pixels(p,p->width); } // Also only returns first plane on planar images + +AVSC_INLINE int avs_bmp_size(const AVS_VideoInfo * vi) + { if (avs_is_planar(vi)) {int p = vi->height * ((avs_row_size(vi)+3) & ~3); p+=p>>1; return p; } return vi->height * ((avs_row_size(vi)+3) & ~3); } + +AVSC_INLINE int avs_samples_per_second(const AVS_VideoInfo * p) + { return p->audio_samples_per_second; } + + +AVSC_INLINE int avs_bytes_per_channel_sample(const AVS_VideoInfo * p) +{ + switch (p->sample_type) { + case AVS_SAMPLE_INT8: return sizeof(signed char); + case AVS_SAMPLE_INT16: return sizeof(signed short); + case AVS_SAMPLE_INT24: return 3; + case AVS_SAMPLE_INT32: return sizeof(signed int); + case AVS_SAMPLE_FLOAT: return sizeof(float); + default: return 0; + } +} +AVSC_INLINE int avs_bytes_per_audio_sample(const AVS_VideoInfo * p) + { return p->nchannels*avs_bytes_per_channel_sample(p);} + +AVSC_INLINE INT64 avs_audio_samples_from_frames(const AVS_VideoInfo * p, INT64 frames) + { return ((INT64)(frames) * p->audio_samples_per_second * p->fps_denominator / p->fps_numerator); } + +AVSC_INLINE int avs_frames_from_audio_samples(const AVS_VideoInfo * p, INT64 samples) + { return (int)(samples * (INT64)p->fps_numerator / (INT64)p->fps_denominator / (INT64)p->audio_samples_per_second); } + +AVSC_INLINE INT64 avs_audio_samples_from_bytes(const AVS_VideoInfo * p, INT64 bytes) + { return bytes / avs_bytes_per_audio_sample(p); } + +AVSC_INLINE INT64 avs_bytes_from_audio_samples(const AVS_VideoInfo * p, INT64 samples) + { return samples * avs_bytes_per_audio_sample(p); } + +AVSC_INLINE int avs_audio_channels(const AVS_VideoInfo * p) + { return p->nchannels; } + +AVSC_INLINE int avs_sample_type(const AVS_VideoInfo * p) + { return p->sample_type;} + +// useful mutator +AVSC_INLINE void avs_set_property(AVS_VideoInfo * p, int property) + { p->image_type|=property; } + +AVSC_INLINE void avs_clear_property(AVS_VideoInfo * p, int property) + { p->image_type&=~property; } + +AVSC_INLINE void avs_set_field_based(AVS_VideoInfo * p, int isfieldbased) + { if (isfieldbased) p->image_type|=AVS_IT_FIELDBASED; else p->image_type&=~AVS_IT_FIELDBASED; } + +AVSC_INLINE void avs_set_fps(AVS_VideoInfo * p, unsigned numerator, unsigned denominator) +{ + unsigned x=numerator, y=denominator; + while (y) { // find gcd + unsigned t = x%y; x = y; y = t; + } + p->fps_numerator = numerator/x; + p->fps_denominator = denominator/x; +} + +AVSC_INLINE int avs_is_same_colorspace(AVS_VideoInfo * x, AVS_VideoInfo * y) +{ + return (x->pixel_type == y->pixel_type) + || (avs_is_yv12(x) && avs_is_yv12(y)); +} + +///////////////////////////////////////////////////////////////////// +// +// AVS_VideoFrame +// + +// VideoFrameBuffer holds information about a memory block which is used +// for video data. 
For efficiency, instances of this class are not deleted +// when the refcount reaches zero; instead they're stored in a linked list +// to be reused. The instances are deleted when the corresponding AVS +// file is closed. + +// AVS_VideoFrameBuffer is layed out identicly to VideoFrameBuffer +// DO NOT USE THIS STRUCTURE DIRECTLY +typedef struct AVS_VideoFrameBuffer { + unsigned char * data; + int data_size; + // sequence_number is incremented every time the buffer is changed, so + // that stale views can tell they're no longer valid. + long sequence_number; + + long refcount; +} AVS_VideoFrameBuffer; + +// VideoFrame holds a "window" into a VideoFrameBuffer. + +// AVS_VideoFrame is layed out identicly to IVideoFrame +// DO NOT USE THIS STRUCTURE DIRECTLY +typedef struct AVS_VideoFrame { + int refcount; + AVS_VideoFrameBuffer * vfb; + int offset, pitch, row_size, height, offsetU, offsetV, pitchUV; // U&V offsets are from top of picture. +} AVS_VideoFrame; + +// Access functions for AVS_VideoFrame +AVSC_INLINE int avs_get_pitch(const AVS_VideoFrame * p) { + return p->pitch;} + +AVSC_INLINE int avs_get_pitch_p(const AVS_VideoFrame * p, int plane) { + switch (plane) { + case AVS_PLANAR_U: case AVS_PLANAR_V: return p->pitchUV;} + return p->pitch;} + +AVSC_INLINE int avs_get_row_size(const AVS_VideoFrame * p) { + return p->row_size; } + +AVSC_INLINE int avs_get_row_size_p(const AVS_VideoFrame * p, int plane) { + int r; + switch (plane) { + case AVS_PLANAR_U: case AVS_PLANAR_V: + if (p->pitchUV) return p->row_size>>1; + else return 0; + case AVS_PLANAR_U_ALIGNED: case AVS_PLANAR_V_ALIGNED: + if (p->pitchUV) { + r = ((p->row_size+AVS_FRAME_ALIGN-1)&(~(AVS_FRAME_ALIGN-1)) )>>1; // Aligned rowsize + if (r < p->pitchUV) + return r; + return p->row_size>>1; + } else return 0; + case AVS_PLANAR_Y_ALIGNED: + r = (p->row_size+AVS_FRAME_ALIGN-1)&(~(AVS_FRAME_ALIGN-1)); // Aligned rowsize + if (r <= p->pitch) + return r; + return p->row_size; + } + return p->row_size; +} + +AVSC_INLINE int avs_get_height(const AVS_VideoFrame * p) { + return p->height;} + +AVSC_INLINE int avs_get_height_p(const AVS_VideoFrame * p, int plane) { + switch (plane) { + case AVS_PLANAR_U: case AVS_PLANAR_V: + if (p->pitchUV) return p->height>>1; + return 0; + } + return p->height;} + +AVSC_INLINE const unsigned char* avs_get_read_ptr(const AVS_VideoFrame * p) { + return p->vfb->data + p->offset;} + +AVSC_INLINE const unsigned char* avs_get_read_ptr_p(const AVS_VideoFrame * p, int plane) +{ + switch (plane) { + case AVS_PLANAR_U: return p->vfb->data + p->offsetU; + case AVS_PLANAR_V: return p->vfb->data + p->offsetV; + default: return p->vfb->data + p->offset;} +} + +AVSC_INLINE int avs_is_writable(const AVS_VideoFrame * p) { + return (p->refcount == 1 && p->vfb->refcount == 1);} + +AVSC_INLINE unsigned char* avs_get_write_ptr(const AVS_VideoFrame * p) +{ + if (avs_is_writable(p)) { + ++p->vfb->sequence_number; + return p->vfb->data + p->offset; + } else + return 0; +} + +AVSC_INLINE unsigned char* avs_get_write_ptr_p(const AVS_VideoFrame * p, int plane) +{ + if (plane==AVS_PLANAR_Y && avs_is_writable(p)) { + ++p->vfb->sequence_number; + return p->vfb->data + p->offset; + } else if (plane==AVS_PLANAR_Y) { + return 0; + } else { + switch (plane) { + case AVS_PLANAR_U: return p->vfb->data + p->offsetU; + case AVS_PLANAR_V: return p->vfb->data + p->offsetV; + default: return p->vfb->data + p->offset; + } + } +} + +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(void, avs_release_video_frame)(AVS_VideoFrame 
*); +// makes a shallow copy of a video frame +AVSC_API(AVS_VideoFrame *, avs_copy_video_frame)(AVS_VideoFrame *); +#if defined __cplusplus +} +#endif // __cplusplus + +#ifndef AVSC_NO_DECLSPEC +AVSC_INLINE void avs_release_frame(AVS_VideoFrame * f) + {avs_release_video_frame(f);} +AVSC_INLINE AVS_VideoFrame * avs_copy_frame(AVS_VideoFrame * f) + {return avs_copy_video_frame(f);} +#endif + +///////////////////////////////////////////////////////////////////// +// +// AVS_Value +// + +// Treat AVS_Value as a fat pointer. That is use avs_copy_value +// and avs_release_value appropiaty as you would if AVS_Value was +// a pointer. + +// To maintain source code compatibility with future versions of the +// avisynth_c API don't use the AVS_Value directly. Use the helper +// functions below. + +// AVS_Value is layed out identicly to AVSValue +typedef struct AVS_Value AVS_Value; +struct AVS_Value { + short type; // 'a'rray, 'c'lip, 'b'ool, 'i'nt, 'f'loat, 's'tring, 'v'oid, or 'l'ong + // for some function e'rror + short array_size; + union { + void * clip; // do not use directly, use avs_take_clip + char boolean; + int integer; + INT64 integer64; // match addition of __int64 to avxplugin.h + float floating_pt; + const char * string; + const AVS_Value * array; + } d; +}; + +// AVS_Value should be initilized with avs_void. +// Should also set to avs_void after the value is released +// with avs_copy_value. Consider it the equalvent of setting +// a pointer to NULL +static const AVS_Value avs_void = {'v'}; + +AVSC_API(void, avs_copy_value)(AVS_Value * dest, AVS_Value src); +AVSC_API(void, avs_release_value)(AVS_Value); + +AVSC_INLINE int avs_defined(AVS_Value v) { return v.type != 'v'; } +AVSC_INLINE int avs_is_clip(AVS_Value v) { return v.type == 'c'; } +AVSC_INLINE int avs_is_bool(AVS_Value v) { return v.type == 'b'; } +AVSC_INLINE int avs_is_int(AVS_Value v) { return v.type == 'i'; } +AVSC_INLINE int avs_is_float(AVS_Value v) { return v.type == 'f' || v.type == 'i'; } +AVSC_INLINE int avs_is_string(AVS_Value v) { return v.type == 's'; } +AVSC_INLINE int avs_is_array(AVS_Value v) { return v.type == 'a'; } +AVSC_INLINE int avs_is_error(AVS_Value v) { return v.type == 'e'; } + +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(AVS_Clip *, avs_take_clip)(AVS_Value, AVS_ScriptEnvironment *); +AVSC_API(void, avs_set_to_clip)(AVS_Value *, AVS_Clip *); +#if defined __cplusplus +} +#endif // __cplusplus + +AVSC_INLINE int avs_as_bool(AVS_Value v) + { return v.d.boolean; } +AVSC_INLINE int avs_as_int(AVS_Value v) + { return v.d.integer; } +AVSC_INLINE const char * avs_as_string(AVS_Value v) + { return avs_is_error(v) || avs_is_string(v) ? v.d.string : 0; } +AVSC_INLINE double avs_as_float(AVS_Value v) + { return avs_is_int(v) ? v.d.integer : v.d.floating_pt; } +AVSC_INLINE const char * avs_as_error(AVS_Value v) + { return avs_is_error(v) ? v.d.string : 0; } +AVSC_INLINE const AVS_Value * avs_as_array(AVS_Value v) + { return v.d.array; } +AVSC_INLINE int avs_array_size(AVS_Value v) + { return avs_is_array(v) ? v.array_size : 1; } +AVSC_INLINE AVS_Value avs_array_elt(AVS_Value v, int index) + { return avs_is_array(v) ? v.d.array[index] : v; } + +// only use these functions on am AVS_Value that does not already have +// an active value. Remember, treat AVS_Value as a fat pointer. +AVSC_INLINE AVS_Value avs_new_value_bool(int v0) + { AVS_Value v; v.type = 'b'; v.d.boolean = v0 == 0 ? 
0 : 1; return v; } +AVSC_INLINE AVS_Value avs_new_value_int(int v0) + { AVS_Value v; v.type = 'i'; v.d.integer = v0; return v; } +AVSC_INLINE AVS_Value avs_new_value_string(const char * v0) + { AVS_Value v; v.type = 's'; v.d.string = v0; return v; } +AVSC_INLINE AVS_Value avs_new_value_float(float v0) + { AVS_Value v; v.type = 'f'; v.d.floating_pt = v0; return v;} +AVSC_INLINE AVS_Value avs_new_value_error(const char * v0) + { AVS_Value v; v.type = 'e'; v.d.string = v0; return v; } +#ifndef AVSC_NO_DECLSPEC +AVSC_INLINE AVS_Value avs_new_value_clip(AVS_Clip * v0) + { AVS_Value v; avs_set_to_clip(&v, v0); return v; } +#endif +AVSC_INLINE AVS_Value avs_new_value_array(AVS_Value * v0, int size) + { AVS_Value v; v.type = 'a'; v.d.array = v0; v.array_size = size; return v; } + +///////////////////////////////////////////////////////////////////// +// +// AVS_Clip +// +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(void, avs_release_clip)(AVS_Clip *); +AVSC_API(AVS_Clip *, avs_copy_clip)(AVS_Clip *); + +AVSC_API(const char *, avs_clip_get_error)(AVS_Clip *); // return 0 if no error + +AVSC_API(const AVS_VideoInfo *, avs_get_video_info)(AVS_Clip *); + +AVSC_API(int, avs_get_version)(AVS_Clip *); + +AVSC_API(AVS_VideoFrame *, avs_get_frame)(AVS_Clip *, int n); +// The returned video frame must be released with avs_release_video_frame + +AVSC_API(int, avs_get_parity)(AVS_Clip *, int n); +// return field parity if field_based, else parity of first field in frame + +AVSC_API(int, avs_get_audio)(AVS_Clip *, void * buf, + INT64 start, INT64 count); +// start and count are in samples + +AVSC_API(int, avs_set_cache_hints)(AVS_Clip *, + int cachehints, size_t frame_range); +#if defined __cplusplus +} +#endif // __cplusplus + +// This is the callback type used by avs_add_function +typedef AVS_Value (AVSC_CC * AVS_ApplyFunc) + (AVS_ScriptEnvironment *, AVS_Value args, void * user_data); + +typedef struct AVS_FilterInfo AVS_FilterInfo; +struct AVS_FilterInfo +{ + // these members should not be modified outside of the AVS_ApplyFunc callback + AVS_Clip * child; + AVS_VideoInfo vi; + AVS_ScriptEnvironment * env; + AVS_VideoFrame * (AVSC_CC * get_frame)(AVS_FilterInfo *, int n); + int (AVSC_CC * get_parity)(AVS_FilterInfo *, int n); + int (AVSC_CC * get_audio)(AVS_FilterInfo *, void * buf, + INT64 start, INT64 count); + int (AVSC_CC * set_cache_hints)(AVS_FilterInfo *, int cachehints, + int frame_range); + void (AVSC_CC * free_filter)(AVS_FilterInfo *); + + // Should be set when ever there is an error to report. + // It is cleared before any of the above methods are called + const char * error; + // this is to store whatever and may be modified at will + void * user_data; +}; + +// Create a new filter +// fi is set to point to the AVS_FilterInfo so that you can +// modify it once it is initilized. +// store_child should generally be set to true. If it is not +// set than ALL methods (the function pointers) must be defined +// If it is set than you do not need to worry about freeing the child +// clip. +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(AVS_Clip *, avs_new_c_filter)(AVS_ScriptEnvironment * e, + AVS_FilterInfo * * fi, + AVS_Value child, int store_child); +#if defined __cplusplus +} +#endif // __cplusplus + + +///////////////////////////////////////////////////////////////////// +// +// AVS_ScriptEnvironment +// + +// For GetCPUFlags. These are backwards-compatible with those in VirtualDub. 
+enum { + /* slowest CPU to support extension */ + AVS_CPU_FORCE = 0x01, // N/A + AVS_CPU_FPU = 0x02, // 386/486DX + AVS_CPU_MMX = 0x04, // P55C, K6, PII + AVS_CPU_INTEGER_SSE = 0x08, // PIII, Athlon + AVS_CPU_SSE = 0x10, // PIII, Athlon XP/MP + AVS_CPU_SSE2 = 0x20, // PIV, Hammer + AVS_CPU_3DNOW = 0x40, // K6-2 + AVS_CPU_3DNOW_EXT = 0x80, // Athlon + AVS_CPU_X86_64 = 0xA0, // Hammer (note: equiv. to 3DNow + SSE2, + // which only Hammer will have anyway) +}; + +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(const char *, avs_get_error)(AVS_ScriptEnvironment *); // return 0 if no error + +AVSC_API(long, avs_get_cpu_flags)(AVS_ScriptEnvironment *); +AVSC_API(int, avs_check_version)(AVS_ScriptEnvironment *, int version); + +AVSC_API(char *, avs_save_string)(AVS_ScriptEnvironment *, const char* s, int length); +AVSC_API(char *, avs_sprintf)(AVS_ScriptEnvironment *, const char * fmt, ...); + +AVSC_API(char *, avs_vsprintf)(AVS_ScriptEnvironment *, const char * fmt, va_list val); + // note: val is really a va_list; I hope everyone typedefs va_list to a pointer + +AVSC_API(int, avs_add_function)(AVS_ScriptEnvironment *, + const char * name, const char * params, + AVS_ApplyFunc apply, void * user_data); + +AVSC_API(int, avs_function_exists)(AVS_ScriptEnvironment *, const char * name); + +AVSC_API(AVS_Value, avs_invoke)(AVS_ScriptEnvironment *, const char * name, + AVS_Value args, const char** arg_names); +// The returned value must be be released with avs_release_value + +AVSC_API(AVS_Value, avs_get_var)(AVS_ScriptEnvironment *, const char* name); +// The returned value must be be released with avs_release_value + +AVSC_API(int, avs_set_var)(AVS_ScriptEnvironment *, const char* name, AVS_Value val); + +AVSC_API(int, avs_set_global_var)(AVS_ScriptEnvironment *, const char* name, const AVS_Value val); + +//void avs_push_context(AVS_ScriptEnvironment *, int level=0); +//void avs_pop_context(AVS_ScriptEnvironment *); + +AVSC_API(AVS_VideoFrame *, avs_new_video_frame_a)(AVS_ScriptEnvironment *, + const AVS_VideoInfo * vi, int align); +// align should be at least 16 +#if defined __cplusplus +} +#endif // __cplusplus + +#ifndef AVSC_NO_DECLSPEC +AVSC_INLINE +AVS_VideoFrame * avs_new_video_frame(AVS_ScriptEnvironment * env, + const AVS_VideoInfo * vi) + {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);} + +AVSC_INLINE +AVS_VideoFrame * avs_new_frame(AVS_ScriptEnvironment * env, + const AVS_VideoInfo * vi) + {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);} +#endif + +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(int, avs_make_writable)(AVS_ScriptEnvironment *, AVS_VideoFrame * * pvf); + +AVSC_API(void, avs_bit_blt)(AVS_ScriptEnvironment *, unsigned char* dstp, int dst_pitch, const unsigned char* srcp, int src_pitch, int row_size, int height); + +typedef void (AVSC_CC *AVS_ShutdownFunc)(void* user_data, AVS_ScriptEnvironment * env); +AVSC_API(void, avs_at_exit)(AVS_ScriptEnvironment *, AVS_ShutdownFunc function, void * user_data); + +AVSC_API(AVS_VideoFrame *, avs_subframe)(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height); +// The returned video frame must be be released + +AVSC_API(int, avs_set_memory_max)(AVS_ScriptEnvironment *, int mem); + +AVSC_API(int, avs_set_working_dir)(AVS_ScriptEnvironment *, const char * newdir); + +// avisynth.dll exports this; it's a way to use it as a library, without +// writing an AVS script or without going through AVIFile. 
+AVSC_API(AVS_ScriptEnvironment *, avs_create_script_environment)(int version); +#if defined __cplusplus +} +#endif // __cplusplus + +// this symbol is the entry point for the plugin and must +// be defined +AVSC_EXPORT +const char * AVSC_CC avisynth_c_plugin_init(AVS_ScriptEnvironment* env); + + +#if defined __cplusplus +extern "C" +{ +#endif // __cplusplus +AVSC_API(void, avs_delete_script_environment)(AVS_ScriptEnvironment *); + + +AVSC_API(AVS_VideoFrame *, avs_subframe_planar)(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height, int rel_offsetU, int rel_offsetV, int new_pitchUV); +// The returned video frame must be be released +#if defined __cplusplus +} +#endif // __cplusplus + +#endif //__AVXSYNTH_C__
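The comments in the header above spell out the ownership rules of this C API: AVS_Value is treated as a "fat pointer" that must be released with avs_release_value, and every clip or frame the library returns has a matching release call. The following minimal host-program sketch (not part of the snapshot) illustrates that discipline; it only calls functions declared above, the AVS_INTERFACE_25 constant is assumed to come from the earlier, unshown part of the header, and the script path is a placeholder.

/* Illustrative sketch only -- shows the release discipline described in the
   comments above.  Header name, AVS_INTERFACE_25 and "test.avs" are assumptions. */
#include <stdio.h>
#include "avxsynth_c.h"                 /* assumed name of the header shown above */

int main(void)
{
    AVS_ScriptEnvironment *env = avs_create_script_environment(AVS_INTERFACE_25);
    if (!env)
        return 1;

    AVS_Value arg = avs_new_value_string("test.avs");      /* hypothetical script */
    AVS_Value res = avs_invoke(env, "Import", arg, NULL);  /* returns a clip or an error */

    if (avs_is_error(res)) {
        fprintf(stderr, "Import failed: %s\n", avs_as_error(res));
    } else if (avs_is_clip(res)) {
        AVS_Clip *clip = avs_take_clip(res, env);           /* takes a reference */
        const AVS_VideoInfo *vi = avs_get_video_info(clip);

        AVS_VideoFrame *frame = avs_get_frame(clip, 0);     /* must be released */
        printf("%dx%d, pitch %d\n", vi->width, vi->height, avs_get_pitch(frame));

        avs_release_video_frame(frame);
        avs_release_clip(clip);
    }

    avs_release_value(res);             /* treat AVS_Value like a pointer: always release */
    avs_delete_script_environment(env);
    return 0;
}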
View file
x264-snapshot-20130723-2245.tar.bz2/extras/cl.h
Added
@@ -0,0 +1,1209 @@ +/******************************************************************************* + * Copyright (c) 2008 - 2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#include "cl_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +typedef cl_bitfield cl_command_queue_properties; +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +typedef cl_bitfield cl_mem_migration_flags; +typedef cl_uint cl_image_info; +typedef cl_uint cl_buffer_create_type; +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +typedef cl_uint cl_program_binary_type; +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +typedef cl_uint cl_kernel_work_group_info; +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; + + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; + cl_mem buffer; +} cl_image_desc; + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define 
CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#define CL_INVALID_PROPERTY -64 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 + +/* OpenCL Version */ +#define CL_VERSION_1_0 1 +#define CL_VERSION_1_1 1 +#define CL_VERSION_1_2 1 + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#define CL_DEVICE_TYPE_CUSTOM (1 << 4) +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F +#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A +#define CL_DEVICE_NAME 0x102B +#define 
CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D +#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */ +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D +#define CL_DEVICE_LINKER_AVAILABLE 0x103E +#define CL_DEVICE_BUILT_IN_KERNELS 0x103F +#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 +#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 +#define CL_DEVICE_PARENT_DEVICE 0x1042 +#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 +#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 +#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 +#define CL_DEVICE_PARTITION_TYPE 0x1046 +#define CL_DEVICE_REFERENCE_COUNT 0x1047 +#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 +#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B + +/* cl_device_fp_config - bitfield */ +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#define CL_FP_SOFT_FLOAT (1 << 6) +#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) + +/* cl_device_mem_cache_type */ +#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 + +/* cl_device_local_mem_type */ +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 + +/* cl_device_exec_capabilities - bitfield */ +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#define CL_CONTEXT_NUM_DEVICES 0x1083 + +/* cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 +#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 + +/* cl_device_partition_property */ +#define CL_DEVICE_PARTITION_EQUALLY 0x1086 +#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 +#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 + +/* cl_device_affinity_domain */ +#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) +#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) +#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) +#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) +#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) +#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) + +/* cl_command_queue_info */ +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 + +/* cl_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE (1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define 
CL_MEM_COPY_HOST_PTR (1 << 5) +// reserved (1 << 6) +#define CL_MEM_HOST_WRITE_ONLY (1 << 7) +#define CL_MEM_HOST_READ_ONLY (1 << 8) +#define CL_MEM_HOST_NO_ACCESS (1 << 9) + +/* cl_mem_migration_flags - bitfield */ +#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) +#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) + +/* cl_channel_order */ +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 +#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC +#define CL_DEPTH 0x10BD +#define CL_DEPTH_STENCIL 0x10BE + +/* cl_channel_type */ +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE +#define CL_UNORM_INT24 0x10DF + +/* cl_mem_object_type */ +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 +#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 +#define CL_MEM_OBJECT_IMAGE1D 0x10F4 +#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 +#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 + +/* cl_image_info */ +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 +#define CL_IMAGE_DEPTH 0x1116 +#define CL_IMAGE_ARRAY_SIZE 0x1117 +#define CL_IMAGE_BUFFER 0x1118 +#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 +#define CL_IMAGE_NUM_SAMPLES 0x111A + +/* cl_addressing_mode */ +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 + +/* cl_filter_mode */ +#define CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +/* cl_sampler_info */ +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 + +/* cl_map_flags - bitfield */ +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) +#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 +#define CL_PROGRAM_NUM_KERNELS 0x1167 +#define CL_PROGRAM_KERNEL_NAMES 0x1168 + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 +#define CL_PROGRAM_BINARY_TYPE 0x1184 + +/* cl_program_binary_type */ +#define 
CL_PROGRAM_BINARY_TYPE_NONE 0x0 +#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 +#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 +#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 + +/* cl_build_status */ +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 +#define CL_KERNEL_ATTRIBUTES 0x1195 + +/* cl_kernel_arg_info */ +#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 +#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 +#define CL_KERNEL_ARG_TYPE_NAME 0x1198 +#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 +#define CL_KERNEL_ARG_NAME 0x119A + +/* cl_kernel_arg_address_qualifier */ +#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B +#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C +#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D +#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E + +/* cl_kernel_arg_access_qualifier */ +#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 +#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 +#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 +#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 + +/* cl_kernel_arg_type_qualifer */ +#define CL_KERNEL_ARG_TYPE_NONE 0 +#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) +#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) +#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 +#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 + +/* cl_event_info */ +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#define CL_EVENT_CONTEXT 0x11D4 + +/* cl_command_type */ +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define CL_COMMAND_WRITE_IMAGE 0x11F7 +#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 +#define CL_COMMAND_BARRIER 0x1205 +#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 +#define CL_COMMAND_FILL_BUFFER 0x1207 +#define CL_COMMAND_FILL_IMAGE 0x1208 + +/* command execution status */ +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +/* cl_buffer_create_type */ +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +/* cl_profiling_info */ +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 + +/********************************************************************************************************/ + 
+/* Platform API */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id /* platform */, + cl_platform_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Device APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id /* platform */, + cl_device_type /* device_type */, + cl_uint /* num_entries */, + cl_device_id * /* devices */, + cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id /* device */, + cl_device_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevices(cl_device_id /* in_device */, + const cl_device_partition_property * /* properties */, + cl_uint /* num_devices */, + cl_device_id * /* out_devices */, + cl_uint * /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +/* Context APIs */ +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * /* properties */, + cl_uint /* num_devices */, + const cl_device_id * /* devices */, + void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * /* properties */, + cl_device_type /* device_type */, + void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context /* context */, + cl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Command Queue APIs */ +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue_properties /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue /* command_queue */, + cl_command_queue_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Memory Object APIs */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context /* context */, + 
cl_mem_flags /* flags */, + size_t /* size */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateSubBuffer(cl_mem /* buffer */, + cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + const cl_image_desc * /* image_desc */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context /* context */, + cl_mem_flags /* flags */, + cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, + cl_image_format * /* image_formats */, + cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem /* memobj */, + cl_mem_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem /* image */, + cl_image_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorCallback( cl_mem /* memobj */, + void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1; + +/* Sampler APIs */ +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSampler(cl_context /* context */, + cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, + cl_filter_mode /* filter_mode */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler /* sampler */, + cl_sampler_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context /* context */, + cl_uint /* count */, + const char ** /* strings */, + const size_t * /* lengths */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const size_t * /* lengths */, + const unsigned char ** /* binaries */, + cl_int * /* binary_status */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBuiltInKernels(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* kernel_names */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern 
CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCompileProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_headers */, + const cl_program * /* input_headers */, + const char ** /* header_include_names */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_program CL_API_CALL +clLinkProgram(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_programs */, + const cl_program * /* input_programs */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */, + cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2; + + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program /* program */, + cl_program_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program /* program */, + cl_device_id /* device */, + cl_program_build_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program /* program */, + const char * /* kernel_name */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program /* program */, + cl_uint /* num_kernels */, + cl_kernel * /* kernels */, + cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel /* kernel */, + cl_uint /* arg_index */, + size_t /* arg_size */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel /* kernel */, + cl_kernel_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelArgInfo(cl_kernel /* kernel */, + cl_uint /* arg_indx */, + cl_kernel_arg_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel /* 
kernel */, + cl_device_id /* device */, + cl_kernel_work_group_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Event Object APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint /* num_events */, + const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event /* event */, + cl_event_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateUserEvent(cl_context /* context */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetUserEventStatus(cl_event /* event */, + cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetEventCallback( cl_event /* event */, + cl_int /* command_exec_callback_type */, + void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +/* Profiling APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event /* event */, + cl_profiling_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + size_t /* offset */, + size_t /* size */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + size_t /* offset */, + size_t /* size */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* 
buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + size_t /* src_offset */, + size_t /* dst_offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferRect(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin */, + const size_t * /* dst_origin */, + const size_t * /* region */, + size_t /* src_row_pitch */, + size_t /* src_slice_pitch */, + size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_read */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* row_pitch */, + size_t /* slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_write */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + const void * /* fill_color */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_image */, + const size_t * /* src_origin[3] */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin[3] */, + const size_t * /* region[3] */, + size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern 
CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_image */, + size_t /* src_offset */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t * /* image_row_pitch */, + size_t * /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue /* command_queue */, + cl_mem /* memobj */, + void * /* mapped_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueTask(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue /* command_queue */, + void (CL_CALLBACK * /*user_func*/)(void *), + void * /* args */, + size_t /* cb_args */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_list */, + const void ** /* args_mem_loc */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + + +/* Extension function access + * 
+ * Returns the extension function address for the given function name, + * or NULL if a valid function can not be found. The client must + * check to make sure the address is not NULL, before using or + * calling the returned function address. + */ +extern CL_API_ENTRY void * CL_API_CALL +clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */, + const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2; + + +// Deprecated OpenCL 1.1 APIs +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage2D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_row_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage3D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_depth */, + size_t /* image_row_pitch */, + size_t /* image_slice_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue /* command_queue */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue /* command_queue */, + cl_uint /* num_events */, + const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL +clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_H */
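For orientation, the sketch below strings the entry points declared in this header into the usual host-side sequence: pick a platform and device, create a context and queue, build a program, run a kernel, and release every object. It is illustrative only; error handling is abbreviated, and the kernel source and buffer size are made up rather than taken from how x264 itself drives OpenCL.

/* Illustrative only -- a bare-bones host sequence using the declarations above. */
#include <stdio.h>
#include "cl.h"                          /* the copy bundled under extras/ */

static const char *src =
    "__kernel void fill(__global int *buf) { buf[get_global_id(0)] = 42; }";

int main(void)
{
    cl_platform_id platform;
    cl_device_id device;
    cl_int err;

    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);

    cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &err);

    cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
    if (clBuildProgram(prog, 1, &device, NULL, NULL, NULL) != CL_SUCCESS) {
        char log[4096];
        clGetProgramBuildInfo(prog, device, CL_PROGRAM_BUILD_LOG,
                              sizeof(log), log, NULL);
        fprintf(stderr, "build failed:\n%s\n", log);
        return 1;
    }

    cl_kernel kernel = clCreateKernel(prog, "fill", &err);
    cl_mem buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, 256 * sizeof(int), NULL, &err);
    clSetKernelArg(kernel, 0, sizeof(buf), &buf);

    size_t global = 256;
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);

    int out[256];
    clEnqueueReadBuffer(queue, buf, CL_TRUE, 0, sizeof(out), out, 0, NULL, NULL);
    printf("out[0] = %d\n", out[0]);

    /* every created object has a matching release call declared above */
    clReleaseMemObject(buf);
    clReleaseKernel(kernel);
    clReleaseProgram(prog);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
    return 0;
}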
View file
x264-snapshot-20130723-2245.tar.bz2/extras/cl_platform.h
Added
@@ -0,0 +1,1268 @@ +/********************************************************************************** + * Copyright (c) 2008-2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include <AvailabilityMacros.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #ifndef UNAVAILABLE_ATTRIBUTE + #define UNAVAILABLE_ATTRIBUTE + #endif + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #else + #define CL_API_SUFFIX__VERSION_1_0 UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE + #endif + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + #else + #define CL_API_SUFFIX__VERSION_1_1 UNAVAILABLE_ATTRIBUTE + #define GCL_API_SUFFIX__VERSION_1_1 UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATE CL_EXT_SUFFIX__VERSION_1_0 + #endif + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define 
CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + #else + #define CL_API_SUFFIX__VERSION_1_2 UNAVAILABLE_ATTRIBUTE + #define GCL_API_SUFFIX__VERSION_1_2 UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX__VERSION_1_1 + #endif +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_API_SUFFIX__VERSION_1_2 + #define CL_EXT_SUFFIX__VERSION_1_2 + + #ifdef __GNUC__ + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif + #elif _WIN32 + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) + #endif + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif +#endif + +#if (defined (_WIN32) && defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define 
CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561 +#define CL_M_SQRT2 1.414213562373095145475 +#define CL_M_SQRT1_2 0.707106781186547572737 + +#define CL_M_E_F 2.71828174591064f +#define CL_M_LOG2E_F 1.44269502162933f +#define CL_M_LOG10E_F 0.43429449200630f +#define CL_M_LN2_F 0.69314718246460f +#define CL_M_LN10_F 2.30258512496948f +#define CL_M_PI_F 3.14159274101257f +#define CL_M_PI_2_F 1.57079637050629f +#define CL_M_PI_4_F 0.78539818525314f +#define CL_M_1_PI_F 0.31830987334251f +#define CL_M_2_PI_F 0.63661974668503f +#define CL_M_2_SQRTPI_F 1.12837922573090f +#define CL_M_SQRT2_F 1.41421353816986f +#define CL_M_SQRT1_2_F 0.70710676908493f + +#define CL_NAN (CL_INFINITY - CL_INFINITY) +#define CL_HUGE_VALF ((cl_float) 1e50) +#define CL_HUGE_VAL ((cl_double) 1e500) +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#else + +#include <stdint.h> + +/* scalar types */ +typedef int8_t cl_char; +typedef uint8_t cl_uchar; +typedef int16_t cl_short __attribute__((aligned(2))); +typedef uint16_t cl_ushort __attribute__((aligned(2))); +typedef int32_t cl_int __attribute__((aligned(4))); +typedef uint32_t cl_uint __attribute__((aligned(4))); +typedef int64_t cl_long __attribute__((aligned(8))); +typedef uint64_t cl_ulong __attribute__((aligned(8))); + +typedef uint16_t cl_half __attribute__((aligned(2))); +typedef float cl_float __attribute__((aligned(4))); +typedef double cl_double __attribute__((aligned(8))); + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 0x1.fffffep127f +#define CL_FLT_MIN 0x1.0p-126f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define 
CL_DBL_MAX 0x1.fffffffffffffp1023 +#define CL_DBL_MIN 0x1.0p-1022 +#define CL_DBL_EPSILON 0x1.0p-52 + +#define CL_M_E 2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561 +#define CL_M_SQRT2 1.414213562373095145475 +#define CL_M_SQRT1_2 0.707106781186547572737 + +#define CL_M_E_F 2.71828174591064f +#define CL_M_LOG2E_F 1.44269502162933f +#define CL_M_LOG10E_F 0.43429449200630f +#define CL_M_LN2_F 0.69314718246460f +#define CL_M_LN10_F 2.30258512496948f +#define CL_M_PI_F 3.14159274101257f +#define CL_M_PI_2_F 1.57079637050629f +#define CL_M_PI_4_F 0.78539818525314f +#define CL_M_1_PI_F 0.31830987334251f +#define CL_M_2_PI_F 0.63661974668503f +#define CL_M_2_SQRTPI_F 1.12837922573090f +#define CL_M_SQRT2_F 1.41421353816986f +#define CL_M_SQRT1_2_F 0.70710676908493f + +#if defined( __GNUC__ ) + #define CL_HUGE_VALF __builtin_huge_valf() + #define CL_HUGE_VAL __builtin_huge_val() + #define CL_NAN __builtin_nanf( "" ) +#else + #define CL_HUGE_VALF ((cl_float) 1e50) + #define CL_HUGE_VAL ((cl_double) 1e500) + float nanf( const char * ); + #define CL_NAN nanf( "" ) +#endif +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#endif + +#include <stddef.h> + +/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */ +typedef unsigned int cl_GLuint; +typedef int cl_GLint; +typedef unsigned int cl_GLenum; + +/* + * Vector types + * + * Note: OpenCL requires that all types be naturally aligned. + * This means that vector types must be naturally aligned. + * For example, a vector of four floats must be aligned to + * a 16 byte boundary (calculated as 4 * the natural 4-byte + * alignment of the float). The alignment qualifiers here + * will only function properly if your compiler supports them + * and if you don't actively work to defeat them. For example, + * in order for a cl_float4 to be 16 byte aligned in a struct, + * the start of the struct must itself be 16-byte aligned. + * + * Maintaining proper alignment is the user's responsibility. + */ + +/* Define basic vector types */ +#if defined( __VEC__ ) + #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. 
*/ + typedef vector unsigned char __cl_uchar16; + typedef vector signed char __cl_char16; + typedef vector unsigned short __cl_ushort8; + typedef vector signed short __cl_short8; + typedef vector unsigned int __cl_uint4; + typedef vector signed int __cl_int4; + typedef vector float __cl_float4; + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_UINT4__ 1 + #define __CL_INT4__ 1 + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <xmmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef float __cl_float4 __attribute__((vector_size(16))); + #else + typedef __m128 __cl_float4; + #endif + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE2__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <emmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); + typedef cl_char __cl_char16 __attribute__((vector_size(16))); + typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); + typedef cl_short __cl_short8 __attribute__((vector_size(16))); + typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); + typedef cl_int __cl_int4 __attribute__((vector_size(16))); + typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); + typedef cl_long __cl_long2 __attribute__((vector_size(16))); + typedef cl_double __cl_double2 __attribute__((vector_size(16))); + #else + typedef __m128i __cl_uchar16; + typedef __m128i __cl_char16; + typedef __m128i __cl_ushort8; + typedef __m128i __cl_short8; + typedef __m128i __cl_uint4; + typedef __m128i __cl_int4; + typedef __m128i __cl_ulong2; + typedef __m128i __cl_long2; + typedef __m128d __cl_double2; + #endif + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_INT4__ 1 + #define __CL_UINT4__ 1 + #define __CL_ULONG2__ 1 + #define __CL_LONG2__ 1 + #define __CL_DOUBLE2__ 1 +#endif + +#if defined( __MMX__ ) + #include <mmintrin.h> + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); + typedef cl_char __cl_char8 __attribute__((vector_size(8))); + typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); + typedef cl_short __cl_short4 __attribute__((vector_size(8))); + typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); + typedef cl_int __cl_int2 __attribute__((vector_size(8))); + typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); + typedef cl_long __cl_long1 __attribute__((vector_size(8))); + typedef cl_float __cl_float2 __attribute__((vector_size(8))); + #else + typedef __m64 __cl_uchar8; + typedef __m64 __cl_char8; + typedef __m64 __cl_ushort4; + typedef __m64 __cl_short4; + typedef __m64 __cl_uint2; + typedef __m64 __cl_int2; + typedef __m64 __cl_ulong1; + typedef __m64 __cl_long1; + typedef __m64 __cl_float2; + #endif + #define __CL_UCHAR8__ 1 + #define __CL_CHAR8__ 1 + #define __CL_USHORT4__ 1 + #define __CL_SHORT4__ 1 + #define __CL_INT2__ 1 + #define __CL_UINT2__ 1 + #define __CL_ULONG1__ 1 + #define __CL_LONG1__ 1 + #define __CL_FLOAT2__ 1 +#endif + +#if defined( __AVX__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <immintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_float __cl_float8 __attribute__((vector_size(32))); + typedef cl_double __cl_double4 __attribute__((vector_size(32))); + #else + typedef __m256 __cl_float8; + typedef __m256d __cl_double4; + #endif + 
#define __CL_FLOAT8__ 1 + #define __CL_DOUBLE4__ 1 +#endif + +/* Define alignment keys */ +#if defined( __GNUC__ ) + #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) +#elif defined( _WIN32) && (_MSC_VER) + /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ + /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ + /* #include <crtdefs.h> */ + /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ + #define CL_ALIGNED(_x) +#else + #warning Need to implement some method to align data here + #define CL_ALIGNED(_x) +#endif + +/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + /* .xyzw and .s0123...{f|F} are supported */ + #define CL_HAS_NAMED_VECTOR_FIELDS 1 + /* .hi and .lo are supported */ + #define CL_HAS_HI_LO_VECTOR_FIELDS 1 +#endif + +/* Define cl_vector types */ + +/* ---- cl_charn ---- */ +typedef union +{ + cl_char CL_ALIGNED(2) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y; }; + __extension__ struct{ cl_char s0, s1; }; + __extension__ struct{ cl_char lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2; +#endif +}cl_char2; + +typedef union +{ + cl_char CL_ALIGNED(4) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y, z, w; }; + __extension__ struct{ cl_char s0, s1, s2, s3; }; + __extension__ struct{ cl_char2 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[2]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4; +#endif +}cl_char4; + +/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y, z, w; }; + __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y; }; + __extension__ struct{ cl_uchar s0, s1; }; + __extension__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y, z, w; }; + __extension__ struct{ cl_uchar s0, s1, s2, s3; }; + __extension__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y, z, w; }; + __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y; }; + __extension__ struct{ cl_short s0, s1; }; + __extension__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y, z, w; }; + __extension__ struct{ cl_short s0, s1, s2, s3; }; + __extension__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y, z, w; }; + __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y; }; + __extension__ struct{ cl_ushort s0, s1; }; + __extension__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y, z, w; }; + __extension__ struct{ cl_ushort s0, s1, s2, s3; }; + __extension__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y, z, w; }; + __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y; }; + __extension__ struct{ cl_int s0, s1; }; + __extension__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y, z, w; }; + __extension__ struct{ cl_int s0, s1, s2, s3; }; + __extension__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y, z, w; }; + __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y; }; + __extension__ struct{ cl_uint s0, s1; }; + __extension__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y, z, w; }; + __extension__ struct{ cl_uint s0, s1, s2, s3; }; + __extension__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y, z, w; }; + __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y; }; + __extension__ struct{ cl_long s0, s1; }; + __extension__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y, z, w; }; + __extension__ struct{ cl_long s0, s1, s2, s3; }; + __extension__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y, z, w; }; + __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y; }; + __extension__ struct{ cl_ulong s0, s1; }; + __extension__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y, z, w; }; + __extension__ struct{ cl_ulong s0, s1, s2, s3; }; + __extension__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y, z, w; }; + __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y; }; + __extension__ struct{ cl_float s0, s1; }; + __extension__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y, z, w; }; + __extension__ struct{ cl_float s0, s1, s2, s3; }; + __extension__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y, z, w; }; + __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y; }; + __extension__ struct{ cl_double s0, s1; }; + __extension__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y, z, w; }; + __extension__ struct{ cl_double s0, s1, s2, s3; }; + __extension__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y, z, w; }; + __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __extension__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) + __extension__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __extension__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. + * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" + * Each line thereafter of OpenCL C source must end with: \n\ + * The last line ends in "; + * + * Example: + * + * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ + * kernel void foo( int a, float * b ) \n\ + * { \n\ + * // my comment \n\ + * *b[ get_global_id(0)] = a; \n\ + * } \n\ + * "; + * + * This should correctly set up the line, (column) and file information for your source + * string so you can do source level debugging. + */ +#define __CL_STRINGIFY( _x ) # _x +#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) +#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" + +#ifdef __cplusplus +} +#endif + +#endif /* __CL_PLATFORM_H */
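The vector unions declared above expose the same storage three ways: the always-available .s[] array, the named fields (.x/.y/.z/.w, .s0…, .lo/.hi) that exist only when CL_HAS_NAMED_VECTOR_FIELDS is defined (GCC without strict-ANSI mode), and the optional native SIMD views (.v2/.v4/…). A minimal sketch of that layout, assuming the bundled extras/cl_platform.h is on the include path; the program itself is illustrative and not part of the snapshot:

#include <stdio.h>
#include "cl_platform.h"

int main( void )
{
    cl_float4 v;
    /* .s[] is portable and always present */
    for( int i = 0; i < 4; i++ )
        v.s[i] = (cl_float)i;
#ifdef CL_HAS_NAMED_VECTOR_FIELDS
    /* .x/.y/.z/.w and .s0..s3 alias the same 16 bytes */
    printf( "x=%g w=%g s2=%g\n", v.x, v.w, v.s2 );
    /* .lo/.hi are cl_float2 views of s[0..1] and s[2..3] */
    printf( "hi = (%g, %g)\n", v.hi.s0, v.hi.s1 );
#else
    printf( "s0=%g s3=%g\n", v.s[0], v.s[3] );
#endif
    return 0;
}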
View file
x264-snapshot-20130723-2245.tar.bz2/extras/windowsPorts
Added
+(directory)
View file
x264-snapshot-20130723-2245.tar.bz2/extras/windowsPorts/basicDataTypeConversions.h
Added
@@ -0,0 +1,85 @@ +#ifndef __DATA_TYPE_CONVERSIONS_H__ +#define __DATA_TYPE_CONVERSIONS_H__ + +#include <stdint.h> +#include <wchar.h> + +#ifdef __cplusplus +namespace avxsynth { +#endif // __cplusplus + +typedef int64_t __int64; +typedef int32_t __int32; +#ifdef __cplusplus +typedef bool BOOL; +#else +typedef uint32_t BOOL; +#endif // __cplusplus +typedef void* HMODULE; +typedef void* LPVOID; +typedef void* PVOID; +typedef PVOID HANDLE; +typedef HANDLE HWND; +typedef HANDLE HINSTANCE; +typedef void* HDC; +typedef void* HBITMAP; +typedef void* HICON; +typedef void* HFONT; +typedef void* HGDIOBJ; +typedef void* HBRUSH; +typedef void* HMMIO; +typedef void* HACMSTREAM; +typedef void* HACMDRIVER; +typedef void* HIC; +typedef void* HACMOBJ; +typedef HACMSTREAM* LPHACMSTREAM; +typedef void* HACMDRIVERID; +typedef void* LPHACMDRIVER; +typedef unsigned char BYTE; +typedef BYTE* LPBYTE; +typedef char TCHAR; +typedef TCHAR* LPTSTR; +typedef const TCHAR* LPCTSTR; +typedef char* LPSTR; +typedef LPSTR LPOLESTR; +typedef const char* LPCSTR; +typedef LPCSTR LPCOLESTR; +typedef wchar_t WCHAR; +typedef unsigned short WORD; +typedef unsigned int UINT; +typedef UINT MMRESULT; +typedef uint32_t DWORD; +typedef DWORD COLORREF; +typedef DWORD FOURCC; +typedef DWORD HRESULT; +typedef DWORD* LPDWORD; +typedef DWORD* DWORD_PTR; +typedef int32_t LONG; +typedef int32_t* LONG_PTR; +typedef LONG_PTR LRESULT; +typedef uint32_t ULONG; +typedef uint32_t* ULONG_PTR; +//typedef __int64_t intptr_t; +typedef uint64_t _fsize_t; + + +// +// Structures +// + +typedef struct _GUID { + DWORD Data1; + WORD Data2; + WORD Data3; + BYTE Data4[8]; +} GUID; + +typedef GUID REFIID; +typedef GUID CLSID; +typedef CLSID* LPCLSID; +typedef GUID IID; + +#ifdef __cplusplus +}; // namespace avxsynth +#endif // __cplusplus +#endif // __DATA_TYPE_CONVERSIONS_H__
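The header above pins the usual Win32 aliases to fixed-width types, so code written against DWORD, LONG or GUID keeps 32-bit semantics on LP64 Linux, where a native long would be 64 bits wide (compiled as C++ everything additionally lands in the avxsynth namespace). A small, purely illustrative sanity check, assuming the header is reachable at the path shown:

#include <stdio.h>
#include "extras/windowsPorts/basicDataTypeConversions.h"

int main( void )
{
    DWORD d = 0xFFFFFFFFu;  /* uint32_t, not the platform's unsigned long */
    LONG  l = -1;           /* int32_t even on LP64, matching Win32 LONG */
    GUID  g = { 0x12345678, 0x9abc, 0xdef0, { 0, 1, 2, 3, 4, 5, 6, 7 } };
    printf( "sizeof(DWORD)=%zu sizeof(LONG)=%zu sizeof(GUID)=%zu d=%u l=%d Data1=%#x\n",
            sizeof d, sizeof l, sizeof g, (unsigned)d, (int)l, (unsigned)g.Data1 );
    return 0;
}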
View file
x264-snapshot-20130723-2245.tar.bz2/extras/windowsPorts/windows2linux.h
Added
@@ -0,0 +1,77 @@ +#ifndef __WINDOWS2LINUX_H__ +#define __WINDOWS2LINUX_H__ + +/* + * LINUX SPECIFIC DEFINITIONS +*/ +// +// Data types conversions +// +#include <stdlib.h> +#include <string.h> +#include "basicDataTypeConversions.h" + +#ifdef __cplusplus +namespace avxsynth { +#endif // __cplusplus +// +// purposefully define the following MSFT definitions +// to mean nothing (as they do not mean anything on Linux) +// +#define __stdcall +#define __cdecl +#define noreturn +#define __declspec(x) +#define STDAPI extern "C" HRESULT +#define STDMETHODIMP HRESULT __stdcall +#define STDMETHODIMP_(x) x __stdcall + +#define STDMETHOD(x) virtual HRESULT x +#define STDMETHOD_(a, x) virtual a x + +#ifndef TRUE +#define TRUE true +#endif + +#ifndef FALSE +#define FALSE false +#endif + +#define S_OK (0x00000000) +#define S_FALSE (0x00000001) +#define E_NOINTERFACE (0X80004002) +#define E_POINTER (0x80004003) +#define E_FAIL (0x80004005) +#define E_OUTOFMEMORY (0x8007000E) + +#define INVALID_HANDLE_VALUE ((HANDLE)((LONG_PTR)-1)) +#define FAILED(hr) ((hr) & 0x80000000) +#define SUCCEEDED(hr) (!FAILED(hr)) + + +// +// Functions +// +#define MAKEDWORD(a,b,c,d) ((a << 24) | (b << 16) | (c << 8) | (d)) +#define MAKEWORD(a,b) ((a << 8) | (b)) + +#define lstrlen strlen +#define lstrcpy strcpy +#define lstrcmpi strcasecmp +#define _stricmp strcasecmp +#define InterlockedIncrement(x) __sync_fetch_and_add((x), 1) +#define InterlockedDecrement(x) __sync_fetch_and_sub((x), 1) +// Windows uses (new, old) ordering but GCC has (old, new) +#define InterlockedCompareExchange(x,y,z) __sync_val_compare_and_swap(x,z,y) + +#define UInt32x32To64(a, b) ( (uint64_t) ( ((uint64_t)((uint32_t)(a))) * ((uint32_t)(b)) ) ) +#define Int64ShrlMod32(a, b) ( (uint64_t) ( (uint64_t)(a) >> (b) ) ) +#define Int32x32To64(a, b) ((__int64)(((__int64)((long)(a))) * ((long)(b)))) + +#define MulDiv(nNumber, nNumerator, nDenominator) (int32_t) (((int64_t) (nNumber) * (int64_t) (nNumerator) + (int64_t) ((nDenominator)/2)) / (int64_t) (nDenominator)) + +#ifdef __cplusplus +}; // namespace avxsynth +#endif // __cplusplus + +#endif // __WINDOWS2LINUX_H__
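Two details in the shims above deserve a note: InterlockedCompareExchange swaps its last two arguments because Win32 passes (destination, new, old) while __sync_val_compare_and_swap expects (pointer, old, new); and __sync_fetch_and_add/__sync_fetch_and_sub return the value before the update, whereas the Win32 Interlocked functions return the result, so only callers that ignore the return value behave identically. MulDiv, meanwhile, adds half the denominator before the 64-bit division to get round-to-nearest instead of truncation. A self-contained illustration of that rounding (the macro is restated verbatim so the snippet builds without the header; the sample values are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* same definition as in windows2linux.h above */
#define MulDiv(nNumber, nNumerator, nDenominator) (int32_t) (((int64_t) (nNumber) * (int64_t) (nNumerator) + (int64_t) ((nDenominator)/2)) / (int64_t) (nDenominator))

int main( void )
{
    printf( "%d\n", MulDiv( 1, 90000, 1001 ) ); /* (90000+500)/1001 = 90; plain truncating division gives 89 */
    printf( "%d\n", MulDiv( 7, 10, 4 ) );       /* 72/4 = 18, i.e. 17.5 rounded up */
    printf( "%d\n", MulDiv( -7, 10, 4 ) );      /* (-70+2)/4 = -17: rounding is asymmetric for negative products */
    return 0;
}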
View file
x264-snapshot-20130224-2245.tar.bz2/input/avs.c -> x264-snapshot-20130723-2245.tar.bz2/input/avs.c
Changed
@@ -24,12 +24,30 @@ *****************************************************************************/ #include "input.h" +#if USE_AVXSYNTH +#include <dlfcn.h> +#if SYS_MACOSX +#define avs_open dlopen( "libavxsynth.dylib", RTLD_NOW ) +#else +#define avs_open dlopen( "libavxsynth.so", RTLD_NOW ) +#endif +#define avs_close dlclose +#define avs_address dlsym +#else #include <windows.h> +#define avs_open LoadLibrary( "avisynth" ) +#define avs_close FreeLibrary +#define avs_address GetProcAddress +#endif #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "avs", __VA_ARGS__ ) #define AVSC_NO_DECLSPEC #undef EXTERN_C +#if USE_AVXSYNTH +#include "extras/avxsynth_c.h" +#else #include "extras/avisynth_c.h" +#endif #define AVSC_DECLARE_FUNC(name) name##_func name /* AVS uses a versioned interface to control backwards compatibility */ @@ -40,12 +58,20 @@ #include <libavutil/pixfmt.h> #endif +/* AvxSynth doesn't have yv24, yv16, yv411, or y8, so disable them. */ +#if USE_AVXSYNTH +#define avs_is_yv24( vi ) 0 +#define avs_is_yv16( vi ) 0 +#define avs_is_yv411( vi ) 0 +#define avs_is_y8( vi ) 0 +#endif + /* maximum size of the sequence of filters to try on non script files */ #define AVS_MAX_SEQUENCE 5 #define LOAD_AVS_FUNC(name, continue_on_fail)\ {\ - h->func.name = (void*)GetProcAddress( h->library, #name );\ + h->func.name = (void*)avs_address( h->library, #name );\ if( !continue_on_fail && !h->func.name )\ goto fail;\ } @@ -76,7 +102,7 @@ /* load the library and functions we require from it */ static int x264_avs_load_library( avs_hnd_t *h ) { - h->library = LoadLibrary( "avisynth" ); + h->library = avs_open; if( !h->library ) return -1; LOAD_AVS_FUNC( avs_clip_get_error, 0 ); @@ -93,7 +119,7 @@ LOAD_AVS_FUNC( avs_take_clip, 0 ); return 0; fail: - FreeLibrary( h->library ); + avs_close( h->library ); return -1; } @@ -101,6 +127,9 @@ static void avs_build_filter_sequence( char *filename_ext, const char *filter[AVS_MAX_SEQUENCE+1] ) { int i = 0; +#if USE_AVXSYNTH + const char *all_purpose[] = { "FFVideoSource", 0 }; +#else const char *all_purpose[] = { "FFmpegSource2", "DSS2", "DirectShowSource", 0 }; if( !strcasecmp( filename_ext, "avi" ) ) filter[i++] = "AVISource"; @@ -108,6 +137,7 @@ filter[i++] = "MPEG2Source"; if( !strcasecmp( filename_ext, "dga" ) ) filter[i++] = "AVCSource"; +#endif for( int j = 0; all_purpose[j] && i < AVS_MAX_SEQUENCE; j++ ) filter[i++] = all_purpose[j]; } @@ -123,6 +153,13 @@ static float get_avs_version( avs_hnd_t *h ) { +/* AvxSynth has its version defined starting at 4.0, even though it's based on + AviSynth 2.5.8. This is troublesome for get_avs_version and working around + the new colorspaces in 2.6. So if AvxSynth is detected, explicitly define + the version as 2.58. 
*/ +#if USE_AVXSYNTH + return 2.58f; +#else FAIL_IF_ERROR( !h->func.avs_function_exists( h->env, "VersionNumber" ), "VersionNumber does not exist\n" ) AVS_Value ver = h->func.avs_invoke( h->env, "VersionNumber", avs_new_value_array( NULL, 0 ), NULL ); FAIL_IF_ERROR( avs_is_error( ver ), "unable to determine avisynth version: %s\n", avs_as_error( ver ) ) @@ -130,6 +167,7 @@ float ret = avs_as_float( ver ); h->func.avs_release_value( ver ); return ret; +#endif } static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt ) @@ -219,11 +257,11 @@ } #if !HAVE_SWSCALE /* if swscale is not available, convert the CSP if necessary */ + FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444), + "avisynth >= 2.6 is required for i422/i444 output\n" ) if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I422 && !avs_is_yv16( vi )) || (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) || (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) ) { - FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444), - "avisynth >= 2.6 is required for i422/i444 output\n" ) const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" : opt->output_csp == X264_CSP_I422 ? "YV16" : @@ -270,6 +308,7 @@ opt->input_range = opt->output_range; } #endif + h->func.avs_release_value( res ); info->width = vi->width; @@ -357,7 +396,7 @@ h->func.avs_release_clip( h->clip ); if( h->func.avs_delete_script_environment ) h->func.avs_delete_script_environment( h->env ); - FreeLibrary( h->library ); + avs_close( h->library ); free( h ); return 0; }
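The hunks above let the AviSynth input module load either avisynth.dll through LoadLibrary/GetProcAddress or AvxSynth's libavxsynth.so/.dylib through dlopen/dlsym, hidden behind the avs_open/avs_address/avs_close macros. A stripped-down sketch of that pattern, with hypothetical lib_* names standing in for the macros and only minimal error handling:

#if defined(_WIN32)
#include <windows.h>
#define lib_open( name )    LoadLibrary( name )
#define lib_symbol( h, s )  (void*)GetProcAddress( h, s )
#define lib_close( h )      FreeLibrary( h )
typedef HMODULE lib_handle_t;
#else
#include <dlfcn.h>
#define lib_open( name )    dlopen( name, RTLD_NOW )
#define lib_symbol( h, s )  dlsym( h, s )
#define lib_close( h )      dlclose( h )
typedef void *lib_handle_t;
#endif

#include <stdio.h>

int main( void )
{
    /* library and symbol names follow the diff above; adjust for the local install */
    lib_handle_t h = lib_open( "libavxsynth.so" );
    if( !h )
    {
        fprintf( stderr, "could not load the library\n" );
        return 1;
    }
    void *sym = lib_symbol( h, "avs_create_script_environment" );
    printf( "avs_create_script_environment: %s\n", sym ? "found" : "missing" );
    lib_close( h );
    return 0;
}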
View file
x264-snapshot-20130224-2245.tar.bz2/input/lavf.c -> x264-snapshot-20130723-2245.tar.bz2/input/lavf.c
Changed
@@ -183,8 +183,8 @@
     h->stream_id = i;
     h->next_frame = 0;
     AVCodecContext *c = h->lavf->streams[i]->codec;
-    info->fps_num = h->lavf->streams[i]->r_frame_rate.num;
-    info->fps_den = h->lavf->streams[i]->r_frame_rate.den;
+    info->fps_num = h->lavf->streams[i]->avg_frame_rate.num;
+    info->fps_den = h->lavf->streams[i]->avg_frame_rate.den;
     info->timebase_num = h->lavf->streams[i]->time_base.num;
     info->timebase_den = h->lavf->streams[i]->time_base.den;
     /* lavf is thread unsafe as calling av_read_frame invalidates previously read AVPackets */
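The hunk above switches the reported frame rate from r_frame_rate, which libavformat documents as a guessed base rate, to avg_frame_rate, the container-level average; the latter is usually the safer number for variable-frame-rate input. A hedged sketch for inspecting both fields with a libavformat release of that era (av_register_all() was still required then); error handling is minimal and the file name comes from the command line:

#include <stdio.h>
#include <libavformat/avformat.h>

int main( int argc, char **argv )
{
    if( argc < 2 )
        return 1;
    av_register_all();
    AVFormatContext *fmt = NULL;
    if( avformat_open_input( &fmt, argv[1], NULL, NULL ) < 0 )
        return 1;
    avformat_find_stream_info( fmt, NULL );
    for( unsigned i = 0; i < fmt->nb_streams; i++ )
    {
        AVRational avg = fmt->streams[i]->avg_frame_rate;
        AVRational r   = fmt->streams[i]->r_frame_rate;
        printf( "stream %u: avg_frame_rate %d/%d, r_frame_rate %d/%d\n",
                i, avg.num, avg.den, r.num, r.den );
    }
    avformat_close_input( &fmt );
    return 0;
}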
View file
x264-snapshot-20130224-2245.tar.bz2/input/y4m.c -> x264-snapshot-20130723-2245.tar.bz2/input/y4m.c
Changed
@@ -46,7 +46,6 @@
 static int parse_csp_and_depth( char *csp_name, int *bit_depth )
 {
     int csp = X264_CSP_MAX;
-    *bit_depth = 8;
 
     /* Set colorspace from known variants */
     if( !strncmp( "420", csp_name, 3 ) )
@@ -57,8 +56,8 @@
         csp = X264_CSP_I444;
 
     /* Set high bit depth from known extensions */
-    if( !strncmp( "p", csp_name + 3, 1 ) )
-        *bit_depth = strtol( csp_name + 4, NULL, 10 );
+    if( sscanf( csp_name, "%*d%*[pP]%d", bit_depth ) != 1 )
+        *bit_depth = 8;
 
     return csp;
 }
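The new sscanf format above, "%*d%*[pP]%d", skips the colorspace digits and an upper- or lower-case depth separator, stores the depth if one follows, and otherwise lets the function fall back to 8 bits; unlike the old strncmp/strtol pair it is not tied to a three-character colorspace name. A self-contained check of that behaviour (the helper name depth_of exists only for this example):

#include <stdio.h>

static int depth_of( const char *csp_name )
{
    int bit_depth;
    /* same parse as in the hunk above */
    if( sscanf( csp_name, "%*d%*[pP]%d", &bit_depth ) != 1 )
        bit_depth = 8;
    return bit_depth;
}

int main( void )
{
    /* expected output: 8 10 16 */
    printf( "%d %d %d\n", depth_of( "420" ), depth_of( "422p10" ), depth_of( "444P16" ) );
    return 0;
}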
View file
x264-snapshot-20130224-2245.tar.bz2/tools/checkasm-a.asm -> x264-snapshot-20130723-2245.tar.bz2/tools/checkasm-a.asm
Changed
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2008-2013 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Henrik Gramner <hengar-6@student.ltu.se>
+;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -88,8 +88,7 @@
 ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
 ;-----------------------------------------------------------------------------
 INIT_XMM
-cglobal checkasm_call, 2,15,16
-    SUB rsp, max_args*8+16
+cglobal checkasm_call, 2,15,16,max_args*8+8
     mov r6, r0
     mov [rsp+max_args*8], r1
 
@@ -158,7 +157,6 @@
     mov dword [r1], 0
     mov rax, r9
 .ok:
-    ADD rsp, max_args*8+16
     RET
 
 %else
@@ -207,8 +205,12 @@
 ; int x264_stack_pagealign( int (*func)(), int align )
 ;-----------------------------------------------------------------------------
 cglobal stack_pagealign, 2,2
+    movsxdifnidn r1, r1d
     push rbp
     mov rbp, rsp
+%if WIN64
+    sub rsp, 32 ; shadow space
+%endif
     and rsp, ~0xfff
     sub rsp, r1
     call r0
View file
x264-snapshot-20130224-2245.tar.bz2/tools/checkasm.c -> x264-snapshot-20130723-2245.tar.bz2/tools/checkasm.c
Changed
@@ -61,7 +61,7 @@ { void *pointer; // just for detecting duplicates uint32_t cpu; - uint32_t cycles; + uint64_t cycles; uint32_t den; } bench_t; @@ -137,12 +137,12 @@ static void print_bench(void) { - uint16_t nops[10000] = {0}; + uint16_t nops[10000]; int nfuncs, nop_time=0; for( int i = 0; i < 10000; i++ ) { - int t = read_time(); + uint32_t t = read_time(); nops[i] = read_time() - t; } qsort( nops, 10000, sizeof(uint16_t), cmp_nop ); @@ -164,6 +164,7 @@ if( k < j ) continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, +#if HAVE_MMX b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" : b->cpu&X264_CPU_AVX2 ? "avx2" : b->cpu&X264_CPU_FMA3 ? "fma3" : @@ -176,21 +177,30 @@ /* print sse2slow only if there's also a sse2fast version of the same func */ b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" : b->cpu&X264_CPU_SSE2 ? "sse2" : + b->cpu&X264_CPU_SSE ? "sse" : b->cpu&X264_CPU_MMX ? "mmx" : +#elif ARCH_PPC b->cpu&X264_CPU_ALTIVEC ? "altivec" : +#elif ARCH_ARM b->cpu&X264_CPU_NEON ? "neon" : - b->cpu&X264_CPU_ARMV6 ? "armv6" : "c", + b->cpu&X264_CPU_ARMV6 ? "armv6" : +#endif + "c", +#if HAVE_MMX b->cpu&X264_CPU_CACHELINE_32 ? "_c32" : + b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" : b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : - b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" : + b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" : b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : b->cpu&X264_CPU_LZCNT ? "_lzcnt" : b->cpu&X264_CPU_BMI2 ? "_bmi2" : - b->cpu&X264_CPU_TBM ? "_tbm" : b->cpu&X264_CPU_BMI1 ? "_bmi1" : - b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" : - b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "", + b->cpu&X264_CPU_SLOW_ATOM ? "_atom" : +#elif ARCH_ARM + b->cpu&X264_CPU_FAST_NEON_MRC ? 
"_fast_mrc" : +#endif + "", ((int64_t)10*b->cycles/b->den - nop_time)/4 ); } } @@ -231,7 +241,7 @@ #define call_bench(func,cpu,...)\ if( do_bench && !strncmp(func_name, bench_pattern, bench_pattern_len) )\ {\ - uint32_t tsum = 0;\ + uint64_t tsum = 0;\ int tcount = 0;\ call_a1(func, __VA_ARGS__);\ for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\ @@ -242,7 +252,7 @@ func(__VA_ARGS__);\ func(__VA_ARGS__);\ t = read_time() - t;\ - if( t*tcount <= tsum*4 && ti > 0 )\ + if( (uint64_t)t*tcount <= tsum*4 && ti > 0 )\ {\ tsum += t;\ tcount++;\ @@ -299,7 +309,7 @@ #define TEST_PIXEL( name, align ) \ ok = 1, used_asm = 0; \ - for( int i = 0; i < 8; i++ ) \ + for( int i = 0; i < ARRAY_ELEMS(pixel_c.name); i++ ) \ { \ int res_c, res_asm; \ if( pixel_asm.name[i] != pixel_ref.name[i] ) \ @@ -337,11 +347,49 @@ TEST_PIXEL( satd, 0 ); TEST_PIXEL( sa8d, 1 ); + ok = 1, used_asm = 0; + if( pixel_asm.sa8d_satd[PIXEL_16x16] != pixel_ref.sa8d_satd[PIXEL_16x16] ) + { + set_func_name( "sa8d_satd_%s", pixel_names[PIXEL_16x16] ); + used_asm = 1; + for( int j = 0; j < 64; j++ ) + { + uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 ); + uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 ); + uint64_t res_a = call_a( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 ); + uint32_t cost8_a = res_a; + uint32_t cost4_a = res_a >> 32; + if( cost8_a != cost8_c || cost4_a != cost4_c ) + { + ok = 0; + fprintf( stderr, "sa8d_satd [%d]: (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16, + cost8_c, cost4_c, cost8_a, cost4_a ); + break; + } + } + for( int j = 0; j < 0x1000 && ok; j += 256 ) \ + { + uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); + uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); + uint64_t res_a = pixel_asm.sa8d_satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); + uint32_t cost8_a = res_a; + uint32_t cost4_a = res_a >> 32; + if( cost8_a != cost8_c || cost4_a != cost4_c ) + { + ok = 0; + fprintf( stderr, "sa8d_satd [%d]: overflow (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16, + cost8_c, cost4_c, cost8_a, cost4_a ); + } + } + } + report( "pixel sa8d_satd :" ); + #define TEST_PIXEL_X( N ) \ ok = 1; used_asm = 0; \ for( int i = 0; i < 7; i++ ) \ { \ - int res_c[4]={0}, res_asm[4]={0}; \ + ALIGNED_16( int res_c[4] ) = {0}; \ + ALIGNED_16( int res_asm[4] ) = {0}; \ if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \ { \ set_func_name( "sad_x%d_%s", N, pixel_names[i] ); \ @@ -494,7 +542,8 @@ #define TEST_INTRA_X3( name, i8x8, ... ) \ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ { \ - int res_c[3], res_asm[3]; \ + ALIGNED_16( int res_c[3] ); \ + ALIGNED_16( int res_asm[3] ); \ set_func_name( #name ); \ used_asm = 1; \ call_c( pixel_c.name, pbuf1+48, i8x8 ? 
edge : pbuf3+48, res_c ); \ @@ -696,8 +745,8 @@ { ALIGNED_16( uint16_t sums[72] ); ALIGNED_16( int dc[4] ); - ALIGNED_16( int16_t mvs_a[32] ); - ALIGNED_16( int16_t mvs_c[32] ); + ALIGNED_16( int16_t mvs_a[48] ); + ALIGNED_16( int16_t mvs_c[48] ); int mvn_a, mvn_c; int thresh = rand() & 0x3fff; set_func_name( "esa_ads" ); @@ -732,10 +781,10 @@ x264_dct_function_t dct_asm; x264_quant_function_t qf; int ret = 0, ok, used_asm, interlace = 0; - ALIGNED_16( dctcoef dct1[16][16] ); - ALIGNED_16( dctcoef dct2[16][16] ); - ALIGNED_16( dctcoef dct4[16][16] ); - ALIGNED_16( dctcoef dct8[4][64] ); + ALIGNED_ARRAY_N( dctcoef, dct1, [16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct2, [16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4, [16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct8, [4],[64] ); ALIGNED_16( dctcoef dctdc[2][8] ); x264_t h_buf; x264_t *h = &h_buf; @@ -1030,7 +1079,7 @@ call_a( zigzag_asm[interlace].name, t2, dct, buf4 ); \ if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( buf3, buf4, 10 ) ) \ { \ - ok = 0; \ + ok = 0; printf("%d: %d %d %d %d\n%d %d %d %d\n\n",memcmp( t1, t2, size*sizeof(dctcoef) ),buf3[0], buf3[1], buf3[8], buf3[9], buf4[0], buf4[1], buf4[8], buf4[9]);break;\ } \ } \ } @@ -1040,13 +1089,13 @@ x264_zigzag_init( cpu_new, &zigzag_asm[0], &zigzag_asm[1] ); ok = 1; used_asm = 0; - TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0], 64 ); + TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct8[0], 64 ); report( "zigzag_interleave :" ); for( interlace = 0; interlace <= 1; interlace++ ) { ok = 1; used_asm = 0; - TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, dct1[0], 8 ); + TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, dct8[0], 8 ); TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 4 ); TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 ); TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 ); @@ -1073,9 +1122,9 @@ int ret = 0, ok, used_asm; - x264_mc_init( 0, &mc_c ); - x264_mc_init( cpu_ref, &mc_ref ); - x264_mc_init( cpu_new, &mc_a ); + x264_mc_init( 0, &mc_c, 0 ); + x264_mc_init( cpu_ref, &mc_ref, 0 ); + x264_mc_init( cpu_new, &mc_a, 0 ); x264_pixel_init( 0, &pixf ); #define MC_TEST_LUMA( w, h ) \ @@ -1227,8 +1276,12 @@ fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \ break; \ } \ - call_c2( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \ - call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \ + /* omit unlikely high scales for benchmarking */ \ + if( (s << (8-d)) < 512 ) \ + { \ + call_c2( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \ + call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \ + } \ } \ } @@ -1437,23 +1490,24 @@ pixel *dsta[4] = { pbuf4, pbuf4+1024, pbuf4+2048, pbuf4+3072 }; set_func_name( "lowres_init" ); ok = 1; used_asm = 1; - for( int w = 40; w <= 48; w += 8 ) + for( int w = 96; w <= 96+24; w += 8 ) { - intptr_t stride = (w+8)&~15; - call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], (intptr_t)w*2, stride, w, 16 ); - call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], (intptr_t)w*2, stride, w, 16 ); - for( int i = 0; i < 16; i++ ) + intptr_t stride = (w*2+31)&~31; + intptr_t stride_lowres = (w+31)&~31; + call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], stride, stride_lowres, w, 8 ); + call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], stride, stride_lowres, w, 8 
); + for( int i = 0; i < 8; i++ ) { for( int j = 0; j < 4; j++ ) - if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w * sizeof(pixel) ) ) + if( memcmp( dstc[j]+i*stride_lowres, dsta[j]+i*stride_lowres, w * sizeof(pixel) ) ) { ok = 0; fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i ); for( int k = 0; k < w; k++ ) - printf( "%d ", dstc[j][k+i*stride] ); + printf( "%d ", dstc[j][k+i*stride_lowres] ); printf( "\n" ); for( int k = 0; k < w; k++ ) - printf( "%d ", dsta[j][k+i*stride] ); + printf( "%d ", dsta[j][k+i*stride_lowres] ); printf( "\n" ); break; } @@ -1465,7 +1519,7 @@ #define INTEGRAL_INIT( name, size, ... )\ if( mc_a.name != mc_ref.name )\ {\ - intptr_t stride = 80;\ + intptr_t stride = 96;\ set_func_name( #name );\ used_asm = 1;\ memcpy( buf3, buf1, size*2*stride );\ @@ -1637,8 +1691,8 @@ ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] ); ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] ); ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] ); - ALIGNED_ARRAY_16( uint8_t, bs, [2],[2][8][4] ); - memset( bs, 99, sizeof(bs) ); + ALIGNED_ARRAY_N( uint8_t, bs, [2],[2][8][4] ); + memset( bs, 99, sizeof(uint8_t)*2*4*8*2 ); for( int j = 0; j < X264_SCAN8_SIZE; j++ ) nnz[j] = ((rand()&7) == 7) * rand() & 0xf; for( int j = 0; j < 2; j++ ) @@ -1651,7 +1705,7 @@ set_func_name( "deblock_strength" ); call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) ); call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) ); - if( memcmp( bs[0], bs[1], sizeof(bs[0]) ) ) + if( memcmp( bs[0], bs[1], sizeof(uint8_t)*2*4*8 ) ) { ok = 0; fprintf( stderr, "deblock_strength: [FAILED]\n" ); @@ -1681,11 +1735,11 @@ x264_quant_function_t qf_c; x264_quant_function_t qf_ref; x264_quant_function_t qf_a; - ALIGNED_16( dctcoef dct1[64] ); - ALIGNED_16( dctcoef dct2[64] ); - ALIGNED_16( dctcoef dct3[8][16] ); - ALIGNED_16( dctcoef dct4[8][16] ); - ALIGNED_16( uint8_t cqm_buf[64] ); + ALIGNED_ARRAY_N( dctcoef, dct1,[64] ); + ALIGNED_ARRAY_N( dctcoef, dct2,[64] ); + ALIGNED_ARRAY_N( dctcoef, dct3,[8],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4,[8],[16] ); + ALIGNED_ARRAY_N( uint8_t, cqm_buf,[64] ); int ret = 0, ok, used_asm; int oks[3] = {1,1,1}, used_asms[3] = {0,0,0}; x264_t h_buf; @@ -1731,23 +1785,23 @@ x264_quant_init( h, cpu_ref, &qf_ref ); x264_quant_init( h, cpu_new, &qf_a ); -#define INIT_QUANT8(j) \ +#define INIT_QUANT8(j,max) \ { \ static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \ - for( int i = 0; i < 64; i++ ) \ + for( int i = 0; i < max; i++ ) \ { \ - unsigned int scale = (255*scale1d[i>>3]*scale1d[i&7])/16; \ - dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \ + unsigned int scale = (255*scale1d[(i>>3)&7]*scale1d[i&7])/16; \ + dct1[i] = dct2[i] = (j>>(i>>6))&1 ? (rand()%(2*scale+1))-scale : 0; \ } \ } -#define INIT_QUANT4(j) \ +#define INIT_QUANT4(j,max) \ { \ static const int scale1d[4] = {4,6,4,6}; \ - for( int i = 0; i < 16; i++ ) \ + for( int i = 0; i < max; i++ ) \ { \ - unsigned int scale = 255*scale1d[i>>2]*scale1d[i&3]; \ - dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \ + unsigned int scale = 255*scale1d[(i>>2)&3]*scale1d[i&3]; \ + dct1[i] = dct2[i] = (j>>(i>>4))&1 ? 
(rand()%(2*scale+1))-scale : 0; \ } \ } @@ -1777,34 +1831,36 @@ } \ } -#define TEST_QUANT( qname, block, w ) \ +#define TEST_QUANT( qname, block, type, w, maxj ) \ if( qf_a.qname != qf_ref.qname ) \ { \ set_func_name( #qname ); \ used_asms[0] = 1; \ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ - for( int j = 0; j < 2; j++ ) \ + for( int j = 0; j < maxj; j++ ) \ { \ - INIT_QUANT##w(j) \ - int result_c = call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - int result_a = call_a1( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + INIT_QUANT##type(j, w*w) \ + int result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \ + int result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \ if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) || result_c != result_a ) \ { \ oks[0] = 0; \ fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ break; \ } \ - call_c2( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - call_a2( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + call_c2( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \ + call_a2( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \ } \ } \ } - TEST_QUANT( quant_8x8, CQM_8IY, 8 ); - TEST_QUANT( quant_8x8, CQM_8PY, 8 ); - TEST_QUANT( quant_4x4, CQM_4IY, 4 ); - TEST_QUANT( quant_4x4, CQM_4PY, 4 ); + TEST_QUANT( quant_8x8, CQM_8IY, 8, 8, 2 ); + TEST_QUANT( quant_8x8, CQM_8PY, 8, 8, 2 ); + TEST_QUANT( quant_4x4, CQM_4IY, 4, 4, 2 ); + TEST_QUANT( quant_4x4, CQM_4PY, 4, 4, 2 ); + TEST_QUANT( quant_4x4x4, CQM_4IY, 4, 8, 16 ); + TEST_QUANT( quant_4x4x4, CQM_4PY, 4, 8, 16 ); TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] ); TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] ); @@ -1815,7 +1871,7 @@ used_asms[1] = 1; \ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ - INIT_QUANT##w(1) \ + INIT_QUANT##w(1, w*w) \ qf_c.qname( dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ @@ -2092,7 +2148,7 @@ int ret = 0, ok = 1, used_asm = 0; ALIGNED_ARRAY_32( pixel, edge,[36] ); ALIGNED_ARRAY_32( pixel, edge2,[36] ); - ALIGNED_16( pixel fdec[FDEC_STRIDE*20] ); + ALIGNED_ARRAY_32( pixel, fdec,[FDEC_STRIDE*20] ); struct { x264_predict_t predict_16x16[4+3]; @@ -2263,13 +2319,99 @@ #define run_cabac_terminal_asm run_cabac_terminal_c #endif +extern const uint8_t x264_count_cat_m1[14]; +void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); +void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); +void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); + static int check_cabac( int cpu_ref, int cpu_new ) { - int ret = 0, ok, used_asm = 1; + int ret = 0, ok = 1, used_asm = 0; x264_t h; h.sps->i_chroma_format_idc = 3; + + x264_bitstream_function_t bs_ref; + x264_bitstream_function_t bs_a; + x264_bitstream_init( cpu_ref, &bs_ref ); + x264_bitstream_init( cpu_new, &bs_a ); + x264_quant_init( &h, cpu_new, &h.quantf ); + h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4; + +#define 
CABAC_RESIDUAL(name, start, end, rd)\ +{\ + if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\ + {\ + used_asm = 1;\ + set_func_name( #name );\ + for( int i = 0; i < 2; i++ )\ + {\ + for( intptr_t ctx_block_cat = start; ctx_block_cat <= end; ctx_block_cat++ )\ + {\ + for( int j = 0; j < 256; j++ )\ + {\ + ALIGNED_ARRAY_N( dctcoef, dct, [2],[64] );\ + uint8_t bitstream[2][1<<16];\ + static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\ + int ac = ctx_ac[ctx_block_cat];\ + int nz = 0;\ + while( !nz )\ + {\ + for( int k = 0; k <= x264_count_cat_m1[ctx_block_cat]; k++ )\ + {\ + /* Very rough distribution that covers possible inputs */\ + int rnd = rand();\ + int coef = !(rnd&3);\ + coef += !(rnd& 15) * (rand()&0x0006);\ + coef += !(rnd& 63) * (rand()&0x0008);\ + coef += !(rnd& 255) * (rand()&0x00F0);\ + coef += !(rnd&1023) * (rand()&0x7F00);\ + nz |= dct[0][ac+k] = dct[1][ac+k] = coef * ((rand()&1) ? 1 : -1);\ + }\ + }\ + h.mb.b_interlaced = i;\ + x264_cabac_t cb[2];\ + x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\ + x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\ + x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\ + x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\ + cb[0].f8_bits_encoded = 0;\ + cb[1].f8_bits_encoded = 0;\ + if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\ + call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\ + call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\ + ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\ + if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\ + if( !ok )\ + {\ + fprintf( stderr, #name " : [FAILED] ctx_block_cat %d", (int)ctx_block_cat );\ + if( rd && cb[0].f8_bits_encoded != cb[1].f8_bits_encoded )\ + fprintf( stderr, " (%d != %d)", cb[0].f8_bits_encoded, cb[1].f8_bits_encoded );\ + fprintf( stderr, "\n");\ + goto name##fail;\ + }\ + if( (j&15) == 0 )\ + {\ + call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\ + call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\ + }\ + }\ + }\ + }\ + }\ +}\ +name##fail: + + CABAC_RESIDUAL( cabac_block_residual, 0, DCT_LUMA_8x8, 0 ) + report( "cabac residual:" ); + + ok = 1; used_asm = 0; + CABAC_RESIDUAL( cabac_block_residual_rd, 0, DCT_LUMA_8x8-1, 1 ) + CABAC_RESIDUAL( cabac_block_residual_8x8_rd, DCT_LUMA_8x8, DCT_LUMA_8x8, 1 ) + report( "cabac residual rd:" ); + if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm ) - return 0; + return ret; + ok = 1; used_asm = 0; x264_cabac_init( &h ); set_func_name( "cabac_encode_decision" ); @@ -2394,18 +2536,18 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; } + if( x264_cpu_detect() & X264_CPU_SSE ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" ); if( x264_cpu_detect() & X264_CPU_SSE2 ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" ); cpu1 &= ~X264_CPU_CACHELINE_64; - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" ); - cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST; + ret |= 
add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" ); + cpu1 &= ~X264_CPU_SLOW_SHUFFLE; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSE2 SlowAtom" ); - cpu1 &= ~X264_CPU_SLOW_ATOM; } if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN ) { @@ -2427,15 +2569,17 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" ); cpu1 &= ~X264_CPU_CACHELINE_64; - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" ); - cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST; + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" ); + cpu1 &= ~X264_CPU_SLOW_SHUFFLE; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" ); + cpu1 &= ~X264_CPU_CACHELINE_64; cpu1 &= ~X264_CPU_SLOW_ATOM; } if( x264_cpu_detect() & X264_CPU_SSE4 ) - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4 | X264_CPU_SHUFFLE_IS_FAST, "SSE4" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" ); if( x264_cpu_detect() & X264_CPU_AVX ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" ); if( x264_cpu_detect() & X264_CPU_XOP ) @@ -2448,20 +2592,22 @@ if( x264_cpu_detect() & X264_CPU_BMI1 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); - if( x264_cpu_detect() & X264_CPU_TBM ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_TBM, "TBM" ); - cpu1 &= ~X264_CPU_TBM; - } - if( x264_cpu_detect() & X264_CPU_BMI2 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" ); - cpu1 &= ~X264_CPU_BMI2; - } cpu1 &= ~X264_CPU_BMI1; } if( x264_cpu_detect() & X264_CPU_AVX2 ) + { ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" ); + if( x264_cpu_detect() & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } + } + if( x264_cpu_detect() & X264_CPU_BMI2 ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" ); + cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2); + } if( x264_cpu_detect() & X264_CPU_FMA3 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); @@ -2508,8 +2654,8 @@ fprintf( stderr, "x264: using random seed %u\n", seed ); srand( seed ); - buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 16*BENCH_ALIGNS ); - pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 16*BENCH_ALIGNS ); + buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 32*BENCH_ALIGNS ); + pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 32*BENCH_ALIGNS ); if( !buf1 || !pbuf1 ) { fprintf( stderr, "malloc failed, unable to initiate tests!\n" ); @@ -2530,19 +2676,19 @@ } memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) ); - /* 16-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */ + /* 32-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */ if( do_bench ) for( int i = 0; i < BENCH_ALIGNS && !ret; i++ ) { INIT_POINTER_OFFSETS; - ret |= x264_stack_pagealign( check_all_flags, i*16 ); - buf1 += 16; - pbuf1 += 16; + ret |= x264_stack_pagealign( check_all_flags, i*32 ); + buf1 += 32; + pbuf1 += 32; quiet = 1; fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS ); } else - ret = check_all_flags(); + ret = x264_stack_pagealign( check_all_flags, 0 ); if( ret ) {
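The testsuite hunks above widen the guaranteed buffer alignment from 16 to 32 bytes and step the benchmark pointers in 32-byte increments, so AVX2 code paths are also timed at offsets that differ modulo 64 bytes and the cacheline. A small stand-alone sketch of that stepping idea in plain C; kernel(), the buffer sizes and the BENCH_ALIGNS value here are placeholders, not x264 code:

    #include <stdint.h>
    #include <stdlib.h>

    /* Placeholder for the routine being benchmarked; not an x264 function. */
    static void kernel( uint8_t *p, size_t n )
    {
        for( size_t i = 0; i < n; i++ )
            p[i] ^= 0x55;
    }

    int main( void )
    {
        enum { BENCH_ALIGNS = 16, BUF = 4096 };
        /* 32-byte aligned base, padded so every offset below stays in bounds
         * (aligned_alloc needs a size that is a multiple of the alignment). */
        uint8_t *base = aligned_alloc( 32, BUF + 32 * BENCH_ALIGNS );
        if( !base )
            return 1;
        /* Stepping by 32 keeps the pointer suitable for 256-bit loads while
         * still varying its position modulo 64 bytes, mirroring the i*32
         * offsets applied to buf1/pbuf1 in the hunk above. */
        for( int i = 0; i < BENCH_ALIGNS; i++ )
            kernel( base + 32 * i, BUF );
        free( base );
        return 0;
    }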
View file
x264-snapshot-20130723-2245.tar.bz2/tools/cltostr.pl
Added
@@ -0,0 +1,65 @@ +# Perl script used for compiling OpenCL src into x264 binary +# +# Copyright (C) 2013 x264 project +# Authors: Steve Borho <sborho@multicorewareinc.com> + +use Digest::MD5 qw(md5_hex); + +# xxd takes a VAR, which will be the variable name +# and BYTES, a string of bytes to beencoded. +sub xxd +{ + my %args = @_; + my $var = $args{VAR}; + my $bytes = $args{BYTES}; + my @hexbytes; + my @bytes = split //, $$bytes; + foreach $b (@bytes) + { + push @hexbytes, sprintf("0x%02X", ord($b)); + } + + # Format 'em nice and pretty-like. + print 'static const char ' . $var . '[] = {' . "\n"; + my $count = 0; + foreach my $h (@hexbytes) + { + print "$h, "; + $count++; + if ($count == 16) + { + print "\n"; + $count = 0; + } + } + print "\n0x00 };\n\n"; + + return; +} + +if (@ARGV < 1) +{ + printf "%s: VARNAME ", $0 . "\n"; + exit(-1); +} + + +my @lines; +while(<STDIN>) +{ + s/^\s+//; # trim leading whitespace + if (/^\/\//) + { + next; # skip the line if it starts with '//' + } + push @lines, $_; +} + +my $lines = join '', @lines; +xxd(VAR => @ARGV[0], BYTES => \$lines); + +my $hash = md5_hex($lines); +@hash = ( $hash =~ m/../g ); + + +xxd(VAR => @ARGV[0] . "_hash", BYTES => \$hash);
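The added cltostr.pl reads OpenCL kernel source on stdin, emits it as a NUL-terminated C byte array under the variable name given on the command line, and appends a second array holding the ASCII MD5 digest of that source, so the kernels can be embedded in the binary and, presumably, a stale compiled-kernel cache (see --opencl-clbin below) can be detected. A rough illustration of the emitted shape and of the fact that it compiles as ordinary C; the identifiers, the placeholder bytes and the oclobj.h file name are assumptions, not taken from the tree:

    #include <stdio.h>

    /* Illustrative shape of a header generated with something like:
     *   perl tools/cltostr.pl x264_opencl_source < kernels.cl > oclobj.h
     * The bytes below spell "kernel src"; real output holds the whole .cl file. */
    static const char x264_opencl_source[] = {
        0x6B, 0x65, 0x72, 0x6E, 0x65, 0x6C, 0x20, 0x73, 0x72, 0x63,
        0x00 };

    /* ASCII hex digits of the MD5 digest of the source (placeholder digits here). */
    static const char x264_opencl_source_hash[] = {
        0x64, 0x34, 0x31, 0x64, 0x38, 0x63, 0x64, 0x39,
        0x00 };

    int main( void )
    {
        /* The trailing 0x00 appended by the script lets the array be used as a
         * plain C string, e.g. handed to clCreateProgramWithSource() later on. */
        printf( "%zu bytes of embedded kernel source\n",
                sizeof(x264_opencl_source) - 1 );
        printf( "source hash: %s\n", x264_opencl_source_hash );
        return 0;
    }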
View file
x264-snapshot-20130224-2245.tar.bz2/x264.c -> x264-snapshot-20130723-2245.tar.bz2/x264.c
Changed
@@ -225,7 +225,7 @@ va_end( arg ); } -static void print_version_info() +static void print_version_info( void ) { #ifdef X264_POINTVER printf( "x264 "X264_POINTVER"\n" ); @@ -596,8 +596,11 @@ H2( " --slices <integer> Number of slices per frame; forces rectangular\n" " slices and is overridden by other slicing options\n" ); else H1( " --slices <integer> Number of slices per frame\n" ); + H2( " --slices-max <integer> Absolute maximum slices per frame; overrides\n" + " slice-max-size/slice-max-mbs when necessary\n" ); H2( " --slice-max-size <integer> Limit the size of each slice in bytes\n"); - H2( " --slice-max-mbs <integer> Limit the size of each slice in macroblocks\n"); + H2( " --slice-max-mbs <integer> Limit the size of each slice in macroblocks (max)\n"); + H2( " --slice-min-mbs <integer> Limit the size of each slice in macroblocks (min)\n"); H0( " --tff Enable interlaced mode (top field first)\n" ); H0( " --bff Enable interlaced mode (bottom field first)\n" ); H2( " --constrained-intra Enable constrained intra prediction.\n" ); @@ -743,16 +746,18 @@ H2( " --range <string> Specify color range [\"%s\"]\n" " - %s\n", range_names[0], stringify_names( buf, range_names ) ); H2( " --colorprim <string> Specify color primaries [\"%s\"]\n" - " - undef, bt709, bt470m, bt470bg\n" - " smpte170m, smpte240m, film\n", + " - undef, bt709, bt470m, bt470bg, smpte170m,\n" + " smpte240m, film, bt2020\n", strtable_lookup( x264_colorprim_names, defaults->vui.i_colorprim ) ); H2( " --transfer <string> Specify transfer characteristics [\"%s\"]\n" - " - undef, bt709, bt470m, bt470bg, linear,\n" - " log100, log316, smpte170m, smpte240m\n", + " - undef, bt709, bt470m, bt470bg, smpte170m,\n" + " smpte240m, linear, log100, log316,\n" + " iec61966-2-4, bt1361e, iec61966-2-1,\n" + " bt2020-10, bt2020-12\n", strtable_lookup( x264_transfer_names, defaults->vui.i_transfer ) ); H2( " --colormatrix <string> Specify color matrix setting [\"%s\"]\n" - " - undef, bt709, fcc, bt470bg\n" - " smpte170m, smpte240m, GBR, YCgCo\n", + " - undef, bt709, fcc, bt470bg, smpte170m,\n" + " smpte240m, GBR, YCgCo, bt2020nc, bt2020c\n", strtable_lookup( x264_colmatrix_names, defaults->vui.i_colmatrix ) ); H2( " --chromaloc <integer> Specify chroma sample location (0 to 5) [%d]\n", defaults->vui.i_chroma_loc ); @@ -787,6 +792,8 @@ H0( " --frames <integer> Maximum number of frames to encode\n" ); H0( " --level <string> Specify level (as defined by Annex A)\n" ); H1( " --bluray-compat Enable compatibility hacks for Blu-ray support\n" ); + H1( " --stitchable Don't optimize headers based on video content\n" + " Ensures ability to recombine a segmented encode\n" ); H1( "\n" ); H1( " -v, --verbose Print stats for each frame\n" ); H1( " --no-progress Don't show the progress indicator while encoding\n" ); @@ -806,6 +813,9 @@ " as opposed to letting them select different algorithms\n" ); H2( " --asm <integer> Override CPU detection\n" ); H2( " --no-asm Disable all CPU optimizations\n" ); + H2( " --opencl Enable use of OpenCL\n" ); + H2( " --opencl-clbin <string> Specify path of compiled OpenCL kernel cache\n" ); + H2( " --opencl-device <integer> Specify OpenCL device ordinal\n" ); H2( " --visualize Show MB types overlayed on the encoded video\n" ); H2( " --dump-yuv <string> Save reconstructed frames\n" ); H2( " --sps-id <integer> Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id ); @@ -910,6 +920,9 @@ { "ref", required_argument, NULL, 'r' }, { "asm", required_argument, NULL, 0 }, { "no-asm", no_argument, NULL, 0 }, + { "opencl", no_argument, 
NULL, 1 }, + { "opencl-clbin",required_argument, NULL, 0 }, + { "opencl-device",required_argument, NULL, 0 }, { "sar", required_argument, NULL, 0 }, { "fps", required_argument, NULL, OPT_FPS }, { "frames", required_argument, NULL, OPT_FRAMES }, @@ -971,7 +984,9 @@ { "no-sliced-threads", no_argument, NULL, 0 }, { "slice-max-size", required_argument, NULL, 0 }, { "slice-max-mbs", required_argument, NULL, 0 }, + { "slice-min-mbs", required_argument, NULL, 0 }, { "slices", required_argument, NULL, 0 }, + { "slices-max", required_argument, NULL, 0 }, { "thread-input", no_argument, NULL, OPT_THREAD_INPUT }, { "sync-lookahead", required_argument, NULL, 0 }, { "non-deterministic", no_argument, NULL, 0 }, @@ -1025,6 +1040,7 @@ { "dts-compress", no_argument, NULL, OPT_DTS_COMPRESSION }, { "output-csp", required_argument, NULL, OPT_OUTPUT_CSP }, { "input-range", required_argument, NULL, OPT_INPUT_RANGE }, + { "stitchable", no_argument, NULL, 0 }, {0, 0, 0, 0} };
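Taken together, the x264.c hunk above wires up the new OpenCL, slicing, stitching and BT.2020 color options on the command line. An illustrative invocation, assuming the options behave as their help text describes (the file names are placeholders):

    x264 --opencl --stitchable --slices-max 8 --slice-max-mbs 250 --slice-min-mbs 25 --colorprim bt2020 -o segment.264 segment.y4m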
View file
x264-snapshot-20130224-2245.tar.bz2/x264.h -> x264-snapshot-20130723-2245.tar.bz2/x264.h
Changed
@@ -28,7 +28,7 @@ #ifndef X264_X264_H #define X264_X264_H -#if !defined(_STDINT_H) && !defined(_STDINT_H_) && \ +#if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) &&\ !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) # ifdef _MSC_VER # pragma message("You must include stdint.h or inttypes.h before x264.h") @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 129 +#define X264_BUILD 135 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -109,43 +109,53 @@ /**************************************************************************** * Encoder parameters ****************************************************************************/ -/* CPU flags - */ -#define X264_CPU_CACHELINE_32 0x0000001 /* avoid memory loads that span the border between two cachelines */ -#define X264_CPU_CACHELINE_64 0x0000002 /* 32/64 is the size of a cacheline in bytes */ -#define X264_CPU_ALTIVEC 0x0000004 -#define X264_CPU_MMX 0x0000008 -#define X264_CPU_MMX2 0x0000010 /* MMX2 aka MMXEXT aka ISSE */ +/* CPU flags */ + +/* x86 */ +#define X264_CPU_CMOV 0x0000001 +#define X264_CPU_MMX 0x0000002 +#define X264_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */ #define X264_CPU_MMXEXT X264_CPU_MMX2 -#define X264_CPU_SSE 0x0000020 -#define X264_CPU_SSE2 0x0000040 -#define X264_CPU_SSE2_IS_SLOW 0x0000080 /* avoid most SSE2 functions on Athlon64 */ -#define X264_CPU_SSE2_IS_FAST 0x0000100 /* a few functions are only faster on Core2 and Phenom */ -#define X264_CPU_SSE3 0x0000200 -#define X264_CPU_SSSE3 0x0000400 -#define X264_CPU_SHUFFLE_IS_FAST 0x0000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */ -#define X264_CPU_STACK_MOD4 0x0001000 /* if stack is only mod4 and not mod16 */ -#define X264_CPU_SSE4 0x0002000 /* SSE4.1 */ -#define X264_CPU_SSE42 0x0004000 /* SSE4.2 */ -#define X264_CPU_SSE_MISALIGN 0x0008000 /* Phenom support for misaligned SSE instruction arguments */ -#define X264_CPU_LZCNT 0x0010000 /* Phenom support for "leading zero count" instruction. */ -#define X264_CPU_ARMV6 0x0020000 -#define X264_CPU_NEON 0x0040000 /* ARM NEON */ -#define X264_CPU_FAST_NEON_MRC 0x0080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ -#define X264_CPU_SLOW_CTZ 0x0100000 /* BSR/BSF x86 instructions are really slow on some CPUs */ -#define X264_CPU_SLOW_ATOM 0x0200000 /* The Atom just sucks */ -#define X264_CPU_AVX 0x0400000 /* AVX support: requires OS support even if YMM registers - * aren't used. */ -#define X264_CPU_XOP 0x0800000 /* AMD XOP */ -#define X264_CPU_FMA4 0x1000000 /* AMD FMA4 */ -#define X264_CPU_AVX2 0x2000000 /* AVX2 */ -#define X264_CPU_FMA3 0x4000000 /* Intel FMA3 */ -#define X264_CPU_BMI1 0x8000000 /* BMI1 */ -#define X264_CPU_BMI2 0x10000000 /* BMI2 */ -#define X264_CPU_TBM 0x20000000 /* AMD TBM */ - -/* Analyse flags - */ +#define X264_CPU_SSE 0x0000008 +#define X264_CPU_SSE2 0x0000010 +#define X264_CPU_SSE3 0x0000020 +#define X264_CPU_SSSE3 0x0000040 +#define X264_CPU_SSE4 0x0000080 /* SSE4.1 */ +#define X264_CPU_SSE42 0x0000100 /* SSE4.2 */ +#define X264_CPU_SSE_MISALIGN 0x0000200 /* Phenom support for misaligned SSE instruction arguments */ +#define X264_CPU_LZCNT 0x0000400 /* Phenom support for "leading zero count" instruction. */ +#define X264_CPU_AVX 0x0000800 /* AVX support: requires OS support even if YMM registers aren't used. 
*/ +#define X264_CPU_XOP 0x0001000 /* AMD XOP */ +#define X264_CPU_FMA4 0x0002000 /* AMD FMA4 */ +#define X264_CPU_AVX2 0x0004000 /* AVX2 */ +#define X264_CPU_FMA3 0x0008000 /* Intel FMA3 */ +#define X264_CPU_BMI1 0x0010000 /* BMI1 */ +#define X264_CPU_BMI2 0x0020000 /* BMI2 */ +/* x86 modifiers */ +#define X264_CPU_CACHELINE_32 0x0040000 /* avoid memory loads that span the border between two cachelines */ +#define X264_CPU_CACHELINE_64 0x0080000 /* 32/64 is the size of a cacheline in bytes */ +#define X264_CPU_SSE2_IS_SLOW 0x0100000 /* avoid most SSE2 functions on Athlon64 */ +#define X264_CPU_SSE2_IS_FAST 0x0200000 /* a few functions are only faster on Core2 and Phenom */ +#define X264_CPU_SLOW_SHUFFLE 0x0400000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */ +#define X264_CPU_STACK_MOD4 0x0800000 /* if stack is only mod4 and not mod16 */ +#define X264_CPU_SLOW_CTZ 0x1000000 /* BSR/BSF x86 instructions are really slow on some CPUs */ +#define X264_CPU_SLOW_ATOM 0x2000000 /* The Atom is terrible: slow SSE unaligned loads, slow + * SIMD multiplies, slow SIMD variable shifts, slow pshufb, + * cacheline split penalties -- gather everything here that + * isn't shared by other CPUs to avoid making half a dozen + * new SLOW flags. */ +#define X264_CPU_SLOW_PSHUFB 0x4000000 /* such as on the Intel Atom */ +#define X264_CPU_SLOW_PALIGNR 0x8000000 /* such as on the AMD Bobcat */ + +/* PowerPC */ +#define X264_CPU_ALTIVEC 0x0000001 + +/* ARM */ +#define X264_CPU_ARMV6 0x0000001 +#define X264_CPU_NEON 0x0000002 /* ARM NEON */ +#define X264_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ + +/* Analyse flags */ #define X264_ANALYSE_I4x4 0x0001 /* Analyse i4x4 */ #define X264_ANALYSE_I8x8 0x0002 /* Analyse i8x8 (requires 8x8 transform) */ #define X264_ANALYSE_PSUB16x16 0x0010 /* Analyse p16x8, p8x16 and p8x8 */ @@ -188,9 +198,10 @@ static const char * const x264_overscan_names[] = { "undef", "show", "crop", 0 }; static const char * const x264_vidformat_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 }; static const char * const x264_fullrange_names[] = { "off", "on", 0 }; -static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", 0 }; -static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316", 0 }; -static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", 0 }; +static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", 0 }; +static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316", + "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", 0 }; +static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", "bt2020nc", "bt2020c", 0 }; static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 }; /* Colorspace type */ @@ -464,10 +475,23 @@ int b_fake_interlaced; + /* Don't optimize header parameters based on video content, e.g. ensure that splitting an input video, compressing + * each part, and stitching them back together will result in identical SPS/PPS. 
This is necessary for stitching + * with container formats that don't allow multiple SPS/PPS. */ + int b_stitchable; + + int b_opencl; /* use OpenCL when available */ + int i_opencl_device; /* specify count of GPU devices to skip, for CLI users */ + void *opencl_device_id; /* pass explicit cl_device_id as void*, for API users */ + char *psz_clbin_file; /* compiled OpenCL kernel cache file */ + /* Slicing parameters */ int i_slice_max_size; /* Max size per slice in bytes; includes estimated NAL overhead. */ int i_slice_max_mbs; /* Max number of MBs per slice; overrides i_slice_count. */ + int i_slice_min_mbs; /* Min number of MBs per slice */ int i_slice_count; /* Number of slices per frame: forces rectangular slices. */ + int i_slice_count_max; /* Absolute cap on slices per frame; stops applying slice-max-size + * and slice-max-mbs if this is reached. */ /* Optional callback for freeing this x264_param_t when it is done being used. * Only used when the x264_param_t sits in memory for an indefinite period of time, @@ -481,7 +505,7 @@ * is done encoding. * * This callback MUST do the following in order to work correctly: - * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 16. + * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 64. * 2) Call x264_nal_encode( h, dst, nal ), where dst is the output buffer. * After these steps, the content of nal is valid and can be used in the same way as if * the NAL unit were output by x264_encoder_encode. @@ -834,7 +858,13 @@ * due to delay, this may not be the next frame passed to encoder_encode. * if the change should apply to some particular frame, use x264_picture_t->param instead. * returns 0 on success, negative on parameter validation error. - * not all parameters can be changed; see the actual function for a detailed breakdown. */ + * not all parameters can be changed; see the actual function for a detailed breakdown. + * + * since not all parameters can be changed, moving from preset to preset may not always + * fully copy all relevant parameters, but should still work usably in practice. however, + * more so than for other presets, many of the speed shortcuts used in ultrafast cannot be + * switched out of; using reconfig to switch between ultrafast and other presets is not + * recommended without a more fine-grained breakdown of parameters to take this into account. */ int x264_encoder_reconfig( x264_t *, x264_param_t * ); /* x264_encoder_parameters: * copies the current internal set of parameters to the pointer provided
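On the library side the same features surface as new x264_param_t fields (b_opencl, i_opencl_device, psz_clbin_file, b_stitchable, i_slice_min_mbs, i_slice_count_max) together with the renumbered, per-architecture CPU flags, which is why X264_BUILD moves from 129 to 135 in the hunk above. A minimal sketch of driving the new fields through the public API; the preset, profile and numeric values are examples only, and error handling is trimmed:

    #include <stdint.h>
    #include <x264.h>

    /* Minimal sketch: enable OpenCL and the new slicing caps through the
     * fields added in this snapshot.  Frame input and output handling are
     * omitted; only the parameter setup is of interest here. */
    int open_encoder_example( int width, int height, x264_t **out )
    {
        x264_param_t param;
        if( x264_param_default_preset( &param, "medium", NULL ) < 0 )
            return -1;

        param.i_width  = width;
        param.i_height = height;

        param.b_opencl          = 1;   /* use OpenCL when available */
        param.i_opencl_device   = 0;   /* number of GPU devices to skip */
        param.b_stitchable      = 1;   /* keep SPS/PPS segment-independent */
        param.i_slice_max_mbs   = 250;
        param.i_slice_min_mbs   = 25;
        param.i_slice_count_max = 8;   /* absolute cap on slices per frame */

        if( x264_param_apply_profile( &param, "high" ) < 0 )
            return -1;

        *out = x264_encoder_open( &param );
        return *out ? 0 : -1;
    }

As the guard shown in the x264.h hunk indicates, stdint.h (or inttypes.h) still has to be included before x264.h.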