libx264
We truncated the diff of some files because they were too big.
Changes of Revision 4
libx264.changes
Changed
@@ -1,4 +1,9 @@
 -------------------------------------------------------------------
+Wed Jul 24 14:11:22 UTC 2013 - i@margueirte.su
+
+- update version 20130723.
+
+-------------------------------------------------------------------
 Thu Mar 7 08:36:00 UTC+0800 2013 - marguerite@opensuse.org
 
 - fallback to 8-bit depth again.
libx264.spec
Changed
@@ -1,5 +1,6 @@
 # vim: set ts=4 sw=4 et:
 # Copyright (c) 2012 Pascal Bleser <pascal.bleser@opensuse.org>
+# COpyright (c) 2013 Marguerite Su <marguerite@opensuse.org>
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -10,20 +11,19 @@
 # license that conforms to the Open Source Definition (Version 1.9)
 # published by the Open Source Initiative.
 
-# Please submit bugfixes or comments via http://bugs.opensuse.org/
+# Please submit bugfixes or comments via http://bugs.links2linux.org/
 
 Name: libx264
-%define libname %{name}
-%define soname 129
-%define svn 20130224
+%define soname 135
+%define svn 20130723
 Version: 0.%{soname}svn%{svn}
 Release: 1
 License: GPL-2.0+
 Summary: A free h264/avc encoder - encoder binary
 Url: http://developers.videolan.org/x264.html
 Group: Productivity/Multimedia/Video/Editors and Convertors
-Source0: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2
-Patch0: x264-use-shared-library.patch
+Source: ftp://ftp.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-%{svn}-2245.tar.bz2
+Patch: x264-use-shared-library.patch
 BuildRequires: nasm
 BuildRequires: pkg-config
 BuildRequires: yasm >= 1.2.0
@@ -59,11 +59,11 @@
 moment so please use mencoder or another tool that supports x264
 library for all other file types.
 
-%package -n %{libname}-%{soname}
+%package %{soname}
 Summary: A free h264/avc encoder - encoder binary
 Group: Productivity/Multimedia/Video/Editors and Convertors
 
-%description -n %{libname}-%{soname}
+%description %{soname}
 x264 is a free library for encoding next-generation H264/AVC video
 streams. The code is written from scratch by Laurent Aimar, Loren
 Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans
@@ -73,15 +73,14 @@
 development with libx264. This library is needed to build
 mplayer/mencoder with H264 encoding support.
 
-%package -n %{libname}-devel
+%package devel
 Summary: Libraries and include file for the %{name} encoder
 Group: Development/Libraries/C and C++
-Requires: %{buildrequires}
-Requires: %{libname}-%{soname} = %{version}
-Provides: %{name}-devel = %{version}
-Obsoletes: %{name}-devel < %{version}
+Requires: %{name}-%{soname} = %{version}
+Provides: x264-devel = %{version}
+Obsoletes: x264-devel < %{version}
 
-%description -n %{libname}-devel
+%description devel
 x264 is a free library for encoding next-generation H264/AVC video
 streams. The code is written from scratch by Laurent Aimar, Loren
 Merritt, Eric Petit (OS X), Min Chen (vfw/asm), Justin Clay (vfw), Mans
@@ -92,8 +91,8 @@
 mplayer/mencoder with H264 encoding support.
 
 %prep
-%setup -q -n "x264-snapshot-%{svn}-2245"
-%patch0 -p0
+%setup -q -n x264-snapshot-%{svn}-2245
+%patch -p1
 
 FAKE_BUILDDATE=$(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y')
 sed -i "s/__DATE__/\"$FAKE_BUILDDATE\"/" x264.c
 
@@ -104,29 +103,26 @@
 %install
 %makeinstall
 
-rm -f "%{buildroot}%{_libdir}/%{libname}.so"
-rm -f "%{buildroot}%{_libdir}/%{libname}.a"
-ln -s %{libname}.so.%{soname} "%{buildroot}%{_libdir}/%{libname}.so"
+rm -f %{buildroot}%{_libdir}/%{name}.so
+rm -f %{buildroot}%{_libdir}/%{name}.a
+ln -s %{name}.so.%{soname} %{buildroot}%{_libdir}/%{name}.so
 
-rm "%{buildroot}%{_bindir}"/*
+rm %{buildroot}%{_bindir}/*
 
-echo "%{libname}-%{soname}" > %{_sourcedir}/baselibs.conf
+echo "%{name}-%{soname}" > %{_sourcedir}/baselibs.conf
 
-%clean
-%{?buildroot:%__rm -rf "%{buildroot}"}
+%post -n %{name}-%{soname} -p /sbin/ldconfig
+%postun -n %{name}-%{soname} -p /sbin/ldconfig
 
-%post -n %{libname}-%{soname} -p /sbin/ldconfig
-%postun -n %{libname}-%{soname} -p /sbin/ldconfig
-
-%files -n %{libname}-%{soname}
+%files %{soname}
 %defattr(0644,root,root)
-%{_libdir}/%{libname}.so.%{soname}
+%{_libdir}/%{name}.so.%{soname}
 
-%files -n %{libname}-devel
+%files devel
 %defattr(0644,root,root)
 %{_includedir}/x264.h
 %{_includedir}/x264_config.h
 %{_libdir}/pkgconfig/x264.pc
-%{_libdir}/%{libname}.so
+%{_libdir}/%{name}.so
 
 %changelog
x264-use-shared-library.patch
Changed
@@ -1,21 +1,23 @@
---- Makefile.orig	2011-12-26 22:45:03.000000000 +0100
-+++ Makefile	2011-12-27 20:03:46.070404383 +0100
-@@ -152,6 +152,7 @@
+Index: x264-snapshot-20130723-2245/Makefile
+===================================================================
+--- x264-snapshot-20130723-2245.orig/Makefile
++++ x264-snapshot-20130723-2245/Makefile
+@@ -171,6 +171,7 @@ $(LIBX264): $(GENERATED) .depend $(OBJS)
 
- $(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
+ $(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
  	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 +	ln -s $(SONAME) libx264.so
 
  ifneq ($(EXE),)
  .PHONY: x264 checkasm
-@@ -159,8 +160,8 @@
+@@ -178,8 +179,8 @@ x264: x264$(EXE)
  checkasm: checkasm$(EXE)
  endif
 
--x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
+-x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
 -	$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
-+x264$(EXE): .depend $(OBJCLI) $(SONAME)
++x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(SONAME)
 +	$(LD)$@ $(OBJCLI) -L. -lx264 $(LDFLAGSCLI) $(LDFLAGS)
 
- checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+ checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
  	$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
x264-snapshot-20130224-2245.tar.bz2/.gitignore -> x264-snapshot-20130723-2245.tar.bz2/.gitignore
Changed
@@ -43,3 +43,5 @@
 .digress_x264
 dataDec.txt
 log.dec
+common/oclobj.h
+x264_lookahead.clbin
x264-snapshot-20130224-2245.tar.bz2/Makefile -> x264-snapshot-20130723-2245.tar.bz2/Makefile
Changed
@@ -8,6 +8,8 @@
 vpath %.asm $(SRCPATH)
 vpath %.rc $(SRCPATH)
 
+GENERATED =
+
 all: default
 default:
 
@@ -145,6 +147,13 @@
 endif
 endif
 
+ifeq ($(HAVE_OPENCL),yes)
+common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
+	cat $^ | perl $(SRCPATH)/tools/cltostr.pl x264_opencl_source > $@
+GENERATED += common/oclobj.h
+SRCS += common/opencl.c encoder/slicetype-cl.c
+endif
+
 OBJS += $(SRCS:%.c=%.o)
 OBJCLI += $(SRCCLI:%.c=%.o)
 OBJSO += $(SRCSO:%.c=%.o)
 
@@ -155,12 +164,12 @@
 lib-static: $(LIBX264)
 lib-shared: $(SONAME)
 
-$(LIBX264): .depend $(OBJS) $(OBJASM)
+$(LIBX264): $(GENERATED) .depend $(OBJS) $(OBJASM)
 	rm -f $(LIBX264)
 	$(AR)$@ $(OBJS) $(OBJASM)
 	$(if $(RANLIB), $(RANLIB) $@)
 
-$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
+$(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
 	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 
 ifneq ($(EXE),)
@@ -169,10 +178,10 @@
 checkasm: checkasm$(EXE)
 endif
 
-x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
+x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
 	$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
 
-checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
 	$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
 
 $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
 
@@ -231,7 +240,7 @@
 clean:
 	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
-	rm -f checkasm checkasm.exe $(OBJCHK)
+	rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
 	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
 
 distclean: clean
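Note on the HAVE_OPENCL rule above: tools/cltostr.pl serializes the concatenated .cl kernels into common/oclobj.h as an embedded source string named by its argument (x264_opencl_source), which common/opencl.c later compiles at runtime. Below is a minimal C sketch of that consumption path; it assumes only that the generated header exposes such a string, and the helper name compile_embedded plus the omitted context/device setup are hypothetical, not x264's actual code:

#include <CL/cl.h>

extern const char *x264_opencl_source;  /* assumed shape of what cltostr.pl emits */

/* Build the embedded kernel source for one device; returns NULL on failure.
 * The real encoder additionally consults a cached binary (see common/opencl.c below). */
static cl_program compile_embedded( cl_context ctx, cl_device_id dev )
{
    cl_int err;
    cl_program prog = clCreateProgramWithSource( ctx, 1, &x264_opencl_source, NULL, &err );
    if( err != CL_SUCCESS )
        return NULL;
    if( clBuildProgram( prog, 1, &dev, NULL, NULL, NULL ) != CL_SUCCESS )
    {
        clReleaseProgram( prog );  /* real code would also print the build log */
        return NULL;
    }
    return prog;
}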
x264-snapshot-20130224-2245.tar.bz2/common/arm/mc-a.S -> x264-snapshot-20130723-2245.tar.bz2/common/arm/mc-a.S
Changed
@@ -5,6 +5,7 @@
  *
  * Authors: David Conrad <lessen42@gmail.com>
  *          Mans Rullgard <mans@mansr.com>
+ *          Stefan Groenroos <stefan.gronroos@gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -813,54 +814,57 @@
 // void x264_mc_chroma_neon( uint8_t *dst, intptr_t i_dst_stride,
 //                           uint8_t *src, intptr_t i_src_stride,
 //                           int dx, int dy, int i_width, int i_height );
+
 function x264_mc_chroma_neon
-    push        {r4-r6, lr}
-    ldrd        r4, [sp, #16]
-    ldr         r6, [sp, #24]
+    push        {r4-r8, lr}
+    vpush       {d8-d11}
+    ldrd        r4, [sp, #56]
+    ldrd        r6, [sp, #64]
 
-    asr         lr, r5, #3
-    mul         lr, r3, lr
-    add         r2, r2, r4, asr #3
-    cmp         r6, #4
-    add         r2, r2, lr
+    asr         lr, r6, #3
+    mul         lr, r4, lr
+    add         r3, r3, r5, asr #2
+    cmp         r7, #4
 
-    and         r4, r4, #7
     and         r5, r5, #7
+    and         r6, r6, #7
+
+    add         r3, r3, lr
+    bic         r3, r3, #0x1
 
-    pld         [r2]
-    pld         [r2, r3]
+    pld         [r3]
+    pld         [r3, r4]
 
     bgt         mc_chroma_w8
     beq         mc_chroma_w4
 
-// calculate cA cB cC cD
-.macro CHROMA_MC_START r0 r1
-    muls        lr, r4, r5
-    rsb         r6, lr, r5, lsl #3
-    rsb         ip, lr, r4, lsl #3
-    sub         r4, lr, r4, lsl #3
-    sub         r4, r4, r5, lsl #3
-    add         r4, r4, #64
+.macro CHROMA_MC_START r00, r01, r10, r11
+    muls        lr, r5, r6
+    rsb         r7, lr, r6, lsl #3
+    rsb         ip, lr, r5, lsl #3
+    sub         r5, lr, r5, lsl #3
+    sub         r5, r5, r6, lsl #3
+    add         r5, r5, #64
 
     beq         2f
 
+    vld2.8      {\r00-\r01}, [r3], r4
-    add         r5, r2, r3
+    vdup.8      d0, r5
+    vdup.8      d1, ip
-    vdup.8      d0, r4
-    lsl         r3, r3, #1
-    vdup.8      d1, ip
-    vld1.64     {\r0}, [r2], r3
-    vdup.8      d2, r6
-    vld1.64     {\r1}, [r5], r3
-    vdup.8      d3, lr
-    ldr         r4, [sp, #28]
-
-    vext.8      d5, d4, d5, #1
-    vext.8      d7, d6, d7, #1
+    vdup.8      d2, r7
+    vld2.8      {\r10-\r11}, [r3], r4
+    vdup.8      d3, lr
+    ldr         r5, [sp, #72]
 .endm
 
 .macro CHROMA_MC width, align
 mc_chroma_w\width:
-    CHROMA_MC_START d4, d6
+    CHROMA_MC_START d4, d5, d8, d9
+    vext.8      d6, d4, d6, #1
+    vext.8      d7, d5, d7, #1
+    vext.8      d10, d8, d10, #1
+    vext.8      d11, d9, d11, #1
 
 // since the element size varies, there's a different index for the 2nd store
 .if \width == 4
 .set st2, 1
@@ -868,187 +872,292 @@
 .set st2, 2
 .endif
 
-    vtrn.32     d4, d5
-    vtrn.32     d6, d7
+    vtrn.32     d4, d6
+    vtrn.32     d5, d7
+    vtrn.32     d8, d10
+    vtrn.32     d9, d11
 
-    vtrn.32     d0, d1
-    vtrn.32     d2, d3
+    vtrn.32     d0, d1
+    vtrn.32     d2, d3
 
 1: // height loop, interpolate xy
-    pld         [r5]
+
     vmull.u8    q8, d4, d0
-    vmlal.u8    q8, d6, d2
-    vld1.64     {d4}, [r2], r3
-    vext.8      d5, d4, d5, #1
-    vtrn.32     d4, d5
-    vmull.u8    q9, d6, d0
-    vmlal.u8    q9, d4, d2
-    vld1.64     {d6}, [r5], r3
+    vmlal.u8    q8, d8, d2
+    vmull.u8    q9, d5, d0
+    vmlal.u8    q9, d9, d2
+
+    vld2.8      {d4-d5}, [r3], r4
+
+    vext.8      d6, d4, d6, #1
+    vext.8      d7, d5, d7, #1
+
     vadd.i16    d16, d16, d17
     vadd.i16    d17, d18, d19
+
+    vtrn.32     d4, d6
+    vtrn.32     d5, d7
+
+    vmull.u8    q10, d8, d0
+    vmlal.u8    q10, d4, d2
+    vmull.u8    q11, d9, d0
+    vmlal.u8    q11, d5, d2
+
+    vld2.8      {d8-d9}, [r3], r4
+
     vrshrn.u16  d16, q8, #6
-    subs        r4, r4, #2
-    pld         [r2]
-    vext.8      d7, d6, d7, #1
-    vtrn.32     d6, d7
-    vst1.\align {d16[0]},   [r0,:\align], r1
-    vst1.\align {d16[st2]}, [r0,:\align], r1
+
+    vext.8      d10, d8, d10, #1
+    vext.8      d11, d9, d11, #1
+
+    vadd.i16    d18, d20, d21
+    vadd.i16    d19, d22, d23
+
+    vtrn.32     d8, d10
+    vtrn.32     d9, d11
+
+    vrshrn.u16  d18, q9, #6
+
+    subs        r5, r5, #2
+
+    pld         [r3]
+    pld         [r3, r4]
+
+    vst1.\align {d16[0]},   [r0,:\align], r2
+    vst1.\align {d16[st2]}, [r1,:\align], r2
+    vst1.\align {d18[0]},   [r0,:\align], r2
+    vst1.\align {d18[st2]}, [r1,:\align], r2
     bgt         1b
 
-    pop         {r4-r6, pc}
+    vpop        {d8-d11}
+    pop         {r4-r8, pc}
 
 2: // dx or dy are 0
-    tst         r6, r6
-    add         ip, ip, r6
-    vdup.8      d0, r4
+    vdup.8      d0, r5
+    ldr         r5, [sp, #72]
     vdup.8      d1, ip
-    vtrn.32     d0, d1
-    ldr         r4, [sp, #28]
 
     beq         4f
 
-    vext.32     d1, d0, d1, #1
-    add         r5, r2, r3
-    lsl         r3, r3, #1
-    vld1.32     {d4[0]}, [r2], r3
-    vld1.32     {d4[1]}, [r5], r3
x264-snapshot-20130224-2245.tar.bz2/common/arm/mc-c.c -> x264-snapshot-20130723-2245.tar.bz2/common/arm/mc-c.c
Changed
@@ -238,7 +238,7 @@
         pf->offsetsub = x264_mc_offsetsub_wtab_neon;
         pf->weight_cache = x264_weight_cache_neon;
 
-//      pf->mc_chroma = x264_mc_chroma_neon;
+        pf->mc_chroma = x264_mc_chroma_neon;
 
         pf->mc_luma = mc_luma_neon;
         pf->get_ref = get_ref_neon;
         pf->hpel_filter = hpel_filter_neon;
x264-snapshot-20130224-2245.tar.bz2/common/arm/quant-a.S -> x264-snapshot-20130723-2245.tar.bz2/common/arm/quant-a.S
Changed
@@ -35,7 +35,7 @@
 
 .text
 
-.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no
+.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
    vadd.u16    q8, q8, \bias0
    vadd.u16    q9, q9, \bias1
 .ifc \load_mf, yes
@@ -55,7 +55,7 @@
    veor        q9, q9, q15
    vsub.s16    q8, q8, q14
    vsub.s16    q9, q9, q15
-   vorr        \bias0, q8, q9
+   vorr        \mask, q8, q9
    vst1.64     {d16-d19}, [r0,:128]!
 .endm
 
@@ -89,7 +89,7 @@
    vabs.s16    q9, q15
    vdup.16     q0, r2
    vdup.16     q2, r1
-   QUANT_TWO   q0, q0, d4, d5, d4, d5
+   QUANT_TWO   q0, q0, d4, d5, d4, d5, q0
    vorr        d0, d0, d1
    QUANT_END   d0
 .endfunc
@@ -101,11 +101,52 @@
    vabs.s16    q9, q15
    vld1.64     {d0-d3}, [r2,:128]
    vld1.64     {d4-d7}, [r1,:128]
-   QUANT_TWO   q0, q1, d4, d5, d6, d7
+   QUANT_TWO   q0, q1, d4, d5, d6, d7, q0
    vorr        d0, d0, d1
    QUANT_END   d0
 .endfunc
 
+// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4x4_neon
+   vpush       {d8-d15}
+   vld1.64     {d28-d31}, [r0,:128]
+   vabs.s16    q8, q14
+   vabs.s16    q9, q15
+   vld1.64     {d0-d3}, [r2,:128]
+   vld1.64     {d4-d7}, [r1,:128]
+   QUANT_TWO   q0, q1, d4, d5, d6, d7, q4
+   vld1.64     {d28-d31}, [r0,:128]
+   vabs.s16    q8, q14
+   vabs.s16    q9, q15
+   QUANT_TWO   q0, q1, d4, d5, d6, d7, q5
+   vld1.64     {d28-d31}, [r0,:128]
+   vabs.s16    q8, q14
+   vabs.s16    q9, q15
+   QUANT_TWO   q0, q1, d4, d5, d6, d7, q6
+   vld1.64     {d28-d31}, [r0,:128]
+   vabs.s16    q8, q14
+   vabs.s16    q9, q15
+   QUANT_TWO   q0, q1, d4, d5, d6, d7, q7
+   vorr        d8, d8, d9
+   vorr        d10, d10, d11
+   vorr        d12, d12, d13
+   vorr        d14, d14, d15
+   vmov        r0, r1, d8
+   vmov        r2, r3, d10
+   orrs        r0, r1
+   movne       r0, #1
+   orrs        r2, r3
+   orrne       r0, #2
+   vmov        r1, r2, d12
+   vmov        r3, ip, d14
+   orrs        r1, r2
+   orrne       r0, #4
+   orrs        r3, ip
+   orrne       r0, #8
+   vpop        {d8-d15}
+   bx          lr
+.endfunc
+
 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
 function x264_quant_8x8_neon
    vld1.64     {d28-d31}, [r0,:128]
@@ -113,13 +154,13 @@
    vabs.s16    q9, q15
    vld1.64     {d0-d3}, [r2,:128]!
    vld1.64     {d4-d7}, [r1,:128]!
-   QUANT_TWO   q0, q1, d4, d5, d6, d7
+   QUANT_TWO   q0, q1, d4, d5, d6, d7, q0
 .rept 3
    vld1.64     {d28-d31}, [r0,:128]
    vabs.s16    q8, q14
    vabs.s16    q9, q15
    vld1.64     {d2-d5}, [r2,:128]!
-   QUANT_TWO   q1, q2, d4, d5, d6, d7, yes
+   QUANT_TWO   q1, q2, d4, d5, d6, d7, q1, yes
    vorr        q0, q0, q1
 .endr
    vorr        d0, d0, d1
x264-snapshot-20130224-2245.tar.bz2/common/arm/quant.h -> x264-snapshot-20130723-2245.tar.bz2/common/arm/quant.h
Changed
@@ -31,6 +31,7 @@
 int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
 int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
 int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
 int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
 
 void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
x264-snapshot-20130224-2245.tar.bz2/common/bitstream.c -> x264-snapshot-20130723-2245.tar.bz2/common/bitstream.c
Changed
@@ -39,11 +39,20 @@
     return dst;
 }
 
-#if HAVE_MMX
 uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
 uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
-uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end );
-#endif
+uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end );
+void x264_cabac_block_residual_rd_internal_sse2       ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_rd_internal_ssse3      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_8x8_rd_internal_sse2       ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_8x8_rd_internal_ssse3      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_sse2      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_sse2_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 
 /****************************************************************************
  * x264_nal_encode:
@@ -88,13 +97,49 @@
 
 void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
 {
+    memset( pf, 0, sizeof(*pf) );
+
     pf->nal_escape = x264_nal_escape_c;
 #if HAVE_MMX
+#if ARCH_X86_64
+    pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2;
+    pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2;
+    pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2;
+#endif
+
     if( cpu&X264_CPU_MMX2 )
         pf->nal_escape = x264_nal_escape_mmx2;
-    if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
-        pf->nal_escape = x264_nal_escape_sse2;
-    if( cpu&X264_CPU_AVX )
-        pf->nal_escape = x264_nal_escape_avx;
+    if( cpu&X264_CPU_SSE2 )
+    {
+#if ARCH_X86_64
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2_lzcnt;
+            pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2_lzcnt;
+            pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt;
+        }
+#endif
+        if( cpu&X264_CPU_SSE2_IS_FAST )
+            pf->nal_escape = x264_nal_escape_sse2;
+    }
+#if ARCH_X86_64
+    if( cpu&X264_CPU_SSSE3 )
+    {
+        pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3;
+        pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3;
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3_lzcnt;
+            pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt;
+        }
+    }
+
+    if( cpu&X264_CPU_AVX2 )
+    {
+        pf->nal_escape = x264_nal_escape_avx2;
+        if( cpu&X264_CPU_BMI2 )
+            pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2;
+    }
+#endif
 #endif
 }
x264-snapshot-20130224-2245.tar.bz2/common/bitstream.h -> x264-snapshot-20130723-2245.tar.bz2/common/bitstream.h
Changed
@@ -55,9 +55,9 @@
 
 typedef struct
 {
-    int     last;
-    int     mask;
-    dctcoef level[16];
+    int32_t last;
+    int32_t mask;
+    ALIGNED_16( dctcoef level[18] );
 } x264_run_level_t;
 
 extern const vlc_t x264_coeff0_token[6];
@@ -69,6 +69,12 @@
 typedef struct
 {
     uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
+    void (*cabac_block_residual_internal)( dctcoef *l, int b_interlaced,
+                                           intptr_t ctx_block_cat, x264_cabac_t *cb );
+    void (*cabac_block_residual_rd_internal)( dctcoef *l, int b_interlaced,
+                                              intptr_t ctx_block_cat, x264_cabac_t *cb );
+    void (*cabac_block_residual_8x8_rd_internal)( dctcoef *l, int b_interlaced,
+                                                  intptr_t ctx_block_cat, x264_cabac_t *cb );
 } x264_bitstream_function_t;
 
 void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
x264-snapshot-20130224-2245.tar.bz2/common/common.c -> x264-snapshot-20130723-2245.tar.bz2/common/common.c
Changed
@@ -171,6 +171,10 @@
     param->b_pic_struct = 0;
     param->b_fake_interlaced = 0;
     param->i_frame_packing = -1;
+    param->b_opencl = 0;
+    param->i_opencl_device = 0;
+    param->opencl_device_id = NULL;
+    param->psz_clbin_file = NULL;
 }
 
 static int x264_param_apply_preset( x264_param_t *param, const char *preset )
@@ -563,6 +567,8 @@
 }
 
 #define atobool(str) ( name_was_bool = 1, x264_atobool( str, &b_error ) )
+#undef atoi
+#undef atof
 #define atoi(str) x264_atoi( str, &b_error )
 #define atof(str) x264_atof( str, &b_error )
 
@@ -620,10 +626,8 @@
                 b_error = 1;
             }
             free( buf );
-            if( p->cpu & X264_CPU_SSSE3 )
+            if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) )
                 p->cpu |= X264_CPU_SSE2_IS_FAST;
-            if( p->cpu & X264_CPU_SSE4 )
-                p->cpu |= X264_CPU_SHUFFLE_IS_FAST;
         }
     }
     OPT("threads")
@@ -778,8 +782,12 @@
         p->i_slice_max_size = atoi(value);
     OPT("slice-max-mbs")
         p->i_slice_max_mbs = atoi(value);
+    OPT("slice-min-mbs")
+        p->i_slice_min_mbs = atoi(value);
     OPT("slices")
         p->i_slice_count = atoi(value);
+    OPT("slices-max")
+        p->i_slice_count_max = atoi(value);
     OPT("cabac")
         p->b_cabac = atobool(value);
     OPT("cabac-idc")
@@ -1029,6 +1037,14 @@
         p->b_fake_interlaced = atobool(value);
     OPT("frame-packing")
         p->i_frame_packing = atoi(value);
+    OPT("stitchable")
+        p->b_stitchable = atobool(value);
+    OPT("opencl")
+        p->b_opencl = atobool( value );
+    OPT("opencl-clbin")
+        p->psz_clbin_file = strdup( value );
+    OPT("opencl-device")
+        p->i_opencl_device = atoi( value );
     else
         return X264_PARAM_BAD_NAME;
 #undef OPT
@@ -1166,17 +1182,14 @@
 void *x264_malloc( int i_size )
 {
     uint8_t *align_buf = NULL;
-#if SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64)
-    /* Mac OS X and Win x64 always returns 16 byte aligned memory */
-    align_buf = malloc( i_size );
-#elif HAVE_MALLOC_H
-    align_buf = memalign( 16, i_size );
+#if HAVE_MALLOC_H
+    align_buf = memalign( NATIVE_ALIGN, i_size );
 #else
-    uint8_t *buf = malloc( i_size + 15 + sizeof(void **) );
+    uint8_t *buf = malloc( i_size + (NATIVE_ALIGN-1) + sizeof(void **) );
     if( buf )
     {
-        align_buf = buf + 15 + sizeof(void **);
-        align_buf -= (intptr_t) align_buf & 15;
+        align_buf = buf + (NATIVE_ALIGN-1) + sizeof(void **);
+        align_buf -= (intptr_t) align_buf & (NATIVE_ALIGN-1);
         *( (void **) ( align_buf - sizeof(void **) ) ) = buf;
     }
 #endif
@@ -1192,7 +1205,7 @@
 {
     if( p )
     {
-#if HAVE_MALLOC_H || SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64)
+#if HAVE_MALLOC_H
         free( p );
 #else
         free( *( ( ( void **) p ) - 1 ) );
@@ -1281,6 +1294,8 @@
         s += sprintf( s, "bitdepth=%d ", BIT_DEPTH );
     }
 
+    if( p->b_opencl )
+        s += sprintf( s, "opencl=%d ", p->b_opencl );
     s += sprintf( s, "cabac=%d", p->b_cabac );
     s += sprintf( s, " ref=%d", p->i_frame_reference );
     s += sprintf( s, " deblock=%d:%d:%d", p->b_deblocking_filter,
@@ -1305,14 +1320,20 @@
         s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
     if( p->i_slice_count )
         s += sprintf( s, " slices=%d", p->i_slice_count );
+    if( p->i_slice_count_max )
+        s += sprintf( s, " slices_max=%d", p->i_slice_count_max );
     if( p->i_slice_max_size )
         s += sprintf( s, " slice_max_size=%d", p->i_slice_max_size );
     if( p->i_slice_max_mbs )
         s += sprintf( s, " slice_max_mbs=%d", p->i_slice_max_mbs );
+    if( p->i_slice_min_mbs )
+        s += sprintf( s, " slice_min_mbs=%d", p->i_slice_min_mbs );
     s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction );
     s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate );
     s += sprintf( s, " interlaced=%s", p->b_interlaced ? p->b_tff ? "tff" : "bff" : p->b_fake_interlaced ? "fake" : "0" );
     s += sprintf( s, " bluray_compat=%d", p->b_bluray_compat );
+    if( p->b_stitchable )
+        s += sprintf( s, " stitchable=%d", p->b_stitchable );
 
     s += sprintf( s, " constrained_intra=%d", p->b_constrained_intra );
x264-snapshot-20130224-2245.tar.bz2/common/common.h -> x264-snapshot-20130723-2245.tar.bz2/common/common.h
Changed
@@ -40,6 +40,7 @@
 #define IS_DISPOSABLE(type) ( type == X264_TYPE_B )
 #define FIX8(f) ((int)(f*(1<<8)+.5))
 #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
+#define ARRAY_ELEMS(a) ((sizeof(a))/(sizeof(a[0])))
 
 #define CHECKED_MALLOC( var, size )\
 do {\
@@ -53,6 +54,8 @@
     memset( var, 0, size );\
 } while( 0 )
 
+#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
+
 #define X264_BFRAME_MAX 16
 #define X264_REF_MAX 16
 #define X264_THREAD_MAX 128
@@ -202,6 +205,10 @@
 };
 
 #include "x264.h"
+#if HAVE_OPENCL
+#include "opencl.h"
+#endif
+#include "cabac.h"
 #include "bitstream.h"
 #include "set.h"
 #include "predict.h"
@@ -209,7 +216,6 @@
 #include "mc.h"
 #include "frame.h"
 #include "dct.h"
-#include "cabac.h"
 #include "quant.h"
 #include "cpu.h"
 #include "threadpool.h"
@@ -291,17 +297,6 @@
     return amvd0 + (amvd1<<8);
 }
 
-static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
-{
-    for( int i = 0; i < i_mvc; i++ )
-    {
-        int mx = (mvc[i][0] + 2) >> 2;
-        int my = (mvc[i][1] + 2) >> 2;
-        dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
-        dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
-    }
-}
-
 extern const uint8_t x264_exp2_lut[64];
 extern const float x264_log2_lut[128];
 extern const float x264_log2_lz_lut[32];
@@ -614,11 +609,11 @@
     /* Current MB DCT coeffs */
     struct
     {
-        ALIGNED_16( dctcoef luma16x16_dc[3][16] );
+        ALIGNED_N( dctcoef luma16x16_dc[3][16] );
         ALIGNED_16( dctcoef chroma_dc[2][8] );
         // FIXME share memory?
-        ALIGNED_16( dctcoef luma8x8[12][64] );
-        ALIGNED_16( dctcoef luma4x4[16*3][16] );
+        ALIGNED_N( dctcoef luma8x8[12][64] );
+        ALIGNED_N( dctcoef luma4x4[16*3][16] );
     } dct;
 
     /* MB table and cache for current frame/mb */
@@ -671,8 +666,7 @@
         int     mv_miny_spel_row[3];
         int     mv_maxy_spel_row[3];
         /* Fullpel MV range for motion search */
-        int     mv_min_fpel[2];
-        int     mv_max_fpel[2];
+        ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */
         int     mv_miny_fpel_row[3];
         int     mv_maxy_fpel_row[3];
@@ -758,7 +752,7 @@
 #define FENC_STRIDE 16
 #define FDEC_STRIDE 32
         ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] );
-        ALIGNED_16( pixel fdec_buf[52*FDEC_STRIDE] );
+        ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
 
         /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
         ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
@@ -775,8 +769,8 @@
         ALIGNED_16( dctcoef fenc_dct4[16][16] );
 
         /* Psy RD SATD/SA8D scores cache */
-        ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
-        ALIGNED_16( uint32_t fenc_satd_cache[32] );
+        ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
+        ALIGNED_N( uint32_t fenc_satd_cache[32] );
 
         /* pointer over mb of the frame to be compressed */
         pixel *p_fenc[3]; /* y,u,v */
@@ -910,8 +904,8 @@
     uint32_t (*nr_residual_sum)[64];
     uint32_t *nr_count;
 
-    ALIGNED_16( udctcoef nr_offset_denoise[4][64] );
-    ALIGNED_16( uint32_t nr_residual_sum_buf[2][4][64] );
+    ALIGNED_N( udctcoef nr_offset_denoise[4][64] );
+    ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] );
     uint32_t nr_count_buf[2][4];
 
     uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
@@ -947,11 +941,48 @@
     struct visualize_t *visualize;
 #endif
     x264_lookahead_t *lookahead;
+
+#if HAVE_OPENCL
+    x264_opencl_t opencl;
+#endif
 };
 
 // included at the end because it needs x264_t
 #include "macroblock.h"
 
+static int ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
+{
+    int cnt = 0;
+    for( int i = 0; i < i_mvc; i++ )
+    {
+        int mx = (mvc[i][0] + 2) >> 2;
+        int my = (mvc[i][1] + 2) >> 2;
+        uint32_t mv = pack16to32_mask(mx, my);
+        if( !mv || mv == pmv ) continue;
+        dst[cnt][0] = x264_clip3( mx, mv_limit[0][0], mv_limit[1][0] );
+        dst[cnt][1] = x264_clip3( my, mv_limit[0][1], mv_limit[1][1] );
+        cnt++;
+    }
+    return cnt;
+}
+
+static int ALWAYS_INLINE x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
+{
+    int cnt = 0;
+    int qpel_limit[4] = {mv_limit[0][0] << 2, mv_limit[0][1] << 2, mv_limit[1][0] << 2, mv_limit[1][1] << 2};
+    for( int i = 0; i < i_mvc; i++ )
+    {
+        uint32_t mv = M32( mvc[i] );
+        int mx = mvc[i][0];
+        int my = mvc[i][1];
+        if( !mv || mv == pmv ) continue;
+        dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] );
+        dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] );
+        cnt++;
+    }
+    return cnt;
+}
+
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/util.h"
 #endif
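Aside on the ALIGNED_N buffers above: they rely on NATIVE_ALIGN-byte storage, and the x264_malloc() change in the common.c diff provides the matching heap guarantee when memalign() is unavailable. Here is a standalone sketch of that fallback technique (over-allocate, round the pointer up to the boundary, and stash the pointer malloc() returned just below the aligned block); ALIGN stands in for x264's NATIVE_ALIGN:

#include <stdint.h>
#include <stdlib.h>

#define ALIGN 32  /* stand-in for NATIVE_ALIGN; 32 bytes once AVX2 is in play */

static void *aligned_malloc( size_t size )
{
    uint8_t *buf = malloc( size + (ALIGN-1) + sizeof(void **) );
    if( !buf )
        return NULL;
    uint8_t *p = buf + (ALIGN-1) + sizeof(void **);
    p -= (uintptr_t)p & (ALIGN-1);   /* round down to an ALIGN-byte boundary */
    ((void **)p)[-1] = buf;          /* remember what malloc() actually returned */
    return p;
}

static void aligned_free( void *p )
{
    if( p )
        free( ((void **)p)[-1] );
}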
x264-snapshot-20130224-2245.tar.bz2/common/cpu.c -> x264-snapshot-20130723-2245.tar.bz2/common/cpu.c
Changed
@@ -47,18 +47,19 @@
 
 const x264_cpu_name_t x264_cpu_names[] =
 {
-    {"Altivec", X264_CPU_ALTIVEC},
-//  {"MMX",     X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
-    {"MMX2",    X264_CPU_MMX|X264_CPU_MMX2},
-    {"MMXEXT",  X264_CPU_MMX|X264_CPU_MMX2},
-//  {"SSE",     X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE}, // there are no sse1 functions in x264
-#define SSE2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE|X264_CPU_SSE2
+#if HAVE_MMX
+//  {"MMX",     X264_CPU_MMX},  // we don't support asm on mmx1 cpus anymore
+//  {"CMOV",    X264_CPU_CMOV}, // we require this unconditionally, so don't print it
+#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
+    {"MMX2",        MMX2},
+    {"MMXEXT",      MMX2},
+    {"SSE",         MMX2|X264_CPU_SSE},
+#define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2
     {"SSE2Slow",    SSE2|X264_CPU_SSE2_IS_SLOW},
     {"SSE2",        SSE2},
     {"SSE2Fast",    SSE2|X264_CPU_SSE2_IS_FAST},
     {"SSE3",        SSE2|X264_CPU_SSE3},
     {"SSSE3",       SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
-    {"FastShuffle", SSE2|X264_CPU_SHUFFLE_IS_FAST},
     {"SSE4.1",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
     {"SSE4",        SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
     {"SSE4.2",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
@@ -70,19 +71,26 @@
     {"FMA3",        AVX|X264_CPU_FMA3},
 #undef AVX
 #undef SSE2
+#undef MMX2
     {"Cache32",         X264_CPU_CACHELINE_32},
     {"Cache64",         X264_CPU_CACHELINE_64},
     {"SSEMisalign",     X264_CPU_SSE_MISALIGN},
     {"LZCNT",           X264_CPU_LZCNT},
     {"BMI1",            X264_CPU_BMI1},
     {"BMI2",            X264_CPU_BMI1|X264_CPU_BMI2},
-    {"TBM",             X264_CPU_TBM},
-    {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
-    {"ARMv6",           X264_CPU_ARMV6},
-    {"NEON",            X264_CPU_NEON},
-    {"Fast_NEON_MRC",   X264_CPU_FAST_NEON_MRC},
     {"SlowCTZ",         X264_CPU_SLOW_CTZ},
     {"SlowAtom",        X264_CPU_SLOW_ATOM},
+    {"SlowPshufb",      X264_CPU_SLOW_PSHUFB},
+    {"SlowPalignr",     X264_CPU_SLOW_PALIGNR},
+    {"SlowShuffle",     X264_CPU_SLOW_SHUFFLE},
+    {"UnalignedStack",  X264_CPU_STACK_MOD4},
+#elif ARCH_PPC
+    {"Altivec",         X264_CPU_ALTIVEC},
+#elif ARCH_ARM
+    {"ARMv6",           X264_CPU_ARMV6},
+    {"NEON",            X264_CPU_NEON},
+    {"FastNeonMRC",     X264_CPU_FAST_NEON_MRC},
+#endif
     {"", 0},
 };
 
@@ -131,9 +139,13 @@
     if( edx&0x00800000 )
         cpu |= X264_CPU_MMX;
     else
-        return 0;
+        return cpu;
     if( edx&0x02000000 )
         cpu |= X264_CPU_MMX2|X264_CPU_SSE;
+    if( edx&0x00008000 )
+        cpu |= X264_CPU_CMOV;
+    else
+        return cpu;
     if( edx&0x04000000 )
         cpu |= X264_CPU_SSE2;
     if( ecx&0x00000001 )
@@ -170,46 +182,56 @@
 
     if( cpu & X264_CPU_SSSE3 )
         cpu |= X264_CPU_SSE2_IS_FAST;
-    if( cpu & X264_CPU_SSE4 )
-        cpu |= X264_CPU_SHUFFLE_IS_FAST;
 
     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
     max_extended_cap = eax;
 
-    if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
+    if( max_extended_cap >= 0x80000001 )
     {
-        cpu |= X264_CPU_SLOW_CTZ;
         x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
-        if( edx&0x00400000 )
-            cpu |= X264_CPU_MMX2;
-        if( cpu & X264_CPU_SSE2 )
+
+        if( ecx&0x00000020 )
+            cpu |= X264_CPU_LZCNT;             /* Supported by Intel chips starting with Haswell */
+        if( ecx&0x00000040 )                   /* SSE4a, AMD only */
         {
-            if( ecx&0x00000040 ) /* SSE4a */
+            int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+            cpu |= X264_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
+            if( family == 0x14 )
             {
-                cpu |= X264_CPU_SSE2_IS_FAST;
-                cpu |= X264_CPU_LZCNT;
-                cpu |= X264_CPU_SHUFFLE_IS_FAST;
-                cpu &= ~X264_CPU_SLOW_CTZ;
+                cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
+                cpu |= X264_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
+                cpu |= X264_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
             }
-            else
-                cpu |= X264_CPU_SSE2_IS_SLOW;
-
-            if( ecx&0x00000080 ) /* Misalign SSE */
+            if( family == 0x16 )
             {
-                cpu |= X264_CPU_SSE_MISALIGN;
-                x264_cpu_mask_misalign_sse();
+                cpu |= X264_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
+                                                * compared to alternate instruction sequences that this
+                                                * is equal or faster on almost all such functions. */
             }
+        }
 
-            if( cpu & X264_CPU_AVX )
-            {
-                if( ecx&0x00000800 ) /* XOP */
-                    cpu |= X264_CPU_XOP;
-                if( ecx&0x00010000 ) /* FMA4 */
-                    cpu |= X264_CPU_FMA4;
-            }
+        if( ecx&0x00000080 ) /* Misalign SSE */
+        {
+            cpu |= X264_CPU_SSE_MISALIGN;
+            x264_cpu_mask_misalign_sse();
+        }
 
-            if( ecx&0x00200000 )
-                cpu |= X264_CPU_TBM;
+        if( cpu & X264_CPU_AVX )
+        {
+            if( ecx&0x00000800 ) /* XOP */
+                cpu |= X264_CPU_XOP;
+            if( ecx&0x00010000 ) /* FMA4 */
+                cpu |= X264_CPU_FMA4;
+        }
+
+        if( !strcmp((char*)vendor, "AuthenticAMD") )
+        {
+            if( edx&0x00400000 )
+                cpu |= X264_CPU_MMX2;
+            if( !(cpu&X264_CPU_LZCNT) )
+                cpu |= X264_CPU_SLOW_CTZ;
+            if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
+                cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
         }
     }
 
@@ -233,11 +255,12 @@
         {
             cpu |= X264_CPU_SLOW_ATOM;
             cpu |= X264_CPU_SLOW_CTZ;
+            cpu |= X264_CPU_SLOW_PSHUFB;
         }
-        /* Some Penryns and Nehalems are pointlessly crippled (SSE4 disabled), so
-         * detect them here. */
-        else if( model >= 23 )
-            cpu |= X264_CPU_SHUFFLE_IS_FAST;
+        /* Conroe has a slow shuffle unit. Check the model number to make sure not
+         * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
+        else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 )
+            cpu |= X264_CPU_SLOW_SHUFFLE;
     }
 }
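For readers tracing the new bit tests above: extended leaf 0x80000001 reports LZCNT/ABM in ECX bit 5 and SSE4a (AMD only) in ECX bit 6, and the family number is the base family plus the extended family field. A small standalone sketch of the same probe, using GCC's <cpuid.h> helper in place of x264's own x264_cpu_cpuid():

#include <cpuid.h>
#include <stdio.h>

int main( void )
{
    unsigned eax, ebx, ecx, edx;
    /* __get_cpuid() verifies the extended leaf exists before reading it */
    if( __get_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ) )
    {
        if( ecx & (1u<<5) )
            puts( "LZCNT supported" );
        if( ecx & (1u<<6) )  /* SSE4a implies an AMD part */
            printf( "SSE4a, family 0x%x\n", ((eax>>8)&0xf) + ((eax>>20)&0xff) );
    }
    return 0;
}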
x264-snapshot-20130224-2245.tar.bz2/common/cpu.h -> x264-snapshot-20130723-2245.tar.bz2/common/cpu.h
Changed
@@ -48,15 +48,17 @@
 void x264_cpu_mask_misalign_sse( void );
 void x264_safe_intel_cpu_indicator_init( void );
 
-/* kluge:
+/* kludge:
  * gcc can't give variables any greater alignment than the stack frame has.
- * We need 16 byte alignment for SSE2, so here we make sure that the stack is
- * aligned to 16 bytes.
+ * We need 32 byte alignment for AVX2, so here we make sure that the stack is
+ * aligned to 32 bytes.
  * gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this
  * problem, but I don't want to require such a new version.
- * This applies only to x86_32, since other architectures that need alignment
- * either have ABIs that ensure aligned stack, or don't support it at all. */
-#if ARCH_X86 && HAVE_MMX
+ * aligning to 32 bytes only works if the compiler supports keeping that
+ * alignment between functions (osdep.h handles manual alignment of arrays
+ * if it doesn't).
+ */
+#if (ARCH_X86 || HAVE_32B_STACK_ALIGNMENT) && HAVE_MMX
 int x264_stack_align( void (*func)(), ... );
 #define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
 #else
x264-snapshot-20130224-2245.tar.bz2/common/dct.c -> x264-snapshot-20130723-2245.tar.bz2/common/dct.c
Changed
@@ -640,23 +640,32 @@
         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 
-        dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
-        dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
-        dctf->add8x8_idct   = x264_add8x8_idct_sse2;
-        dctf->add16x16_idct = x264_add16x16_idct_sse2;
-        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
+        if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
+        {
+            dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
+            dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
+            dctf->add8x8_idct   = x264_add8x8_idct_sse2;
+            dctf->add16x16_idct = x264_add16x16_idct_sse2;
+            dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
+        }
     }
 
-    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
+    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
     {
-        dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
-        dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
-        dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
-        dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
-        dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
         dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
-        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
-        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
+        if( !(cpu&X264_CPU_SLOW_ATOM) )
+        {
+            dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
+            dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
+            dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
+            dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
+            dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
+            if( !(cpu&X264_CPU_SLOW_PSHUFB) )
+            {
+                dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
+                dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
+            }
+        }
     }
 
     if( cpu&X264_CPU_SSE4 )
@@ -681,6 +690,18 @@
         dctf->sub8x8_dct       = x264_sub8x8_dct_xop;
         dctf->sub16x16_dct     = x264_sub16x16_dct_xop;
     }
+
+    if( cpu&X264_CPU_AVX2 )
+    {
+        dctf->add8x8_idct      = x264_add8x8_idct_avx2;
+        dctf->add16x16_idct    = x264_add16x16_idct_avx2;
+        dctf->sub8x8_dct       = x264_sub8x8_dct_avx2;
+        dctf->sub16x16_dct     = x264_sub16x16_dct_avx2;
+        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
+#if ARCH_X86_64
+        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx2;
+#endif
+    }
 #endif //HAVE_MMX
 
 #if HAVE_ALTIVEC
@@ -951,7 +972,7 @@
         pf_interlaced->sub_4x4ac  = x264_zigzag_sub_4x4ac_field_ssse3;
         pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_ssse3;
         pf_progressive->scan_8x8  = x264_zigzag_scan_8x8_frame_ssse3;
-        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+        if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
     }
     if( cpu&X264_CPU_AVX )
@@ -962,8 +983,7 @@
         pf_interlaced->sub_4x4ac  = x264_zigzag_sub_4x4ac_field_avx;
         pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_avx;
 #endif
-        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
-            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
+        pf_progressive->scan_4x4  = x264_zigzag_scan_4x4_frame_avx;
     }
     if( cpu&X264_CPU_XOP )
     {
@@ -1005,7 +1025,7 @@
         pf_interlaced->interleave_8x8_cavlc =
         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
     }
-    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+    if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
     {
         pf_interlaced->interleave_8x8_cavlc =
         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
@@ -1016,6 +1036,12 @@
         pf_interlaced->interleave_8x8_cavlc =
         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
     }
+
+    if( cpu&X264_CPU_AVX2 )
+    {
+        pf_interlaced->interleave_8x8_cavlc =
+        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
+    }
 #endif // HIGH_BIT_DEPTH
 #endif
 }
x264-snapshot-20130224-2245.tar.bz2/common/deblock.c -> x264-snapshot-20130723-2245.tar.bz2/common/deblock.c
Changed
@@ -686,6 +686,9 @@
 void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
+void x264_deblock_strength_avx2( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                 int mvy_limit, int bframe );
 
 void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
@@ -816,6 +819,10 @@
 #endif
             }
         }
+        if( cpu&X264_CPU_AVX2 )
+        {
+            pf->deblock_strength = x264_deblock_strength_avx2;
+        }
     }
 #endif
x264-snapshot-20130224-2245.tar.bz2/common/display-x11.c -> x264-snapshot-20130723-2245.tar.bz2/common/display-x11.c
Changed
@@ -49,7 +49,7 @@
     abort();
 }
 
-static void disp_init_display()
+static void disp_init_display( void )
 {
     Visual *visual;
     int dpy_class;
@@ -130,7 +130,7 @@
     XFree( shint );
 }
 
-void disp_sync()
+void disp_sync( void )
 {
     XSync( disp_display, 1 );
 }
x264-snapshot-20130224-2245.tar.bz2/common/frame.c -> x264-snapshot-20130723-2245.tar.bz2/common/frame.c
Changed
@@ -72,8 +72,18 @@
     int i_mb_count = h->mb.i_mb_count;
     int i_stride, i_width, i_lines, luma_plane_count;
     int i_padv = PADV << PARAM_INTERLACED;
-    int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
-    int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;
+    int align = 16;
+#if ARCH_X86 || ARCH_X86_64
+    if( h->param.cpu&X264_CPU_CACHELINE_64 )
+        align = 64;
+    else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX2 )
+        align = 32;
+#endif
+#if ARCH_PPC
+    int disalign = 1<<9;
+#else
+    int disalign = 1<<10;
+#endif
 
     CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
 
@@ -251,6 +261,10 @@
     if( x264_pthread_cond_init( &frame->cv, NULL ) )
         goto fail;
 
+#if HAVE_OPENCL
+    frame->opencl.ocl = h->opencl.ocl;
+#endif
+
     return frame;
 
 fail:
@@ -312,6 +326,9 @@
         }
         x264_pthread_mutex_destroy( &frame->mutex );
         x264_pthread_cond_destroy( &frame->cv );
+#if HAVE_OPENCL
+        x264_opencl_frame_delete( frame );
+#endif
     }
     x264_free( frame );
 }
@@ -655,6 +672,21 @@
     x264_pthread_mutex_unlock( &h->mutex );
 }
 
+int x264_frame_new_slice( x264_t *h, x264_frame_t *frame )
+{
+    if( h->param.i_slice_count_max )
+    {
+        int slice_count;
+        if( h->param.b_sliced_threads )
+            slice_count = x264_pthread_fetch_and_add( &frame->i_slice_count, 1, &frame->mutex );
+        else
+            slice_count = frame->i_slice_count++;
+        if( slice_count >= h->param.i_slice_count_max )
+            return -1;
+    }
+    return 0;
+}
+
 /* list operators */
 
 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
@@ -717,6 +749,7 @@
     frame->b_scenecut = 1;
     frame->b_keyframe = 0;
     frame->b_corrupt = 0;
+    frame->i_slice_count = h->param.b_sliced_threads ? h->param.i_threads : 1;
 
     memset( frame->weight, 0, sizeof(frame->weight) );
     memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
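The x264_frame_new_slice() addition above caps how many slices may be spawned per frame; with sliced threads the counter must be incremented atomically, which x264 wraps as x264_pthread_fetch_and_add(). A reduced sketch of the same guard, using C11 atomics in place of that wrapper (frame_new_slice here is an illustrative stand-in, not the encoder's function):

#include <stdatomic.h>

static atomic_int slice_count;

/* returns -1 once slice_count_max slices have been claimed (0 = no limit) */
static int frame_new_slice( int slice_count_max )
{
    if( slice_count_max && atomic_fetch_add( &slice_count, 1 ) >= slice_count_max )
        return -1;
    return 0;
}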
x264-snapshot-20130224-2245.tar.bz2/common/frame.h -> x264-snapshot-20130723-2245.tar.bz2/common/frame.h
Changed
@@ -152,6 +152,7 @@
     int     i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
     x264_pthread_mutex_t mutex;
     x264_pthread_cond_t  cv;
+    int     i_slice_count; /* Atomically written to/read from with slice threads */
 
     /* periodic intra refresh */
     float   f_pir_position;
@@ -171,6 +172,10 @@
     /* user frame properties */
     uint8_t *mb_info;
     void (*mb_info_free)( void* );
+
+#if HAVE_OPENCL
+    x264_frame_opencl_t opencl;
+#endif
 } x264_frame_t;
 
 /* synchronized frame list */
@@ -230,6 +235,7 @@
 void          x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed );
 void          x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed );
+int           x264_frame_new_slice( x264_t *h, x264_frame_t *frame );
 
 void          x264_threadslice_cond_broadcast( x264_t *h, int pass );
 void          x264_threadslice_cond_wait( x264_t *h, int pass );
x264-snapshot-20130224-2245.tar.bz2/common/macroblock.c -> x264-snapshot-20130723-2245.tar.bz2/common/macroblock.c
Changed
@@ -122,8 +122,8 @@
         int mvy1   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
         int i_mode = x264_size2pixel[height][width];
         intptr_t i_stride0 = 16, i_stride1 = 16;
-        ALIGNED_ARRAY_16( pixel, tmp0,[16*16] );
-        ALIGNED_ARRAY_16( pixel, tmp1,[16*16] );
+        ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
+        ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
         pixel *src0, *src1;
 
         MC_LUMA_BI( 0 );
@@ -387,7 +387,7 @@
     int scratch_size = 0;
     if( !b_lookahead )
     {
-        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
+        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48+32) * sizeof(int16_t);
         int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
         int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
         int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
x264-snapshot-20130224-2245.tar.bz2/common/mc.c -> x264-snapshot-20130723-2245.tar.bz2/common/mc.c
Changed
@@ -469,7 +469,7 @@
     }
 }
 
-void x264_mc_init( int cpu, x264_mc_functions_t *pf )
+void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
 {
     pf->mc_luma   = mc_luma;
     pf->get_ref   = get_ref;
@@ -534,6 +534,9 @@
 #if HAVE_ARMV6
     x264_mc_init_arm( cpu, pf );
 #endif
+
+    if( cpu_independent )
+        pf->mbtree_propagate_cost = mbtree_propagate_cost;
 }
 
 void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
x264-snapshot-20130224-2245.tar.bz2/common/mc.h -> x264-snapshot-20130723-2245.tar.bz2/common/mc.h
Changed
@@ -123,6 +123,6 @@
                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 } x264_mc_functions_t;
 
-void x264_mc_init( int cpu, x264_mc_functions_t *pf );
+void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
 
 #endif
x264-snapshot-20130723-2245.tar.bz2/common/opencl
Added
+(directory)
x264-snapshot-20130723-2245.tar.bz2/common/opencl.c
Added
@@ -0,0 +1,718 @@
+/*****************************************************************************
+ * opencl.c: OpenCL initialization and kernel compilation
+ *****************************************************************************
+ * Copyright (C) 2012-2013 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *          Anton Mitrofanov <BugMaster@narod.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#define ocl_open LoadLibrary( "OpenCL" )
+#define ocl_close FreeLibrary
+#define ocl_address GetProcAddress
+#else
+#include <dlfcn.h> //dlopen, dlsym, dlclose
+#if SYS_MACOSX
+#define ocl_open dlopen( "/System/Library/Frameworks/OpenCL.framework/OpenCL", RTLD_NOW )
+#else
+#define ocl_open dlopen( "libOpenCL.so", RTLD_NOW )
+#endif
+#define ocl_close dlclose
+#define ocl_address dlsym
+#endif
+
+#define LOAD_OCL_FUNC(name, continue_on_fail)\
+{\
+    ocl->name = (void*)ocl_address( ocl->library, #name );\
+    if( !continue_on_fail && !ocl->name )\
+        goto fail;\
+}
+
+/* load the library and functions we require from it */
+x264_opencl_function_t *x264_opencl_load_library( void )
+{
+    x264_opencl_function_t *ocl;
+#undef fail
+#define fail fail0
+    CHECKED_MALLOCZERO( ocl, sizeof(x264_opencl_function_t) );
+#undef fail
+#define fail fail1
+    ocl->library = ocl_open;
+    if( !ocl->library )
+        goto fail;
+#undef fail
+#define fail fail2
+    LOAD_OCL_FUNC( clBuildProgram, 0 );
+    LOAD_OCL_FUNC( clCreateBuffer, 0 );
+    LOAD_OCL_FUNC( clCreateCommandQueue, 0 );
+    LOAD_OCL_FUNC( clCreateContext, 0 );
+    LOAD_OCL_FUNC( clCreateImage2D, 0 );
+    LOAD_OCL_FUNC( clCreateKernel, 0 );
+    LOAD_OCL_FUNC( clCreateProgramWithBinary, 0 );
+    LOAD_OCL_FUNC( clCreateProgramWithSource, 0 );
+    LOAD_OCL_FUNC( clEnqueueCopyBuffer, 0 );
+    LOAD_OCL_FUNC( clEnqueueMapBuffer, 0 );
+    LOAD_OCL_FUNC( clEnqueueNDRangeKernel, 0 );
+    LOAD_OCL_FUNC( clEnqueueReadBuffer, 0 );
+    LOAD_OCL_FUNC( clEnqueueWriteBuffer, 0 );
+    LOAD_OCL_FUNC( clFinish, 0 );
+    LOAD_OCL_FUNC( clGetCommandQueueInfo, 0 );
+    LOAD_OCL_FUNC( clGetDeviceIDs, 0 );
+    LOAD_OCL_FUNC( clGetDeviceInfo, 0 );
+    LOAD_OCL_FUNC( clGetKernelWorkGroupInfo, 0 );
+    LOAD_OCL_FUNC( clGetPlatformIDs, 0 );
+    LOAD_OCL_FUNC( clGetProgramBuildInfo, 0 );
+    LOAD_OCL_FUNC( clGetProgramInfo, 0 );
+    LOAD_OCL_FUNC( clGetSupportedImageFormats, 0 );
+    LOAD_OCL_FUNC( clReleaseCommandQueue, 0 );
+    LOAD_OCL_FUNC( clReleaseContext, 0 );
+    LOAD_OCL_FUNC( clReleaseKernel, 0 );
+    LOAD_OCL_FUNC( clReleaseMemObject, 0 );
+    LOAD_OCL_FUNC( clReleaseProgram, 0 );
+    LOAD_OCL_FUNC( clSetKernelArg, 0 );
+    return ocl;
+#undef fail
+fail2:
+    ocl_close( ocl->library );
+fail1:
+    x264_free( ocl );
+fail0:
+    return NULL;
+}
+
+void x264_opencl_close_library( x264_opencl_function_t *ocl )
+{
+    if( !ocl )
+        return;
+    ocl_close( ocl->library );
+    x264_free( ocl );
+}
+
+/* define from recent cl_ext.h, copied here in case headers are old */
+#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
+
+/* Requires full include path in case of out-of-tree builds */
+#include "common/oclobj.h"
+
+static int x264_detect_switchable_graphics( void );
+
+/* Try to load the cached compiled program binary, verify the device context is
+ * still valid before reuse */
+static cl_program x264_opencl_cache_load( x264_t *h, char *dev_name, char *dev_vendor, char *driver_version )
+{
+    /* try to load cached program binary */
+    FILE *fp = fopen( h->param.psz_clbin_file, "rb" );
+    if( !fp )
+        return NULL;
+
+    x264_opencl_function_t *ocl = h->opencl.ocl;
+    cl_program program = NULL;
+    uint8_t *binary = NULL;
+
+    fseek( fp, 0, SEEK_END );
+    size_t size = ftell( fp );
+    rewind( fp );
+    CHECKED_MALLOC( binary, size );
+
+    fread( binary, 1, size, fp );
+    const uint8_t *ptr = (const uint8_t*)binary;
+
+#define CHECK_STRING( STR )\
+    do {\
+        size_t len = strlen( STR );\
+        if( size <= len || strncmp( (char*)ptr, STR, len ) )\
+            goto fail;\
+        else {\
+            size -= (len+1); ptr += (len+1);\
+        }\
+    } while( 0 )
+
+    CHECK_STRING( dev_name );
+    CHECK_STRING( dev_vendor );
+    CHECK_STRING( driver_version );
+    CHECK_STRING( x264_opencl_source_hash );
+#undef CHECK_STRING
+
+    cl_int status;
+    program = ocl->clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
+    if( status != CL_SUCCESS )
+        program = NULL;
+
+fail:
+    fclose( fp );
+    x264_free( binary );
+    return program;
+}
+
+/* Save the compiled program binary to a file for later reuse. Device context
+ * is also saved in the cache file so we do not reuse stale binaries */
+static void x264_opencl_cache_save( x264_t *h, cl_program program, char *dev_name, char *dev_vendor, char *driver_version )
+{
+    FILE *fp = fopen( h->param.psz_clbin_file, "wb" );
+    if( !fp )
+    {
+        x264_log( h, X264_LOG_INFO, "OpenCL: unable to open clbin file for write\n" );
+        return;
+    }
+
+    x264_opencl_function_t *ocl = h->opencl.ocl;
+    uint8_t *binary = NULL;
+
+    size_t size = 0;
+    cl_int status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
+    if( status != CL_SUCCESS || !size )
+    {
+        x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary size, no cache file generated\n" );
+        goto fail;
+    }
+
+    CHECKED_MALLOC( binary, size );
+    status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &binary, NULL );
+    if( status != CL_SUCCESS )
+    {
+        x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary, no cache file generated\n" );
+        goto fail;
+    }
+
+    fputs( dev_name, fp );
+    fputc( '\n', fp );
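The loader above is what makes OpenCL an optional dependency: libOpenCL is dlopen()ed at runtime and every entry point is resolved into a function-pointer table, so x264 binaries still run on machines with no OpenCL driver installed. A minimal standalone sketch of that pattern (Linux library name, reduced error handling; the simplified typedef stands in for the full cl_platform_id-based signature):

#include <dlfcn.h>
#include <stdio.h>

typedef int (*clGetPlatformIDs_func)( unsigned num_entries, void *platforms, unsigned *num_platforms );

int main( void )
{
    void *lib = dlopen( "libOpenCL.so", RTLD_NOW );
    if( !lib )
        return 1;  /* no driver installed; the encoder would simply disable OpenCL */
    clGetPlatformIDs_func get_platforms =
        (clGetPlatformIDs_func)dlsym( lib, "clGetPlatformIDs" );
    unsigned n = 0;
    if( get_platforms && !get_platforms( 0, NULL, &n ) )
        printf( "%u OpenCL platform(s)\n", n );
    dlclose( lib );
    return 0;
}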
x264-snapshot-20130723-2245.tar.bz2/common/opencl.h
Added
@@ -0,0 +1,804 @@
+/*****************************************************************************
+ * opencl.h: OpenCL structures and defines
+ *****************************************************************************
+ * Copyright (C) 2012-2013 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *          Anton Mitrofanov <BugMaster@narod.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_OPENCL_H
+#define X264_OPENCL_H
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#include "extras/cl.h"
+
+#define OCL_API(ret, attr, name) typedef ret (attr *name##_func)
+
+/* Platform API */
+OCL_API(cl_int, CL_API_CALL, clGetPlatformIDs)
+( cl_uint          /* num_entries */,
+  cl_platform_id * /* platforms */,
+  cl_uint *        /* num_platforms */);
+
+OCL_API(cl_int, CL_API_CALL, clGetPlatformInfo)
+( cl_platform_id   /* platform */,
+  cl_platform_info /* param_name */,
+  size_t           /* param_value_size */,
+  void *           /* param_value */,
+  size_t *         /* param_value_size_ret */);
+
+/* Device APIs */
+OCL_API(cl_int, CL_API_CALL, clGetDeviceIDs)
+( cl_platform_id /* platform */,
+  cl_device_type /* device_type */,
+  cl_uint        /* num_entries */,
+  cl_device_id * /* devices */,
+  cl_uint *      /* num_devices */);
+
+OCL_API(cl_int, CL_API_CALL, clGetDeviceInfo)
+( cl_device_id   /* device */,
+  cl_device_info /* param_name */,
+  size_t         /* param_value_size */,
+  void *         /* param_value */,
+  size_t *       /* param_value_size_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clCreateSubDevices)
+( cl_device_id                         /* in_device */,
+  const cl_device_partition_property * /* properties */,
+  cl_uint                              /* num_devices */,
+  cl_device_id *                       /* out_devices */,
+  cl_uint *                            /* num_devices_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clRetainDevice)
+( cl_device_id /* device */);
+
+OCL_API(cl_int, CL_API_CALL, clReleaseDevice)
+( cl_device_id /* device */);
+
+/* Context APIs */
+OCL_API(cl_context, CL_API_CALL, clCreateContext)
+( const cl_context_properties * /* properties */,
+  cl_uint                       /* num_devices */,
+  const cl_device_id *          /* devices */,
+  void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+  void *                        /* user_data */,
+  cl_int *                      /* errcode_ret */);
+
+OCL_API(cl_context, CL_API_CALL, clCreateContextFromType)
+( const cl_context_properties * /* properties */,
+  cl_device_type                /* device_type */,
+  void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *),
+  void *                        /* user_data */,
+  cl_int *                      /* errcode_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clRetainContext)
+( cl_context /* context */);
+
+OCL_API(cl_int, CL_API_CALL, clReleaseContext)
+( cl_context /* context */);
+
+OCL_API(cl_int, CL_API_CALL, clGetContextInfo)
+( cl_context      /* context */,
+  cl_context_info /* param_name */,
+  size_t          /* param_value_size */,
+  void *          /* param_value */,
+  size_t *        /* param_value_size_ret */);
+
+/* Command Queue APIs */
+OCL_API(cl_command_queue, CL_API_CALL, clCreateCommandQueue)
+( cl_context                  /* context */,
+  cl_device_id                /* device */,
+  cl_command_queue_properties /* properties */,
+  cl_int *                    /* errcode_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clRetainCommandQueue)
+( cl_command_queue /* command_queue */);
+
+OCL_API(cl_int, CL_API_CALL, clReleaseCommandQueue)
+( cl_command_queue /* command_queue */);
+
+OCL_API(cl_int, CL_API_CALL, clGetCommandQueueInfo)
+( cl_command_queue      /* command_queue */,
+  cl_command_queue_info /* param_name */,
+  size_t                /* param_value_size */,
+  void *                /* param_value */,
+  size_t *              /* param_value_size_ret */);
+
+/* Memory Object APIs */
+OCL_API(cl_mem, CL_API_CALL, clCreateBuffer)
+( cl_context   /* context */,
+  cl_mem_flags /* flags */,
+  size_t       /* size */,
+  void *       /* host_ptr */,
+  cl_int *     /* errcode_ret */);
+
+OCL_API(cl_mem, CL_API_CALL, clCreateSubBuffer)
+( cl_mem                /* buffer */,
+  cl_mem_flags          /* flags */,
+  cl_buffer_create_type /* buffer_create_type */,
+  const void *          /* buffer_create_info */,
+  cl_int *              /* errcode_ret */);
+
+OCL_API(cl_mem, CL_API_CALL, clCreateImage)
+( cl_context              /* context */,
+  cl_mem_flags            /* flags */,
+  const cl_image_format * /* image_format */,
+  const cl_image_desc *   /* image_desc */,
+  void *                  /* host_ptr */,
+  cl_int *                /* errcode_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clRetainMemObject)
+( cl_mem /* memobj */);
+
+OCL_API(cl_int, CL_API_CALL, clReleaseMemObject)
+( cl_mem /* memobj */);
+
+OCL_API(cl_int, CL_API_CALL, clGetSupportedImageFormats)
+( cl_context         /* context */,
+  cl_mem_flags       /* flags */,
+  cl_mem_object_type /* image_type */,
+  cl_uint            /* num_entries */,
+  cl_image_format *  /* image_formats */,
+  cl_uint *          /* num_image_formats */);
+
+OCL_API(cl_int, CL_API_CALL, clGetMemObjectInfo)
+( cl_mem      /* memobj */,
+  cl_mem_info /* param_name */,
+  size_t      /* param_value_size */,
+  void *      /* param_value */,
+  size_t *    /* param_value_size_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clGetImageInfo)
+( cl_mem        /* image */,
+  cl_image_info /* param_name */,
+  size_t        /* param_value_size */,
+  void *        /* param_value */,
+  size_t *      /* param_value_size_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clSetMemObjectDestructorCallback)
+( cl_mem /* memobj */,
+  void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+  void * /*user_data */ );
+
+/* Sampler APIs */
+OCL_API(cl_sampler, CL_API_CALL, clCreateSampler)
+( cl_context         /* context */,
+  cl_bool            /* normalized_coords */,
+  cl_addressing_mode /* addressing_mode */,
+  cl_filter_mode     /* filter_mode */,
+  cl_int *           /* errcode_ret */);
+
+OCL_API(cl_int, CL_API_CALL, clRetainSampler)
+( cl_sampler /* sampler */);
+
+OCL_API(cl_int, CL_API_CALL, clReleaseSampler)
+( cl_sampler /* sampler */);
+
+OCL_API(cl_int, CL_API_CALL, clGetSamplerInfo)
+( cl_sampler      /* sampler */,
+  cl_sampler_info /* param_name */,
+  size_t          /* param_value_size */,
+  void *          /* param_value */,
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/bidir.cl
Added
@@ -0,0 +1,265 @@ +/* Mode selection routines, select the least SATD cost mode for each lowres + * macroblock. When measuring B slices, this includes measuring the cost of + * three bidir modes. */ + +/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */ +int bidir_satd_8x8_ii_coop4( read_only image2d_t fenc_lowres, + int2 fencpos, + read_only image2d_t fref0_planes, + int2 qpos0, + read_only image2d_t fref1_planes, + int2 qpos1, + int weight, + local sum2_t *tmpp, + int idx ) +{ + volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp; + sum2_t b0, b1, b2, b3; + sum2_t sum = 0; + + // fencpos is full-pel position of original MB + // qpos0 is qpel position within reference frame 0 + // qpos1 is qpel position within reference frame 1 + + int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2); + int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2); + + int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1)); + int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2); + int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2); + + int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2); + int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2); + + int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1)); + int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2); + int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2); + + uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B; + uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B; + + uint vA, vB; + uint enc, ref0, ref1; + uint a0, a1; + const int weight2 = 64 - weight; + +#define READ_BIDIR_DIFF( OUT, X )\ + enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\ + vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\ + vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\ + ref0 = rhadd( vA, vB );\ + vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\ + vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\ + ref1 = rhadd( vA, vB );\ + OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6); + +#define READ_DIFF_EX( OUT, a, b )\ + READ_BIDIR_DIFF( a0, a );\ + READ_BIDIR_DIFF( a1, b );\ + OUT = a0 + (a1<<BITS_PER_SUM); + +#define ROW_8x4_SATD( a, b, c )\ + fencpos.y += a;\ + fref0Apos.y += b;\ + fref0Bpos.y += b;\ + fref1Apos.y += c;\ + fref1Bpos.y += c;\ + READ_DIFF_EX( b0, 0, 4 );\ + READ_DIFF_EX( b1, 1, 5 );\ + READ_DIFF_EX( b2, 2, 6 );\ + READ_DIFF_EX( b3, 3, 7 );\ + HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\ + HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\ + sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 ); + + ROW_8x4_SATD( 0, 0, 0 ); + ROW_8x4_SATD( 4, 4, 4 ); + +#undef READ_BIDIR_DIFF +#undef READ_DIFF_EX +#undef ROW_8x4_SATD + + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1; +} + +/* + * mode selection - pick the least cost partition type for each 8x8 macroblock. + * Intra, list0 or list1. When measuring a B slice, also test three bidir + * possibilities. + * + * fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that + * hold many frames worth of motion vectors. 
We must offset into the correct + * location for this frame's vectors: + * + * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1] + * GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count] + * + * global launch dimensions for P slice estimate: [mb_width, mb_height] + * global launch dimensions for B slice estimate: [mb_width * 4, mb_height] + */ +kernel void mode_selection( read_only image2d_t fenc_lowres, + read_only image2d_t fref0_planes, + read_only image2d_t fref1_planes, + const global short2 *fenc_lowres_mvs0, + const global short2 *fenc_lowres_mvs1, + const global short2 *fref1_lowres_mvs0, + const global int16_t *fenc_lowres_mv_costs0, + const global int16_t *fenc_lowres_mv_costs1, + const global uint16_t *fenc_intra_cost, + global uint16_t *lowres_costs, + global int *frame_stats, + local int16_t *cost_local, + local sum2_t *satd_local, + int mb_width, + int bipred_weight, + int dist_scale_factor, + int b, + int p0, + int p1, + int lambda ) +{ + int mb_x = get_global_id( 0 ); + int b_bidir = b < p1; + if( b_bidir ) + { + /* when mode_selection is run for B frames, it must perform BIDIR SATD + * measurements, so it is launched with four times as many threads in + * order to spread the work around more of the GPU. And it can add + * padding threads in the X direction. */ + mb_x >>= 2; + if( mb_x >= mb_width ) + return; + } + int mb_y = get_global_id( 1 ); + int mb_height = get_global_size( 1 ); + int mb_count = mb_width * mb_height; + int mb_xy = mb_x + mb_y * mb_width; + + /* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */ + if( mb_x < 4 && mb_y == 0 ) + frame_stats[mb_x] = 0; + + int bcost = COST_MAX; + int list_used = 0; + + if( !b_bidir ) + { + int icost = fenc_intra_cost[mb_xy]; + COPY2_IF_LT( bcost, icost, list_used, 0 ); + } + if( b != p0 ) + { + int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy]; + COPY2_IF_LT( bcost, mv_cost0, list_used, 1 ); + } + if( b != p1 ) + { + int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy]; + COPY2_IF_LT( bcost, mv_cost1, list_used, 2 ); + } + + if( b_bidir ) + { + int2 coord = (int2)(mb_x, mb_y) << 3; + int mb_i = get_global_id( 0 ) & 3; + int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); + cost_local += mb_in_group * 4; + satd_local += mb_in_group * 16; + +#define TRY_BIDIR( mv0, mv1, penalty )\ +{\ + int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\ + int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\ + cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\ + int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\ + COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\ +} + + /* temporal prediction */ + short2 dmv0, dmv1; + short2 mvr = fref1_lowres_mvs0[mb_xy]; + dmv0 = (mvr * (short) dist_scale_factor + (short) 128) >> (short) 8; + dmv1 = dmv0 - mvr; + TRY_BIDIR( dmv0, dmv1, 0 ) + + if( as_uint( dmv0 ) || as_uint( dmv1 ) ) + { + /* B-direct prediction */ + dmv0 = 0; dmv1 = 0; + TRY_BIDIR( dmv0, dmv1, 0 ); + } + + /* L0+L1 prediction */ + dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy]; + dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy]; + TRY_BIDIR( dmv0, dmv1, 5 ); +#undef TRY_BIDIR + } + + lowres_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
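Note: the core of READ_BIDIR_DIFF above is the weighted bidirectional blend: with bipred weights summing to 64, the two reference pixels are mixed and rounded with a half-unit bias before the 6-bit shift. The same blend for one pixel pair as scalar C, for reference:

#include <stdint.h>

/* weighted bipred blend; weight is the list0 share of 64, identical to
 * ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6) in the kernel */
static inline uint8_t bipred_blend( uint8_t ref0, uint8_t ref1, int weight )
{
    int weight2 = 64 - weight;
    return (uint8_t)( (ref0 * weight + ref1 * weight2 + 32) >> 6 );
}

With weight == 32 both references contribute equally and the expression reduces to a rounded average, the default when weighted prediction decides against asymmetric weights.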
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/downscale.cl
Added
@@ -0,0 +1,135 @@ +/* + * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image + * + * -- + * + * fenc_img is an output image (area of memory referenced through a texture + * cache). A read of any pixel location (x,y) returns four pixel values: + * + * val.s0 = P(x,y) + * val.s1 = P(x+1,y) + * val.s2 = P(x+2,y) + * val.s3 = P(x+3,y) + * + * This is a 4x replication of the lowres pixels, a trade-off between memory + * size and read latency. + * + * -- + * + * hpel_planes is an output image that contains the four HPEL planes used for + * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with + * the four planar values C | V | H | F + * + * launch dimensions: [lowres-width, lowres-height] + */ +kernel void downscale_hpel( const global pixel *fenc, + write_only image2d_t fenc_img, + write_only image2d_t hpel_planes, + int stride ) +{ + int x = get_global_id( 0 ); + int y = get_global_id( 1 ); + uint4 values; + + fenc += y * stride * 2; + const global pixel *src1 = fenc + stride; + const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride; + int2 pos = (int2)(x, y); + pixel right, left; + + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s0 = rhadd( right, left ); // F + + right = rhadd( fenc[2*x+1], src1[2*x+1] ); + left = rhadd( fenc[2*x+2], src1[2*x+2] ); + values.s1 = rhadd( right, left ); // H + + right = rhadd( src1[2*x], src2[2*x] ); + left = rhadd( src1[2*x+1], src2[2*x+1] ); + values.s2 = rhadd( right, left ); // V + + right = rhadd( src1[2*x+1], src2[2*x+1] ); + left = rhadd( src1[2*x+2], src2[2*x+2] ); + values.s3 = rhadd( right, left ); // C + + uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff); + write_imageui( hpel_planes, pos, val ); + + x = select( x, x+1, x+1 < get_global_size( 0 ) ); + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s1 = rhadd( right, left ); + + x = select( x, x+1, x+1 < get_global_size( 0 ) ); + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s2 = rhadd( right, left ); + + x = select( x, x+1, x+1 < get_global_size( 0 ) ); + right = rhadd( fenc[x*2], src1[x*2] ); + left = rhadd( fenc[x*2+1], src1[x*2+1] ); + values.s3 = rhadd( right, left ); + + write_imageui( fenc_img, pos, values ); +} + +/* + * downscale lowres hierarchical motion search image, copy from one image to + * another decimated image. This kernel is called iteratively to generate all + * of the downscales. + * + * launch dimensions: [lower_res width, lower_res height] + */ +kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res ) +{ + int x = get_global_id( 0 ); + int y = get_global_id( 1 ); + int2 pos = (int2)(x, y); + int gs = get_global_size( 0 ); + uint4 top, bot, values; + top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) ); + values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ); + + /* these select statements appear redundant, and they should be, but tests break when + * they are not here. 
I believe this was caused by a driver bug + */ + values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) ); + top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) ); + values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) ); + values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) ); + write_imageui( lower_res, pos, (uint4)(values) ); +} + +/* + * Second copy of downscale kernel, no differences. This is a (no perf loss) + * workaround for a scheduling bug in current Tahiti drivers. This bug has + * theoretically been fixed in the July 2012 driver release from AMD. + */ +kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res ) +{ + int x = get_global_id( 0 ); + int y = get_global_id( 1 ); + int2 pos = (int2)(x, y); + int gs = get_global_size( 0 ); + uint4 top, bot, values; + top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) ); + values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ); + + // see comment in above function copy + values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) ); + top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) ); + bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) ); + values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) ); + values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) ); + write_imageui( lower_res, pos, (uint4)(values) ); +} + +/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */ +kernel void memset_int16( global int16_t *buf, int16_t value ) +{ + buf[get_global_id( 0 )] = value; +}
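Note: everything in downscale_hpel is built from rhadd(), OpenCL's rounding halving add: rhadd(a,b) = (a+b+1)>>1 computed without intermediate overflow. Two nested rhadd()s over a 2x2 block produce one lowres pixel. A scalar C sketch of the same decimation; note the nested form can differ from a true (a+b+c+d+2)>>2 mean by one LSB, exactly as in the kernel:

#include <stdint.h>

static inline uint8_t rhadd8( uint8_t a, uint8_t b )
{
    return (uint8_t)( ((unsigned)a + b + 1) >> 1 );  /* rounding halving add */
}

/* one lowres pixel from a full-res 2x2 block: vertical pairs first,
 * then the horizontal pair, mirroring the kernel's order */
static uint8_t downscale_pixel( const uint8_t *src, int stride, int x, int y )
{
    const uint8_t *row0 = src + 2 * y * stride;
    const uint8_t *row1 = row0 + stride;
    return rhadd8( rhadd8( row0[2*x],   row1[2*x] ),
                   rhadd8( row0[2*x+1], row1[2*x+1] ) );
}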
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/intra.cl
Added
@@ -0,0 +1,1072 @@ +/* Lookahead lowres intra analysis + * + * Each intra analysis function has been implemented twice, once for scalar GPUs + * (NV) and once for vectorized GPUs (AMD pre-Southern Islands). x264 detects + * the GPU type and sets the -DVECTORIZE compile flag accordingly. + * + * All the intra analysis functions were based on their C versions in pixel.c + * and produce the exact same results. + */ + +/* force all clamp arguments and return value to int, prevent ambiguous types */ +#define clamp_int( X, MIN, MAX ) (int) clamp( (int)(X), (int)(MIN), (int)(MAX) ) + +#if VECTORIZE +int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 ) +{ + int8 a_v, d_v; + int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13; + int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33; + + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr0).s04152637; + HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + data += data_stride; + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr1).s04152637; + HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + data += data_stride; + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr2).s04152637; + HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + data += data_stride; + d_v = convert_int8( vload8( 0, data ) ); + a_v.s01234567 = (d_v - pr3).s04152637; + HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); + + uint8 sum_v; + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 ); + sum_v = abs( a_v ); + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 ); + sum_v += abs( a_v ); + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 ); + sum_v += abs( a_v ); + + HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 ); + sum_v += abs( a_v ); + + uint4 sum2 = sum_v.hi + sum_v.lo; + uint2 sum3 = sum2.hi + sum2.lo; + return ( sum3.hi + sum3.lo ) >> 1; +} +#else +SATD_C_8x4_Q( satd_8x4_lp, const local, private ) +#endif + +/**************************************************************************** + * 8x8 prediction for intra luma block + ****************************************************************************/ + +#define F1 rhadd +#define F2( a, b, c ) ( a+2*b+c+2 )>>2 + +#if VECTORIZE +int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2; + pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2; + pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2; + pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + + pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2; + pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2; + pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + + pr2.s0 = ( 2 + top[2] + 
2*top[3] + top[4] ) >> 2; + pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + + pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; + pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; + pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + + pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; + pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; + + pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; + pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; + pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2; + + pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; + pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; + pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; + pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; + pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; + pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; + pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2; + pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2; + + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr3.s0 = F2( left[1], left[2], left[3] ); + pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] ); + pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top ); + pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] ); + pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] ); + pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] ); + pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] ); + pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] ); + pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] ); + pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] ); + pr0.s7 = F2( 
top[5], top[6], top[7] ); + int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); + + // Lower half of pred[] + pr3.s0 = F2( left[5], left[6], left[7] ); + pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] ); + pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] ); + pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] ); + pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] ); + pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] ); + pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top ); + pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] ); + pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] ); + pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] ); + pr0.s7 = F2( top[1], top[2], top[3] ); + return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); +} + +int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) +{ + int8 pr0, pr1, pr2, pr3; + + // Upper half of pred[] + pr2.s0 = F2( left[1], left[0], left_top ); + pr3.s0 = F2( left[2], left[1], left[0] ); + pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] ); + pr0.s0 = pr2.s1 = F1( left_top, top[0] ); + pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] ); + pr0.s1 = pr2.s2 = F1( top[0], top[1] ); + pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] ); + pr0.s2 = pr2.s3 = F1( top[1], top[2] ); + pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] ); + pr0.s3 = pr2.s4 = F1( top[2], top[3] );
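Note: all the directional predictors above reduce to the two standard H.264 intra smoothing filters named F1 and F2: a rounded 2-tap average for half-sample positions and a [1 2 1]/4 filter for full-sample positions. In plain C they look like this:

#include <stdint.h>

typedef uint8_t pixel;

/* F1: 2-tap rounded average, same as OpenCL rhadd() */
static inline pixel f1( pixel a, pixel b )
{
    return (pixel)( ((unsigned)a + b + 1) >> 1 );
}

/* F2: 3-tap [1 2 1] filter with rounding, matching ( a+2*b+c+2 )>>2 above */
static inline pixel f2( pixel a, pixel b, pixel c )
{
    return (pixel)( ((unsigned)a + 2*b + c + 2) >> 2 );
}

Each directional mode simply walks these filters along its prediction angle; the unrolled pr0..pr3 assignments above are F2 applied to successive triples of the top edge.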
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/motionsearch.cl
Added
@@ -0,0 +1,249 @@ +/* Hierarchical (iterative) OpenCL lowres motion search */ + +inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height ) +{ + /* edge macroblocks might not have a direct descendant, use nearest */ + x = select( x >> 1, (x - (mb_width&1)) >> 1, x == mb_width-1 ); + y = select( y >> 1, (y - (mb_height&1)) >> 1, y == mb_height-1 ); + return (mb_width>>1) * y + x; +} + +/* Four threads calculate an 8x8 SAD. Each does two rows */ +int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs ) +{ + frefpos.y += idx << 1; + fencpos.y += idx << 1; + int cost = 0; + if( frefpos.x < 0 ) + { + /* slow path when MV goes past left edge. The GPU clamps reads from + * (-1, 0) to (0,0), so you get pixels [0, 1, 2, 3] when what you really + * want are [0, 0, 1, 2] + */ + for( int y = 0; y < 2; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0; + pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0; + cost += abs_diff( enc, ref ); + } + } + } + else + { + uint4 enc, ref, costs = 0; + enc = read_imageui( fenc, sampler, fencpos ); + ref = read_imageui( fref, sampler, frefpos ); + costs += abs_diff( enc, ref ); + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) ); + costs += abs_diff( enc, ref ); + enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) ); + costs += abs_diff( enc, ref ); + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) ); + costs += abs_diff( enc, ref ); + cost = costs.s0 + costs.s1 + costs.s2 + costs.s3; + } + costs[idx] = cost; + return costs[0] + costs[1] + costs[2] + costs[3]; +} + +/* One thread performs 8x8 SAD */ +int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos ) +{ + if( frefpos.x < 0 ) + { + /* slow path when MV goes past left edge */ + int cost = 0; + for( int y = 0; y < 8; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0; + uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0; + cost += abs_diff( enc, ref ); + } + } + return cost; + } + else + { + uint4 enc, ref, cost = 0; + for( int y = 0; y < 8; y++ ) + { + for( int x = 0; x < 8; x += 4 ) + { + enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ); + ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ); + cost += abs_diff( enc, ref ); + } + } + return cost.s0 + cost.s1 + cost.s2 + cost.s3; + } +} +/* + * hierarchical motion estimation + * + * Each kernel launch is a single iteration + * + * MB per work group is determined by lclx / 4 * lcly + * + * global launch dimensions: [mb_width * 4, mb_height] + */ +kernel void hierarchical_motion( read_only image2d_t fenc, + read_only image2d_t fref, + const global short2 *in_mvs, + global short2 *out_mvs, + global int16_t *out_mv_costs, + global short2 *mvp_buffer, + local int16_t *cost_local, + local short2 *mvc_local, + int mb_width, + int lambda, + int me_range, + int scale, + int b_shift_index, + int b_first_iteration, + int b_reverse_references ) +{ + int mb_x = get_global_id( 0 ) >> 2; + if( mb_x >= mb_width ) + return; + int mb_height = get_global_size( 1 ); + int mb_i = get_global_id( 0 ) & 3; + int mb_y = get_global_id( 1 ); + int mb_xy = mb_y * 
mb_width + mb_x; + const int mb_size = 8; + int2 coord = (int2)(mb_x, mb_y) * mb_size; + + const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); + cost_local += 4 * mb_in_group; + + int i_mvc = 0; + mvc_local += 4 * mb_in_group; + mvc_local[mb_i] = 0; + int2 mvp =0; + + if( !b_first_iteration ) + { +#define MVC( DX, DY )\ + {\ + int px = mb_x + DX;\ + int py = mb_y + DY;\ + mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \ + in_mvs[mb_width * py + px];\ + mvc_local[i_mvc] >>= (short) scale;\ + i_mvc++;\ + } + /* Find MVP from median of MVCs */ + if( b_reverse_references ) + { + /* odd iterations: derive MVP from down and right */ + if( mb_x < mb_width - 1 ) + MVC( 1, 0 ); + if( mb_y < mb_height - 1 ) + { + MVC( 0, 1 ); + if( mb_x > b_shift_index ) + MVC( -1, 1 ); + if( mb_x < mb_width - 1 ) + MVC( 1, 1 ); + } + } + else + { + /* even iterations: derive MVP from up and left */ + if( mb_x > 0 ) + MVC( -1, 0 ); + if( mb_y > 0 ) + { + MVC( 0, -1 ); + if( mb_x < mb_width - 1 ) + MVC( 1, -1 ); + if( mb_x > b_shift_index ) + MVC( -1, -1 ); + } + } +#undef MVC + mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] ); + } + /* current mvp matches the previous mvp and we have not changed scale. We know + * we're going to arrive at the same MV again, so just copy the previous + * result to our output. */ + if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y ) + { + out_mvs[mb_xy] = in_mvs[mb_xy]; + return; + } + mvp_buffer[mb_xy] = convert_short2_sat(mvp); + int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4; + int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4; + + int2 bestmv = clamp(mvp, mv_min, mv_max); + int2 refcrd = coord + bestmv; + + /* measure cost at bestmv */ + int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) + + lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) ); + + do + { + /* measure costs at offsets from bestmv */ + refcrd = coord + bestmv + dia_offs[mb_i]; + int2 trymv = bestmv + dia_offs[mb_i]; + int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) + + lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) ); +
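Note: the MVP used as the search center is the component-wise median of up to three neighboring candidate vectors, computed without branches in x264_median_mv (defined in x264-cl.h further down) via max(min(a,b), min(max(a,b), c)). The same identity for one scalar component, as a sketch:

/* median of three by min/max composition: max(min(a,b), min(max(a,b), c)) */
static inline int median3( int a, int b, int c )
{
    int lo  = a < b ? a : b;
    int hi  = a > b ? a : b;
    int mid = hi < c ? hi : c;
    return lo > mid ? lo : mid;
}

The min/max formulation maps directly onto GPU select hardware, which is why the kernel prefers it over a comparison ladder.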
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/subpel.cl
Added
@@ -0,0 +1,242 @@ +/* OpenCL lowres subpel Refine */ + +/* Each thread performs 8x8 SAD. 4 threads per MB, so the 4 DIA HPEL offsets are + * calculated simultaneously */ +int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos ) +{ + int2 frefpos = qpos >> 2; + int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2); + uint mask_shift = 8 * hpel_idx; + + uint4 cost4 = 0; + + for( int y = 0; y < 8; y++ ) + { + uint4 enc, val4; + enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y)); + val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF; + val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF; + val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF; + val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF; + cost4 += abs_diff( enc, val4 ); + + enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y)); + val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF; + val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF; + val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF; + val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF; + cost4 += abs_diff( enc, val4 ); + } + + return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3; +} + +/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */ +int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos ) +{ + int2 frefApos = qpos >> 2; + int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2); + + int2 qposB = qpos + ((qpos & 1) << 1); + int2 frefBpos = qposB >> 2; + int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2); + + uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB; + + int cost = 0; + + for( int y = 0; y < 8; y++ ) + { + for( int x = 0; x < 8; x++ ) + { + uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0; + uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF; + uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF; + cost += abs_diff( enc, rhadd( vA, vB ) ); + } + } + + return cost; +} + +/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane + * + * Each thread collects 1/4 of the rows of diffs and processes one quarter of + * the transforms + */ +int satd_8x8_ii_qpel_coop4( read_only image2d_t fenc, + int2 fencpos, + read_only image2d_t fref_planes, + int2 qpos, + local sum2_t *tmpp, + int idx ) +{ + volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp; + sum2_t b0, b1, b2, b3; + + // fencpos is full-pel position of original MB + // qpos is qpel position within reference frame + int2 frefApos = qpos >> 2; + int hpelA = ((qpos.x&2)>>1) + (qpos.y&2); + + int2 qposB = qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1)); + int2 frefBpos = qposB >> 2; + int hpelB = ((qposB.x&2)>>1) + (qposB.y&2); + + uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB; + + uint vA, vB; + uint a0, a1; + uint enc; + sum2_t sum = 0; + +#define READ_DIFF( OUT, X )\ + enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\ + vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\ + vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) 
).s0 >> mask_shift1) & 0xFF;\ + OUT = enc - rhadd( vA, vB ); + +#define READ_DIFF_EX( OUT, a, b )\ + {\ + READ_DIFF( a0, a );\ + READ_DIFF( a1, b );\ + OUT = a0 + (a1<<BITS_PER_SUM);\ + } +#define ROW_8x4_SATD( a, b )\ + {\ + fencpos.y += a;\ + frefApos.y += b;\ + frefBpos.y += b;\ + READ_DIFF_EX( b0, 0, 4 );\ + READ_DIFF_EX( b1, 1, 5 );\ + READ_DIFF_EX( b2, 2, 6 );\ + READ_DIFF_EX( b3, 3, 7 );\ + HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\ + HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\ + sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\ + } + ROW_8x4_SATD( 0, 0 ); + ROW_8x4_SATD( 4, 4 ); + +#undef READ_DIFF +#undef READ_DIFF_EX +#undef ROW_8x4_SATD + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1; +} + +constant int2 hpoffs[4] = +{ + {0, -2}, {-2, 0}, {2, 0}, {0, 2} +}; + +/* sub pixel refinement of motion vectors, output MVs and costs are moved from + * temporary buffers into final per-frame buffer + * + * global launch dimensions: [mb_width * 4, mb_height] + * + * With X being the source 16x16 pixels, F is the lowres pixel used by the + * motion search. We will now utilize the H V and C pixels (stored in separate + * planes) to search at half-pel increments. + * + * X X X X X X + * F H F H F + * X X X X X X + * V C V C V + * X X X X X X + * F H F H F + * X X X X X X + * + * The YX HPEL bits of the motion vector selects the plane we search in. The + * four planes are packed in the fref_planes 2D image buffer. Each sample + * returns: s0 = F, s1 = H, s2 = V, s3 = C */ +kernel void subpel_refine( read_only image2d_t fenc, + read_only image2d_t fref_planes, + const global short2 *in_mvs, + const global int16_t *in_sad_mv_costs, + local int16_t *cost_local, + local sum2_t *satd_local, + local short2 *mvc_local, + global short2 *fenc_lowres_mv, + global int16_t *fenc_lowres_mv_costs, + int mb_width, + int lambda, + int b, + int ref, + int b_islist1 ) +{ + int mb_x = get_global_id( 0 ) >> 2; + if( mb_x >= mb_width ) + return; + int mb_height = get_global_size( 1 ); + + int mb_i = get_global_id( 0 ) & 3; + int mb_y = get_global_id( 1 ); + int mb_xy = mb_y * mb_width + mb_x; + + /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that + * hold many frames worth of motion vectors. We must offset into the correct + * location for this frame's vectors. The kernel will be passed the correct + * directional buffer for the direction of the search: list1 or list0 + * + * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1] + * GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count] */ + fenc_lowres_mv += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height; + fenc_lowres_mv_costs += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height; + + /* Adjust pointers into local memory buffers for this thread's data */ + int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); + cost_local += mb_in_group * 4; + satd_local += mb_in_group * 16; + mvc_local += mb_in_group * 4; + + int i_mvc = 0; + + mvc_local[0] = mvc_local[1] = mvc_local[2] = mvc_local[3] = 0; + +#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)]; + if( mb_x > 0 ) + MVC( -1, 0 ); + if( mb_y > 0 )
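Note: the plane select at the top of each function is worth spelling out. The four half-pel planes are packed one per byte of a 32-bit texel (s0 = F, s1 = H, s2 = V, s3 = C, per the diagram above), and the two HPEL bits of the motion vector pick the byte. In scalar C the lookup is just a shift and mask:

#include <stdint.h>

/* packed planes: bits 0-7 = F, 8-15 = H, 16-23 = V, 24-31 = C */
static inline unsigned hpel_plane_sample( uint32_t texel, int qpel_x, int qpel_y )
{
    /* HPEL bit of x selects H, HPEL bit of y selects V, both together C */
    int plane = ((qpel_x & 2) >> 1) + (qpel_y & 2);   /* 0, 1, 2 or 3 */
    return (texel >> (8 * plane)) & 0xFF;
}

Quarter-pel positions are then served by rhadd() of the two nearest half-pel samples, which is what the A/B position pairs in sad_8x8_ii_qpel compute.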
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/weightp.cl
Added
@@ -0,0 +1,48 @@ +/* Weightp filter a downscaled image into a temporary output buffer. + * This kernel is launched once for each scale. + * + * Launch dimensions: width x height (in pixels) + */ +kernel void weightp_scaled_images( read_only image2d_t in_plane, + write_only image2d_t out_plane, + uint offset, + uint scale, + uint denom ) +{ + int gx = get_global_id( 0 ); + int gy = get_global_id( 1 ); + uint4 input_val; + uint4 output_val; + + input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)); + output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) ); + write_imageui( out_plane, (int2)(gx, gy), output_val ); +} + +/* Weightp filter for the half-pel interpolated image + * + * Launch dimensions: width x height (in pixels) + */ +kernel void weightp_hpel( read_only image2d_t in_plane, + write_only image2d_t out_plane, + uint offset, + uint scale, + uint denom ) +{ + int gx = get_global_id( 0 ); + int gy = get_global_id( 1 ); + uint input_val; + uint output_val; + + input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0; + //Unpack + uint4 temp; + temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff; + temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff; + + temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) ); + + //Pack + output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24); + write_imageui( out_plane, (int2)(gx, gy), output_val ); +}
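Note: both kernels apply the same explicit weighted-prediction transform from H.264, out = offset + ((scale * in) >> denom); weightp_hpel merely unpacks the four byte-packed HPEL planes, weights each, and repacks. The transform for a single sample as a C sketch, with no clipping, matching the kernels:

#include <stdint.h>

/* explicit weightp: out = offset + ((scale * in) >> denom) */
static inline uint32_t weightp_sample( uint32_t in, uint32_t offset,
                                       uint32_t scale, uint32_t denom )
{
    return offset + ((scale * in) >> denom);
}

scale = 1 << denom with offset = 0 is the identity; a lower scale models a fade to black, which is the case weightp typically catches.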
View file
x264-snapshot-20130723-2245.tar.bz2/common/opencl/x264-cl.h
Added
@@ -0,0 +1,132 @@ +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable + +constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; + +/* 7.18.1.1 Exact-width integer types */ +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef short int16_t; +typedef unsigned short uint16_t; +typedef int int32_t; +typedef unsigned uint32_t; + +typedef uint8_t pixel; +typedef uint16_t sum_t; +typedef uint32_t sum2_t; + +#define LOWRES_COST_MASK ((1<<14)-1) +#define LOWRES_COST_SHIFT 14 +#define COST_MAX (1<<28) + +#define PIXEL_MAX 255 +#define BITS_PER_SUM (8 * sizeof(sum_t)) + +/* Constants for offsets into frame statistics buffer */ +#define COST_EST 0 +#define COST_EST_AQ 1 +#define INTRA_MBS 2 + +#define COPY2_IF_LT( x, y, a, b )\ + if((y)<(x))\ + {\ + (x) = (y);\ + (a) = (b);\ + } + +constant int2 dia_offs[4] = +{ + {0, -1}, {-1, 0}, {1, 0}, {0, 1}, +}; + +inline pixel x264_clip_pixel( int x ) +{ + return (pixel) clamp( x, (int) 0, (int) PIXEL_MAX ); +} + +inline int2 x264_median_mv( short2 a, short2 b, short2 c ) +{ + short2 t1 = min(a, b); + short2 t2 = min(max(a, b), c); + return convert_int2(max(t1, t2)); +} + +inline sum2_t abs2( sum2_t a ) +{ + sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1); + return (a + s) ^ s; +} + +#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\ + sum2_t t0 = s0 + s1;\ + sum2_t t1 = s0 - s1;\ + sum2_t t2 = s2 + s3;\ + sum2_t t3 = s2 - s3;\ + d0 = t0 + t2;\ + d2 = t0 - t2;\ + d1 = t1 + t3;\ + d3 = t1 - t3;\ +} + +#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\ + int2 t0 = s0 + s1;\ + int2 t1 = s0 - s1;\ + int2 t2 = s2 + s3;\ + int2 t3 = s2 - s3;\ + d0 = t0 + t2;\ + d2 = t0 - t2;\ + d1 = t1 + t3;\ + d3 = t1 - t3;\ +} + +#define SATD_C_8x4_Q( name, q1, q2 )\ + int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\ + {\ + sum2_t tmp[4][4];\ + sum2_t a0, a1, a2, a3;\ + sum2_t sum = 0;\ + for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\ + {\ + a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\ + a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\ + a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\ + a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\ + HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\ + }\ + for( int i = 0; i < 4; i++ )\ + {\ + HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\ + sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\ + }\ + return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\ + } + +/* + * Utility function to perform a parallel sum reduction of an array of integers + */ +int parallel_sum( int value, int x, volatile local int *array ) +{ + array[x] = value; + barrier( CLK_LOCAL_MEM_FENCE ); + + int dim = get_local_size( 0 ); + + while( dim > 1 ) + { + dim >>= 1; + + if( x < dim ) + array[x] += array[x + dim]; + + if( dim > 32 ) + barrier( CLK_LOCAL_MEM_FENCE ); + } + + return array[0]; +} + +int mv_cost( uint2 mvd ) +{ + float2 mvdf = (float2)(mvd.x, mvd.y) + 1.0f; + float2 cost = round( log2(mvdf) * 2.0f + 0.718f + (float2)(!!mvd.x, !!mvd.y) ); + return (int) (cost.x + cost.y); +}
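Note: mv_cost() at the end is a cheap rate model rather than a real entropy coder: the bits needed to code a motion-vector difference grow roughly logarithmically with magnitude, so each component costs about 2*log2(|mvd|+1) + 0.718 plus one unit if nonzero (the 0.718 constant is taken as-is from the kernel). A scalar C rendering of the same estimate, as a sketch:

#include <math.h>   /* link with -lm */

/* approximate coding cost of one MV component difference */
static int mv_component_bits( int mvd )
{
    double bits = log2( fabs( mvd ) + 1.0 ) * 2.0 + 0.718 + (mvd != 0);
    return (int)round( bits );
}

/* cost of a full MV delta; callers scale the result by lambda */
static int mv_cost_scalar( int mvdx, int mvdy )
{
    return mv_component_bits( mvdx ) + mv_component_bits( mvdy );
}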
View file
x264-snapshot-20130224-2245.tar.bz2/common/osdep.h -> x264-snapshot-20130723-2245.tar.bz2/common/osdep.h
Changed
@@ -79,6 +79,7 @@ #else #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n))) #endif +#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 ) #define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 ) #define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 ) #define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 ) @@ -110,9 +111,26 @@ #define EXPAND(x) x +#if HAVE_32B_STACK_ALIGNMENT +#define ALIGNED_ARRAY_32( type, name, sub1, ... )\ + ALIGNED_32( type name sub1 __VA_ARGS__ ) +#else #define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) ) +#endif + #define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) ) +/* For AVX2 */ +#if ARCH_X86 || ARCH_X86_64 +#define NATIVE_ALIGN 32 +#define ALIGNED_N ALIGNED_32 +#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32 +#else +#define NATIVE_ALIGN 16 +#define ALIGNED_N ALIGNED_16 +#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16 +#endif + #define UNINIT(x) x=x #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0) @@ -204,6 +222,25 @@ #define x264_threading_init() 0 #endif +static ALWAYS_INLINE int x264_pthread_fetch_and_add( int *val, int add, x264_pthread_mutex_t *mutex ) +{ +#if HAVE_THREAD +#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ > 0) && ARCH_X86 + return __sync_fetch_and_add( val, add ); +#else + x264_pthread_mutex_lock( mutex ); + int res = *val; + *val += add; + x264_pthread_mutex_unlock( mutex ); + return res; +#endif +#else + int res = *val; + *val += add; + return res; +#endif +} + #define WORD_SIZE sizeof(void*) #define asm __asm__ @@ -254,6 +291,13 @@ } #endif +/* For values with 4 bits or less. */ +static int ALWAYS_INLINE x264_ctz_4bit( uint32_t x ) +{ + static uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0}; + return lut[x]; +} + #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3) #define x264_clz(x) __builtin_clz(x) #define x264_ctz(x) __builtin_ctz(x)
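Note: the new x264_pthread_fetch_and_add() gives the codebase one portable atomic increment: GCC's __sync_fetch_and_add on x86, a mutex round-trip elsewhere, and plain arithmetic when built without threads. Its natural consumer is a shared counter from which each thread atomically claims the next row or macroblock. A sketch of that usage pattern with the GCC builtin; the worker scaffolding is illustrative, not x264 code:

#include <stddef.h>

static int next_row = 0;   /* shared among workers */

static void *worker( void *arg )
{
    int total_rows = *(int *)arg;
    for( ;; )
    {
        /* returns the previous value, so each row is claimed exactly once */
        int row = __sync_fetch_and_add( &next_row, 1 );
        if( row >= total_rows )
            break;
        /* ... analyse/encode row ... */
    }
    return NULL;
}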
View file
x264-snapshot-20130224-2245.tar.bz2/common/pixel.c -> x264-snapshot-20130723-2245.tar.bz2/common/pixel.c
Changed
@@ -370,7 +370,6 @@ return (sum+2)>>2; } - static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride ) { sum2_t tmp[32]; @@ -501,6 +500,7 @@ #if !HIGH_BIT_DEPTH SATD_X_DECL6( _sse2 ) SATD_X_DECL7( _ssse3 ) +SATD_X_DECL6( _ssse3_atom ) SATD_X_DECL7( _sse4 ) SATD_X_DECL7( _avx ) SATD_X_DECL7( _xop ) @@ -528,6 +528,7 @@ INTRA_MBCMP_8x8( sad,, _c ) INTRA_MBCMP_8x8(sa8d,, _c ) #if HIGH_BIT_DEPTH && HAVE_MMX +#define x264_predict_8x8_v_sse2 x264_predict_8x8_v_sse INTRA_MBCMP_8x8( sad, _mmx2, _c ) INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 ) #endif @@ -554,6 +555,9 @@ #if HAVE_MMX #if HIGH_BIT_DEPTH +#define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse +#define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse +#define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 ) @@ -841,6 +845,7 @@ if( cpu&X264_CPU_MMX2 ) { INIT7( sad, _mmx2 ); + INIT7_NAME( sad_aligned, sad, _mmx2 ); INIT7( sad_x3, _mmx2 ); INIT7( sad_x4, _mmx2 ); INIT8( satd, _mmx2 ); @@ -870,11 +875,14 @@ { INIT4_NAME( sad_aligned, sad, _sse2_aligned ); INIT5( ssd, _sse2 ); + INIT6( satd, _sse2 ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2; #endif pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; @@ -916,10 +924,14 @@ if( cpu&X264_CPU_SSSE3 ) { INIT4_NAME( sad_aligned, sad, _ssse3_aligned ); + pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_ssse3; + pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_ssse3; INIT7( sad, _ssse3 ); INIT7( sad_x3, _ssse3 ); INIT7( sad_x4, _ssse3 ); INIT_ADS( _ssse3 ); + INIT6( satd, _ssse3 ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3; if( !(cpu&X264_CPU_STACK_MOD4) ) { @@ -930,6 +942,9 @@ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3; +#endif pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3; @@ -937,16 +952,24 @@ } if( cpu&X264_CPU_SSE4 ) { + INIT6( satd, _sse4 ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse4; if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _sse4 ); } pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4; +#endif } if( cpu&X264_CPU_AVX ) { + INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs doesn't benefit from an aligned version */ INIT_ADS( _avx ); + INIT6( satd, _avx ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _avx ); @@ -959,12 +982,26 @@ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; pixf->ssim_end4 = x264_pixel_ssim_end4_avx; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx; +#endif } if( cpu&X264_CPU_XOP ) { pixf->vsad = x264_pixel_vsad_xop; pixf->asd8 = x264_pixel_asd8_xop; } + if( cpu&X264_CPU_AVX2 ) 
+ { + INIT2( ssd, _avx2 ); + INIT2( sad, _avx2 ); + INIT2_NAME( sad_aligned, sad, _avx2 ); + INIT2( sad_x3, _avx2 ); + INIT2( sad_x4, _avx2 ); + pixf->vsad = x264_pixel_vsad_avx2; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX @@ -1003,14 +1040,14 @@ INIT4( sad_x3, _cache32_mmx2 ); INIT4( sad_x4, _cache32_mmx2 ); } - else if( cpu&X264_CPU_CACHELINE_64 ) + else if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) ) { INIT5( sad, _cache64_mmx2 ); INIT4( sad_x3, _cache64_mmx2 ); INIT4( sad_x4, _cache64_mmx2 ); } #else - if( cpu&X264_CPU_CACHELINE_64 ) + if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) ) { pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2; pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmx2; @@ -1044,6 +1081,7 @@ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2; #endif pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2; @@ -1060,10 +1098,7 @@ pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; INIT6( satd_x3, _sse2 ); INIT6( satd_x4, _sse2 ); - if( !(cpu&X264_CPU_STACK_MOD4) ) - { - INIT4( hadamard_ac, _sse2 ); - } + INIT4( hadamard_ac, _sse2 ); INIT_ADS( _sse2 ); pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2; @@ -1113,9 +1148,9 @@ if( cpu&X264_CPU_SSSE3 ) { + INIT4( hadamard_ac, _ssse3 ); if( !(cpu&X264_CPU_STACK_MOD4) ) { - INIT4( hadamard_ac, _ssse3 ); pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_ssse3; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3; pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_ssse3; @@ -1124,7 +1159,20 @@ #endif } INIT_ADS( _ssse3 ); - if( !(cpu&X264_CPU_SLOW_ATOM) ) + if( cpu&X264_CPU_SLOW_ATOM ) + { + pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3_atom; + INIT6( satd, _ssse3_atom ); + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3_atom; + INIT6( satd_x3, _ssse3_atom ); + INIT6( satd_x4, _ssse3_atom ); + INIT4( hadamard_ac, _ssse3_atom ); +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom; +#endif + }
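Note: all of these hunks follow the same initialization idiom: the function-pointer table starts with C fallbacks, then each successive capability block (MMX2, SSE2, SSSE3, SSE4, AVX, and now AVX2) overwrites only the entries it can improve, so the final table holds the fastest implementation the detected CPU supports. A stripped-down sketch of the idiom; the flag bits, names, and stub bodies here are illustrative:

#include <stdint.h>

#define CPU_SSE2 (1u<<0)
#define CPU_AVX2 (1u<<1)

typedef int (*satd_fn)( const uint8_t *p1, intptr_t s1,
                        const uint8_t *p2, intptr_t s2 );

/* stand-ins for the C and asm versions */
static int satd_c( const uint8_t *p1, intptr_t s1, const uint8_t *p2, intptr_t s2 )    { return 0; }
static int satd_sse2( const uint8_t *p1, intptr_t s1, const uint8_t *p2, intptr_t s2 ) { return 1; }
static int satd_avx2( const uint8_t *p1, intptr_t s1, const uint8_t *p2, intptr_t s2 ) { return 2; }

static void pixel_init( uint32_t cpu, satd_fn *pf )
{
    *pf = satd_c;          /* portable baseline first */
    if( cpu & CPU_SSE2 )
        *pf = satd_sse2;   /* each level simply overwrites the previous */
    if( cpu & CPU_AVX2 )
        *pf = satd_avx2;
}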
View file
x264-snapshot-20130224-2245.tar.bz2/common/pixel.h -> x264-snapshot-20130723-2245.tar.bz2/common/pixel.h
Changed
@@ -90,6 +90,7 @@ x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */ int (*vsad)( pixel *, intptr_t, int ); int (*asd8)( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); + uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); uint64_t (*var[4])( pixel *pix, intptr_t stride ); int (*var2[4])( pixel *pix1, intptr_t stride1,
View file
x264-snapshot-20130224-2245.tar.bz2/common/quant.c -> x264-snapshot-20130723-2245.tar.bz2/common/quant.c
Changed
@@ -63,6 +63,19 @@ return !!nz; } +static int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ) +{ + int nza = 0; + for( int j = 0; j < 4; j++ ) + { + int nz = 0; + for( int i = 0; i < 16; i++ ) + QUANT_ONE( dct[j][i], mf[i], bias[i] ); + nza |= (!!nz)<<j; + } + return nza; +} + static int quant_4x4_dc( dctcoef dct[16], int mf, int bias ) { int nz = 0; @@ -405,6 +418,7 @@ { pf->quant_8x8 = quant_8x8; pf->quant_4x4 = quant_4x4; + pf->quant_4x4x4 = quant_4x4x4; pf->quant_4x4_dc = quant_4x4_dc; pf->quant_2x2_dc = quant_2x2_dc; @@ -442,11 +456,6 @@ pf->denoise_dct = x264_denoise_dct_mmx; pf->decimate_score15 = x264_decimate_score15_mmx2; pf->decimate_score16 = x264_decimate_score16_mmx2; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz; - pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz; - } pf->decimate_score64 = x264_decimate_score64_mmx2; pf->coeff_last8 = x264_coeff_last8_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; @@ -464,6 +473,7 @@ if( cpu&X264_CPU_SSE2 ) { pf->quant_4x4 = x264_quant_4x4_sse2; + pf->quant_4x4x4 = x264_quant_4x4x4_sse2; pf->quant_8x8 = x264_quant_8x8_sse2; pf->quant_2x2_dc = x264_quant_2x2_dc_sse2; pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; @@ -474,11 +484,6 @@ pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; pf->decimate_score64 = x264_decimate_score64_sse2; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_sse2_slowctz; - pf->decimate_score16 = x264_decimate_score16_sse2_slowctz; - } pf->coeff_last8 = x264_coeff_last8_sse2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2; @@ -501,17 +506,13 @@ if( cpu&X264_CPU_SSSE3 ) { pf->quant_4x4 = x264_quant_4x4_ssse3; + pf->quant_4x4x4 = x264_quant_4x4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3; pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; pf->decimate_score15 = x264_decimate_score15_ssse3; pf->decimate_score16 = x264_decimate_score16_ssse3; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz; - pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz; - } pf->decimate_score64 = x264_decimate_score64_ssse3; INIT_TRELLIS( ssse3 ); } @@ -520,6 +521,7 @@ pf->quant_2x2_dc = x264_quant_2x2_dc_sse4; pf->quant_4x4_dc = x264_quant_4x4_dc_sse4; pf->quant_4x4 = x264_quant_4x4_sse4; + pf->quant_4x4x4 = x264_quant_4x4x4_sse4; pf->quant_8x8 = x264_quant_8x8_sse4; } if( cpu&X264_CPU_AVX ) @@ -535,6 +537,17 @@ pf->dequant_8x8 = x264_dequant_8x8_xop; } } + if( cpu&X264_CPU_AVX2 ) + { + pf->quant_4x4 = x264_quant_4x4_avx2; + pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; + pf->quant_8x8 = x264_quant_8x8_avx2; + pf->quant_4x4x4 = x264_quant_4x4x4_avx2; + pf->dequant_4x4 = x264_dequant_4x4_avx2; + pf->dequant_8x8 = x264_dequant_8x8_avx2; + pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; + pf->denoise_dct = x264_denoise_dct_avx2; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX @@ -543,6 +556,7 @@ { #if ARCH_X86 pf->quant_4x4 = x264_quant_4x4_mmx; + pf->quant_4x4x4 = x264_quant_4x4x4_mmx; pf->quant_8x8 = x264_quant_8x8_mmx; pf->dequant_4x4 = x264_dequant_4x4_mmx; pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2; @@ -563,11 +577,6 @@ pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2; pf->decimate_score15 = x264_decimate_score15_mmx2; pf->decimate_score16 = x264_decimate_score16_mmx2; - if( 
cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz; - pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz; - } pf->decimate_score64 = x264_decimate_score64_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2; @@ -592,6 +601,7 @@ { pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; pf->quant_4x4 = x264_quant_4x4_sse2; + pf->quant_4x4x4 = x264_quant_4x4x4_sse2; pf->quant_8x8 = x264_quant_8x8_sse2; pf->dequant_4x4 = x264_dequant_4x4_sse2; pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2; @@ -606,11 +616,6 @@ pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; pf->decimate_score64 = x264_decimate_score64_sse2; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_sse2_slowctz; - pf->decimate_score16 = x264_decimate_score16_sse2_slowctz; - } pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2; @@ -631,18 +636,25 @@ pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3; pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->quant_4x4 = x264_quant_4x4_ssse3; + pf->quant_4x4x4 = x264_quant_4x4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; pf->decimate_score15 = x264_decimate_score15_ssse3; pf->decimate_score16 = x264_decimate_score16_ssse3; - if( cpu&X264_CPU_SLOW_CTZ ) - { - pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz; - pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz; - } pf->decimate_score64 = x264_decimate_score64_ssse3; INIT_TRELLIS( ssse3 ); + pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; + pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3; + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; + pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt; + } } if( cpu&X264_CPU_SSE4 ) @@ -673,6 +685,30 @@ pf->dequant_8x8 = x264_dequant_8x8_xop; } } + + if( cpu&X264_CPU_AVX2 ) + { + pf->quant_4x4 = x264_quant_4x4_avx2; + pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; + pf->quant_8x8 = x264_quant_8x8_avx2; + pf->quant_4x4x4 = x264_quant_4x4x4_avx2; + pf->dequant_4x4 = x264_dequant_4x4_avx2; + pf->dequant_8x8 = x264_dequant_8x8_avx2; + pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; + if( h->param.i_cqm_preset == X264_CQM_FLAT ) + { + pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2; + pf->dequant_8x8 = x264_dequant_8x8_flat16_avx2; + } + pf->decimate_score64 = x264_decimate_score64_avx2; + pf->denoise_dct = x264_denoise_dct_avx2; + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
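Note: the new quant_4x4x4() quantizes the four 4x4 sub-blocks of an 8x8 region in one call and packs the result into a 4-bit mask, with bit j set when sub-block j kept any nonzero level, so callers can test (nz_mask >> j) & 1 and skip coding empty sub-blocks without rescanning. The per-coefficient step (QUANT_ONE in quant.c, not shown in this hunk) is a deadzone quantizer; a scalar sketch of that shape, with typedefs simplified:

#include <stdint.h>

typedef int16_t  dctcoef;
typedef uint16_t udctcoef;

/* deadzone quantizer for one coefficient: add bias to the magnitude,
 * scale by mf, shift, restore the sign; returns a nonzero flag */
static inline int quant_one( dctcoef *coef, udctcoef mf, udctcoef bias )
{
    if( *coef > 0 )
        *coef = (dctcoef)( (bias + *coef) * mf >> 16 );
    else
        *coef = (dctcoef)( -((bias - *coef) * mf >> 16) );
    return *coef != 0;
}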
View file
x264-snapshot-20130224-2245.tar.bz2/common/quant.h -> x264-snapshot-20130723-2245.tar.bz2/common/quant.h
Changed
@@ -29,8 +29,9 @@ typedef struct { - int (*quant_8x8)( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); - int (*quant_4x4)( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); + int (*quant_8x8) ( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); + int (*quant_4x4) ( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); + int (*quant_4x4x4)( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int (*quant_4x4_dc)( dctcoef dct[16], int mf, int bias ); int (*quant_2x2_dc)( dctcoef dct[4], int mf, int bias );
View file
x264-snapshot-20130224-2245.tar.bz2/common/set.c -> x264-snapshot-20130723-2245.tar.bz2/common/set.c
Changed
@@ -85,44 +85,49 @@ int max_qp_err = -1; int max_chroma_qp_err = -1; int min_qp_err = QP_MAX+1; - int num_8x8_lists = h->sps->i_chroma_format_idc == CHROMA_444 ? 4 : 2; /* Checkasm may segfault if optimized out by --chroma-format */ + int num_8x8_lists = h->sps->i_chroma_format_idc == CHROMA_444 ? 4 + : h->param.analyse.b_transform_8x8 ? 2 : 0; /* Checkasm may segfault if optimized out by --chroma-format */ - for( int i = 0; i < 4 + num_8x8_lists; i++ ) - { - int size = i<4 ? 16 : 64; - int j; - for( j = (i<4 ? 0 : 4); j < i; j++ ) - if( !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) ) - break; - if( j < i ) - { - h-> quant4_mf[i] = h-> quant4_mf[j]; - h->dequant4_mf[i] = h->dequant4_mf[j]; - h->unquant4_mf[i] = h->unquant4_mf[j]; - } - else - { - CHECKED_MALLOC( h-> quant4_mf[i], (QP_MAX+1)*size*sizeof(udctcoef) ); - CHECKED_MALLOC( h->dequant4_mf[i], 6*size*sizeof(int) ); - CHECKED_MALLOC( h->unquant4_mf[i], (QP_MAX+1)*size*sizeof(int) ); - } - - for( j = (i<4 ? 0 : 4); j < i; j++ ) - if( deadzone[j&3] == deadzone[i&3] && - !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) ) - break; - if( j < i ) - { - h->quant4_bias[i] = h->quant4_bias[j]; - h->quant4_bias0[i] = h->quant4_bias0[j]; - } - else - { - CHECKED_MALLOC( h->quant4_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) ); - CHECKED_MALLOC( h->quant4_bias0[i], (QP_MAX+1)*size*sizeof(udctcoef) ); - } +#define CQM_ALLOC( w, count )\ + for( int i = 0; i < count; i++ )\ + {\ + int size = w*w;\ + int start = w == 8 ? 4 : 0;\ + int j;\ + for( j = 0; j < i; j++ )\ + if( !memcmp( h->pps->scaling_list[i+start], h->pps->scaling_list[j+start], size*sizeof(uint8_t) ) )\ + break;\ + if( j < i )\ + {\ + h-> quant##w##_mf[i] = h-> quant##w##_mf[j];\ + h->dequant##w##_mf[i] = h->dequant##w##_mf[j];\ + h->unquant##w##_mf[i] = h->unquant##w##_mf[j];\ + }\ + else\ + {\ + CHECKED_MALLOC( h-> quant##w##_mf[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + CHECKED_MALLOC( h->dequant##w##_mf[i], 6*size*sizeof(int) );\ + CHECKED_MALLOC( h->unquant##w##_mf[i], (QP_MAX+1)*size*sizeof(int) );\ + }\ + for( j = 0; j < i; j++ )\ + if( deadzone[j] == deadzone[i] &&\ + !memcmp( h->pps->scaling_list[i+start], h->pps->scaling_list[j+start], size*sizeof(uint8_t) ) )\ + break;\ + if( j < i )\ + {\ + h->quant##w##_bias[i] = h->quant##w##_bias[j];\ + h->quant##w##_bias0[i] = h->quant##w##_bias0[j];\ + }\ + else\ + {\ + CHECKED_MALLOC( h->quant##w##_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + CHECKED_MALLOC( h->quant##w##_bias0[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + }\ } + CQM_ALLOC( 4, 4 ) + CQM_ALLOC( 8, num_8x8_lists ) + for( int q = 0; q < 6; q++ ) { for( int i = 0; i < 16; i++ ) @@ -204,6 +209,9 @@ for( int cat = 0; cat < 3 + CHROMA444; cat++ ) { int dct8x8 = cat&1; + if( !h->param.analyse.b_transform_8x8 && dct8x8 ) + continue; + int size = dct8x8 ? 64 : 16; udctcoef *nr_offset = h->nr_offset_emergency[q][cat]; /* Denoise chroma first (due to h264's chroma QP offset), then luma, then DC. */
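Note: the CQM_ALLOC macro folds the old open-coded loop into a reusable pattern: before allocating quant/dequant tables for scaling list i, memcmp() it against every earlier list of the same size and, on a match, alias the already-allocated pointers instead of allocating again. The dedup pattern in isolation, as a sketch:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* allocate one table per distinct 16-byte key; duplicates share storage.
 * The caller must free only the first occurrence of each distinct key. */
static int alloc_dedup( const uint8_t keys[][16], void *tables[],
                        int count, size_t table_size )
{
    for( int i = 0; i < count; i++ )
    {
        int j;
        for( j = 0; j < i; j++ )
            if( !memcmp( keys[i], keys[j], 16 ) )
                break;
        if( j < i )
            tables[i] = tables[j];             /* alias, no new allocation */
        else if( !(tables[i] = malloc( table_size )) )
            return -1;
    }
    return 0;
}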
View file
x264-snapshot-20130224-2245.tar.bz2/common/win32thread.c -> x264-snapshot-20130723-2245.tar.bz2/common/win32thread.c
Changed
@@ -279,7 +279,7 @@
     memset( &thread_control, 0, sizeof(x264_win32thread_control_t) );
 }

-int x264_pthread_num_processors_np()
+int x264_pthread_num_processors_np( void )
 {
     DWORD_PTR system_cpus, process_cpus = 0;
     int cpus = 0;
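The only change here is adding void to the parameter list. In C (before C23), an empty list in a function declaration is an old-style form that leaves the parameters unspecified and unchecked at call sites; ( void ) makes the zero-argument prototype explicit. A minimal illustration:

/* In C, an empty parameter list is an old-style declaration. */
int f() { return 0; }      /* callers' arguments are not type-checked */
int g(void) { return 0; }  /* explicit prototype: takes no arguments  */

int main(void)
{
    f(1, 2);   /* accepted under the old-style declaration */
    g();       /* g(1) here would be a compile-time error  */
    return 0;
}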
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/bitstream-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/bitstream-a.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2010-2013 x264 project ;* ;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -32,100 +32,105 @@ ;----------------------------------------------------------------------------- ; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end ) ;----------------------------------------------------------------------------- - %macro NAL_LOOP 2 -%1_escape: +%%escape: ; Detect false positive to avoid unneccessary escape loop xor r3d, r3d cmp byte [r0+r1-1], 0 setnz r3b - xor r3d, r4d + xor k3, k4 jnz .escape - jmp %1_continue + jmp %%continue ALIGN 16 %1: - pcmpeqb m3, m1, m4 - pcmpeqb m2, m0, m4 - pmovmskb r3d, m3 - %2 [r0+r1], m0 + mova [r0+r1+mmsize], m1 + pcmpeqb m1, m0 + mova [r0+r1], m2 + pcmpeqb m2, m0 + pmovmskb r3d, m1 + %2 m1, [r1+r2+3*mmsize] pmovmskb r4d, m2 - shl r3d, mmsize - mova m0, [r1+r2+2*mmsize] - or r4d, r3d - %2 [r0+r1+mmsize], m1 - lea r3d, [r4+r4+1] - mova m1, [r1+r2+3*mmsize] - and r4d, r3d - jnz %1_escape -%1_continue: + %2 m2, [r1+r2+2*mmsize] + shl k3, mmsize + or k3, k4 + lea k4, [2*r3+1] + and k4, k3 + jnz %%escape +%%continue: add r1, 2*mmsize jl %1 %endmacro %macro NAL_ESCAPE 0 +%if mmsize == 32 + %xdefine k3 r3 + %xdefine k4 r4 +%else + %xdefine k3 r3d + %xdefine k4 r4d +%endif cglobal nal_escape, 3,5 - mov r3w, [r1] + movzx r3d, byte [r1] sub r1, r2 ; r1 = offset of current src pointer from end of src - pxor m4, m4 + pxor m0, m0 + mov [r0], r3b sub r0, r1 ; r0 = projected end of dst, assuming no more escapes - mov [r0+r1], r3w - add r1, 2 - jge .ret + or r3d, 0xffffff00 ; ignore data before src - ; Start off by jumping into the escape loop in - ; case there's an escape at the start. - ; And do a few more in scalar until src is aligned again. - jmp .first_escape + ; Start off by jumping into the escape loop in case there's an escape at the start. + ; And do a few more in scalar until dst is aligned. + jmp .escape_loop +%if mmsize == 16 NAL_LOOP .loop_aligned, mova -%if mmsize==16 jmp .ret - NAL_LOOP .loop_unaligned, movu %endif + NAL_LOOP .loop_unaligned, movu .ret: movifnidn rax, r0 RET -ALIGN 16 .escape: ; Skip bytes that are known to be valid - and r4d, r3d - tzcnt r3d, r4d - add r1, r3 + and k4, k3 + tzcnt k4, k4 + xor r3d, r3d ; the last two bytes are known to be zero + add r1, r4 .escape_loop: inc r1 jge .ret -.first_escape: - movzx r3d, byte [r1+r2] - lea r4, [r1+r2] - cmp r3d, 3 - jna .escape_check -.no_escape: + movzx r4d, byte [r1+r2] + shl r3d, 8 + or r3d, r4d + test r3d, 0xfffffc ; if the last two bytes are 0 and the current byte is <=3 + jz .add_escape_byte +.escaped: + lea r4d, [r0+r1] mov [r0+r1], r3b - test r4d, mmsize-1 ; Do SIMD when src is aligned + test r4d, mmsize-1 ; Do SIMD when dst is aligned jnz .escape_loop - mova m0, [r4] - mova m1, [r4+mmsize] -%if mmsize==16 - lea r4d, [r0+r1] + movu m1, [r1+r2+mmsize] + movu m2, [r1+r2] +%if mmsize == 16 + lea r4d, [r1+r2] test r4d, mmsize-1 - jnz .loop_unaligned + jz .loop_aligned %endif - jmp .loop_aligned + jmp .loop_unaligned -ALIGN 16 -.escape_check: - cmp word [r0+r1-2], 0 - jnz .no_escape +.add_escape_byte: mov byte [r0+r1], 3 - inc r0 - jmp .no_escape + inc r0 + or r3d, 0x0300 + jmp .escaped %endmacro INIT_MMX mmx2 NAL_ESCAPE INIT_XMM sse2 NAL_ESCAPE -INIT_XMM avx +%if ARCH_X86_64 +INIT_YMM avx2 NAL_ESCAPE +%endif
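nal_escape implements H.264 Annex B start-code emulation prevention: whenever two zero bytes would be followed by a byte <= 3, an 0x03 escape byte is inserted so the payload can never contain a 00 00 0x sequence that looks like a start code. The rewritten asm above vectorizes the common no-escape case; the transform itself reduces to this plain-C sketch (close in spirit to the C fallback in x264's bitstream code, not copied from it):

#include <stdint.h>

/* Copy [src,end) to dst, inserting an emulation-prevention 0x03 after
 * every pair of zero bytes that precedes a byte <= 0x03.
 * Returns the new end of dst. */
static uint8_t *nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
{
    if( src < end ) *dst++ = *src++;   /* the first two bytes can */
    if( src < end ) *dst++ = *src++;   /* never need an escape    */
    while( src < end )
    {
        if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
            *dst++ = 0x03;
        *dst++ = *src++;
    }
    return dst;
}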
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/cabac-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/cabac-a.asm
Changed
@@ -26,22 +26,69 @@ ;***************************************************************************** %include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +coeff_abs_level1_ctx: db 1, 2, 3, 4, 0, 0, 0, 0 +coeff_abs_levelgt1_ctx: db 5, 5, 5, 5, 6, 7, 8, 9 +coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7 + db 4, 4, 4, 4, 5, 6, 7, 7 + +%if ARCH_X86_64 +%macro COEFF_LAST_TABLE 17 + %define funccpu1 %1 + %define funccpu2 %2 + %define funccpu3 %3 + %rep 14 + %ifidn %4, 4 + dq mangle(x264_coeff_last%4_ %+ funccpu1) + %elifidn %4, 64 + dq mangle(x264_coeff_last%4_ %+ funccpu2) + %else + dq mangle(x264_coeff_last%4_ %+ funccpu3) + %endif + %rotate 1 + %endrep +%endmacro + +cextern coeff_last4_mmx2 +cextern coeff_last4_mmx2_lzcnt +cextern coeff_last15_sse2 +cextern coeff_last15_sse2_lzcnt +cextern coeff_last16_sse2 +cextern coeff_last16_sse2_lzcnt +cextern coeff_last64_sse2 +cextern coeff_last64_sse2_lzcnt +cextern coeff_last64_avx2_lzcnt + +%ifdef PIC +SECTION .data +%endif +coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +%endif SECTION .text cextern cabac_range_lps cextern cabac_transition cextern cabac_renorm_shift +cextern cabac_entropy +cextern cabac_size_unary +cextern cabac_transition_unary +cextern significant_coeff_flag_offset +cextern significant_coeff_flag_offset_8x8 +cextern last_coeff_flag_offset +cextern last_coeff_flag_offset_8x8 +cextern coeff_abs_level_m1_offset +cextern count_cat_m1 +cextern cabac_encode_ue_bypass -; t3 must be ecx, since it's used for shift. -%if WIN64 - DECLARE_REG_TMP 3,1,2,0,6,5,4,2 - %define pointer resq -%elif ARCH_X86_64 - DECLARE_REG_TMP 0,1,2,3,4,5,6,6 +%if ARCH_X86_64 %define pointer resq %else - DECLARE_REG_TMP 0,4,2,1,3,5,6,2 %define pointer resd %endif @@ -58,24 +105,34 @@ .state: resb 1024 endstruc -%macro LOAD_GLOBAL 4 +%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp %ifdef PIC - ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea - lea r7, [%2] - %ifnidn %3, 0 - add r7, %3 + %ifidn %4, 0 + movzx %1, byte [%2+%3+r7-$$] + %else + lea %5, [r7+%4] + movzx %1, byte [%2+%3+%5-$$] %endif - movzx %1, byte [r7+%4] %else movzx %1, byte [%2+%3+%4] %endif %endmacro -cglobal cabac_encode_decision_asm, 0,7 - movifnidn t0, r0mp +%macro CABAC 1 +; t3 must be ecx, since it's used for shift. 
+%if WIN64 + DECLARE_REG_TMP 3,1,2,0,5,6,4,4 +%elif ARCH_X86_64 + DECLARE_REG_TMP 0,1,2,3,4,5,6,6 +%else + DECLARE_REG_TMP 0,4,2,1,3,5,6,2 +%endif + +cglobal cabac_encode_decision_%1, 1,7 movifnidn t1d, r1m - mov t5d, [t0+cb.range] - movzx t6d, byte [t0+cb.state+t1] + mov t5d, [r0+cb.range] + movzx t6d, byte [r0+cb.state+t1] + movifnidn t0, r0 ; WIN64 mov t4d, ~1 mov t3d, t5d and t4d, t6d @@ -84,8 +141,11 @@ %if WIN64 PUSH r7 %endif - LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2 - LOAD_GLOBAL t4d, cabac_transition, t2, t6*2 +%ifdef PIC + lea r7, [$$] +%endif + LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4 + LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4 and t6d, 1 sub t3d, t5d cmp t6d, t2d @@ -96,66 +156,82 @@ mov [t0+cb.state+t1], t4b ;cabac_encode_renorm mov t4d, t3d +%ifidn %1, bmi2 + lzcnt t3d, t3d + sub t3d, 23 + shlx t4d, t4d, t3d + shlx t6d, t6d, t3d +%else shr t3d, 3 - LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3 + LOAD_GLOBAL t3d, cabac_renorm_shift, t3 + shl t4d, t3b + shl t6d, t3b +%endif %if WIN64 POP r7 %endif - shl t4d, t3b - shl t6d, t3b mov [t0+cb.range], t4d add t3d, [t0+cb.queue] - jge cabac_putbyte + jge cabac_putbyte_%1 .update_queue_low: mov [t0+cb.low], t6d mov [t0+cb.queue], t3d RET -cglobal cabac_encode_bypass_asm, 0,3 - movifnidn t0, r0mp - movifnidn t3d, r1m - mov t7d, [t0+cb.low] - and t3d, [t0+cb.range] - lea t7d, [t7*2+t3] - mov t3d, [t0+cb.queue] +cglobal cabac_encode_bypass_%1, 2,3 + mov t7d, [r0+cb.low] + and r1d, [r0+cb.range] + lea t7d, [t7*2+r1] + movifnidn t0, r0 ; WIN64 + mov t3d, [r0+cb.queue] inc t3d -%if UNIX64 ; .putbyte compiles to nothing but a jmp - jge cabac_putbyte +%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp + jge cabac_putbyte_%1 %else jge .putbyte %endif mov [t0+cb.low], t7d mov [t0+cb.queue], t3d RET +%if ARCH_X86_64 == 0 .putbyte: PROLOGUE 0,7 movifnidn t6d, t7d - jmp cabac_putbyte + jmp cabac_putbyte_%1 +%endif -cglobal cabac_encode_terminal_asm, 0,3 - movifnidn t0, r0mp
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/const-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/const-a.asm
Changed
@@ -26,39 +26,53 @@ %include "x86inc.asm" -SECTION_RODATA +SECTION_RODATA 32 + +const pb_1, times 32 db 1 +const hsub_mul, times 16 db 1, -1 +const pw_1, times 16 dw 1 +const pw_16, times 16 dw 16 +const pw_32, times 16 dw 32 +const pw_512, times 16 dw 512 +const pw_00ff, times 16 dw 0x00ff +const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1) +const pd_1, times 8 dd 1 +const deinterleave_shufd, dd 0,4,1,5,2,6,3,7 +const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 +const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 const pb_01, times 8 db 0,1 const pb_0, times 16 db 0 const pb_a1, times 16 db 0xa1 -const pb_1, times 16 db 1 const pb_3, times 16 db 3 -const hsub_mul, times 8 db 1, -1 const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 -const pw_1, times 8 dw 1 const pw_2, times 8 dw 2 const pw_m2, times 8 dw -2 const pw_4, times 8 dw 4 const pw_8, times 8 dw 8 -const pw_16, times 8 dw 16 -const pw_32, times 8 dw 32 const pw_64, times 8 dw 64 +const pw_256, times 8 dw 256 const pw_32_0, times 4 dw 32, times 4 dw 0 const pw_8000, times 8 dw 0x8000 const pw_3fff, times 8 dw 0x3fff -const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1) const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1 const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1 const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0 -const pd_1, times 4 dd 1 const pd_32, times 4 dd 32 const pd_1024, times 4 dd 1024 const pd_ffff, times 4 dd 0xffff -const pw_00ff, times 8 dw 0x00ff const pw_ff00, times 8 dw 0xff00 +const popcnt_table +%assign x 0 +%rep 256 +; population count +db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1) +%assign x x+1 +%endrep + const sw_64, dd 64
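The new popcnt_table is a 256-entry byte table whose entry x is the number of set bits in x; the %rep loop computes each entry at assembly time with the shift-and-mask expression in the db line. The same table built in C, for reference:

#include <stdint.h>

static uint8_t popcnt_table[256];

static void init_popcnt_table( void )
{
    for( int x = 0; x < 256; x++ )
    {
        uint8_t n = 0;
        for( int b = 0; b < 8; b++ )   /* same ((x>>b)&1) terms as the db line */
            n += (x >> b) & 1;
        popcnt_table[x] = n;
    }
}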
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/cpu-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/cpu-a.asm
Changed
@@ -66,7 +66,27 @@ mov [r4], edx RET -%if ARCH_X86_64 == 0 +%if ARCH_X86_64 + +;----------------------------------------------------------------------------- +; void stack_align( void (*func)(void*), void *arg ); +;----------------------------------------------------------------------------- +cglobal stack_align + push rbp + mov rbp, rsp +%if WIN64 + sub rsp, 32 ; shadow space +%endif + and rsp, ~31 + mov rax, r0 + mov r0, r1 + mov r1, r2 + mov r2, r3 + call rax + leave + ret + +%else ;----------------------------------------------------------------------------- ; int cpu_cpuid_test( void ) @@ -94,14 +114,11 @@ popfd ret -;----------------------------------------------------------------------------- -; void stack_align( void (*func)(void*), void *arg ); -;----------------------------------------------------------------------------- cglobal stack_align push ebp mov ebp, esp sub esp, 12 - and esp, ~15 + and esp, ~31 mov ecx, [ebp+8] mov edx, [ebp+12] mov [esp], edx @@ -165,7 +182,10 @@ %endif push rbp mov rbp, rsp - and rsp, ~15 +%if WIN64 + sub rsp, 32 ; shadow space +%endif + and rsp, ~31 call intel_cpu_indicator_init leave %if ARCH_X86_64
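Two ABI details drive this hunk: the alignment mask grows from ~15 to ~31 so that 256-bit YMM spills in the called function can use aligned loads and stores, and on Win64 the caller must reserve 32 bytes of shadow space above the return address before a call. The masking itself is ordinary round-down arithmetic, shown here as a small C check:

#include <assert.h>
#include <stdint.h>

int main( void )
{
    char probe;
    uintptr_t sp = (uintptr_t)&probe;        /* stand-in for the incoming rsp */
    uintptr_t aligned = sp & ~(uintptr_t)31; /* the asm's "and rsp, ~31"      */
    assert( aligned % 32 == 0 && sp - aligned < 32 );
    return 0;
}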
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/dct-64.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/dct-64.asm
Changed
@@ -311,6 +311,42 @@ INIT_XMM xop DCT_SUB8 +INIT_YMM avx2 +cglobal sub16x16_dct8, 3,3,10 + add r0, 128 + add r2, 4*FDEC_STRIDE + call .sub16x8_dct8 + add r0, 256 + add r1, FENC_STRIDE*8 + add r2, FDEC_STRIDE*8 + call .sub16x8_dct8 + RET +.sub16x8_dct8: + LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1 + LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3 + LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5 + LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7 + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 + TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 + DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 + mova [r0-0x80+0x00], xm0 + vextracti128 [r0+0x00], m0, 1 + mova [r0-0x80+0x10], xm1 + vextracti128 [r0+0x10], m1, 1 + mova [r0-0x80+0x20], xm2 + vextracti128 [r0+0x20], m2, 1 + mova [r0-0x80+0x30], xm3 + vextracti128 [r0+0x30], m3, 1 + mova [r0-0x80+0x40], xm4 + vextracti128 [r0+0x40], m4, 1 + mova [r0-0x80+0x50], xm5 + vextracti128 [r0+0x50], m5, 1 + mova [r0-0x80+0x60], xm6 + vextracti128 [r0+0x60], m6, 1 + mova [r0-0x80+0x70], xm7 + vextracti128 [r0+0x70], m7, 1 + ret + ;----------------------------------------------------------------------------- ; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- @@ -390,4 +426,5 @@ ADD8x8 INIT_XMM avx ADD8x8 + %endif ; !HIGH_BIT_DEPTH
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/dct-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/dct-a.asm
Changed
@@ -30,7 +30,7 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15 @@ -39,8 +39,6 @@ pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7 pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9 pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15 -pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 -pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 pb_scan8framet1: SHUFFLE_MASK_W 0, 1, 6, 7, 8, 9, 13, 14 pb_scan8framet2: SHUFFLE_MASK_W 2 , 3, 4, 7, 9, 15, 10, 14 @@ -74,6 +72,7 @@ cextern pw_32_0 cextern pw_32 +cextern pw_512 cextern pw_8000 cextern pw_pixel_max cextern hsub_mul @@ -83,6 +82,9 @@ cextern pd_32 cextern pw_ppppmmmm cextern pw_pmpmpmpm +cextern deinterleave_shufd +cextern pb_unpackbd1 +cextern pb_unpackbd2 %macro WALSH4_1D 6 SUMSUB_BADC %1, %5, %4, %3, %2, %6 @@ -377,6 +379,135 @@ ADD4x4 INIT_XMM avx ADD4x4 + +%macro STOREx2_AVX2 9 + movq xm%3, [r0+%5*FDEC_STRIDE] + vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1 + movq xm%4, [r0+%7*FDEC_STRIDE] + vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1 + punpcklbw m%3, m%9 + punpcklbw m%4, m%9 + psraw m%1, 6 + psraw m%2, 6 + paddsw m%1, m%3 + paddsw m%2, m%4 + packuswb m%1, m%2 + vextracti128 xm%2, m%1, 1 + movq [r0+%5*FDEC_STRIDE], xm%1 + movq [r0+%6*FDEC_STRIDE], xm%2 + movhps [r0+%7*FDEC_STRIDE], xm%1 + movhps [r0+%8*FDEC_STRIDE], xm%2 +%endmacro + +INIT_YMM avx2 +cglobal add8x8_idct, 2,3,8 + add r0, 4*FDEC_STRIDE + pxor m7, m7 + TAIL_CALL .skip_prologue, 0 +global current_function %+ .skip_prologue +.skip_prologue: + ; TRANSPOSE4x4Q + mova xm0, [r1+ 0] + mova xm1, [r1+32] + mova xm2, [r1+16] + mova xm3, [r1+48] + vinserti128 m0, m0, [r1+ 64], 1 + vinserti128 m1, m1, [r1+ 96], 1 + vinserti128 m2, m2, [r1+ 80], 1 + vinserti128 m3, m3, [r1+112], 1 + SBUTTERFLY qdq, 0, 1, 4 + SBUTTERFLY qdq, 2, 3, 4 + IDCT4_1D w,0,1,2,3,4,5 + TRANSPOSE2x4x4W 0,1,2,3,4 + paddw m0, [pw_32] + IDCT4_1D w,0,1,2,3,4,5 + STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7 + STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7 + ret + +; 2xdst, 2xtmp, 4xsrcrow, 1xzero +%macro LOAD_DIFF8x2_AVX2 9 + movq xm%1, [r1+%5*FENC_STRIDE] + movq xm%2, [r1+%6*FENC_STRIDE] + vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1 + vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1 + punpcklbw m%1, m%9 + punpcklbw m%2, m%9 + movq xm%3, [r2+(%5-4)*FDEC_STRIDE] + movq xm%4, [r2+(%6-4)*FDEC_STRIDE] + vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1 + vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1 + punpcklbw m%3, m%9 + punpcklbw m%4, m%9 + psubw m%1, m%3 + psubw m%2, m%4 +%endmacro + +; 4x src, 1x tmp +%macro STORE8_DCT_AVX2 5 + SBUTTERFLY qdq, %1, %2, %5 + SBUTTERFLY qdq, %3, %4, %5 + mova [r0+ 0], xm%1 + mova [r0+ 16], xm%3 + mova [r0+ 32], xm%2 + mova [r0+ 48], xm%4 + vextracti128 [r0+ 64], m%1, 1 + vextracti128 [r0+ 80], m%3, 1 + vextracti128 [r0+ 96], m%2, 1 + vextracti128 [r0+112], m%4, 1 +%endmacro + +%macro STORE16_DCT_AVX2 5 + SBUTTERFLY qdq, %1, %2, %5 + SBUTTERFLY qdq, %3, %4, %5 + mova [r0+ 0-128], xm%1 + mova [r0+16-128], xm%3 + mova [r0+32-128], xm%2 + mova [r0+48-128], xm%4 + vextracti128 [r0+ 0], m%1, 1 + vextracti128 [r0+16], m%3, 1 + vextracti128 [r0+32], m%2, 1 + vextracti128 [r0+48], m%4, 1 +%endmacro + +INIT_YMM avx2 +cglobal sub8x8_dct, 3,3,7 + pxor m6, m6 + add r2, 4*FDEC_STRIDE + LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6 + LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6 + DCT4_1D 0, 1, 2, 3, 4 + TRANSPOSE2x4x4W 0, 1, 2, 3, 4 + 
DCT4_1D 0, 1, 2, 3, 4 + STORE8_DCT_AVX2 0, 1, 2, 3, 4 + RET + +INIT_YMM avx2 +cglobal sub16x16_dct, 3,3,6 + add r0, 128 + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + add r0, 64 + add r1, 4*FENC_STRIDE + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + add r0, 256-64 + add r1, 4*FENC_STRIDE + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + add r0, 64 + add r1, 4*FENC_STRIDE + add r2, 4*FDEC_STRIDE + call .sub16x4_dct + RET +.sub16x4_dct: + LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1 + LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3 + DCT4_1D 0, 1, 2, 3, 4 + TRANSPOSE2x4x4W 0, 1, 2, 3, 4 + DCT4_1D 0, 1, 2, 3, 4 + STORE16_DCT_AVX2 0, 1, 2, 3, 4 + ret %endif ; HIGH_BIT_DEPTH INIT_MMX @@ -422,7 +553,7 @@ cglobal %1, 2,2,11 pxor m7, m7 %endif -%if mmsize==16 && %3!=256 +%if mmsize>=16 && %3!=256 add r0, 4*FDEC_STRIDE %endif .skip_prologue: @@ -497,6 +628,9 @@ SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11 SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11 SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11 + +INIT_YMM +ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0 %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH @@ -607,10 +741,9 @@ movh m0, [r1] pxor m1, m1 add r0, FDEC_STRIDE*4 - paddw m0, [pw_32] - psraw m0, 6 + pmulhrsw m0, [pw_512] psubw m1, m0 - mova m5, [pb_idctdc_unpack]
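Note the idct_dc rounding change at the end of this diff: paddw with pw_32 followed by psraw 6 becomes a single pmulhrsw against the new pw_512 constant. pmulhrsw computes (a*b*2 + 0x8000) >> 16 per 16-bit lane, and with b = 512 that is exactly (a + 32) >> 6, so the substitution is bit-exact. A scalar check, assuming arithmetic right shift of negative ints (true on the targets x264 supports):

#include <assert.h>
#include <stdint.h>

/* One lane of pmulhrsw: round(a*b / 32768) in fixed point. */
static int16_t pmulhrsw_lane( int16_t a, int16_t b )
{
    return (int16_t)( ((int32_t)a * b * 2 + 0x8000) >> 16 );
}

int main( void )
{
    for( int a = -32768; a <= 32767; a++ )
        assert( pmulhrsw_lane( (int16_t)a, 512 ) == (int16_t)((a + 32) >> 6) );
    return 0;
}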
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/dct.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/dct.h
Changed
@@ -40,6 +40,8 @@ void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); @@ -56,14 +58,17 @@ void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] ); void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] ); void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] ); +void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] ); void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] ); void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] ); +void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] ); void x264_add8x8_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [ 4] ); void x264_add16x16_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [16] ); void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] ); void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] ); void x264_add8x8_idct_dc_avx ( pixel *p_dst, dctcoef dct [ 4] ); void x264_add16x16_idct_dc_avx ( pixel *p_dst, dctcoef dct [16] ); +void x264_add16x16_idct_dc_avx2 ( uint8_t *p_dst, int16_t dct [16] ); void x264_dct4x4dc_mmx ( int16_t d[16] ); void x264_dct4x4dc_sse2 ( int32_t d[16] ); @@ -82,6 +87,7 @@ void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 ); void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 ); void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); +void x264_sub16x16_dct8_avx2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] ); @@ -118,5 +124,6 @@ void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz ); void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz ); void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz ); #endif
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/deblock-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/deblock-a.asm
Changed
@@ -28,8 +28,10 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 +load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15 +insert_top_shuf: dd 0,1,4,5,7,2,3,6 transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15 SECTION .text @@ -42,6 +44,7 @@ cextern pw_4 cextern pw_00ff cextern pw_pixel_max +cextern pb_unpackbd1 %if HIGH_BIT_DEPTH ; out: %4 = |%1-%2|-%3 @@ -162,14 +165,12 @@ ;----------------------------------------------------------------------------- ; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_luma, 5,5,8 - %assign pad 5*mmsize+12-(stack_offset&15) +cglobal deblock_v_luma, 5,5,8,0-5*mmsize %define tcm [rsp] %define ms1 [rsp+mmsize] %define ms2 [rsp+mmsize*2] %define am [rsp+mmsize*3] %define bm [rsp+mmsize*4] - SUB rsp, pad add r1, r1 LOAD_AB m4, m5, r2d, r3d mov r3, 32/mmsize @@ -213,11 +214,9 @@ add r4, mmsize/8 dec r3 jg .loop - ADD rsp, pad RET -cglobal deblock_h_luma, 5,6,8 - %assign pad 7*mmsize+12-(stack_offset&15) +cglobal deblock_h_luma, 5,6,8,0-7*mmsize %define tcm [rsp] %define ms1 [rsp+mmsize] %define ms2 [rsp+mmsize*2] @@ -225,7 +224,6 @@ %define p2m [rsp+mmsize*4] %define am [rsp+mmsize*5] %define bm [rsp+mmsize*6] - SUB rsp, pad add r1, r1 LOAD_AB m4, m5, r2d, r3d mov r3, r1 @@ -302,7 +300,6 @@ lea r2, [r2+r1*(mmsize/2)] dec r5 jg .loop - ADD rsp, pad RET %endmacro @@ -485,7 +482,6 @@ %endmacro %macro LUMA_INTRA_INIT 1 - %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) %define t0 m4 %define t1 m5 %define t2 m6 @@ -495,7 +491,6 @@ CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] %assign i i+1 %endrep - SUB rsp, pad add r1, r1 %endmacro @@ -724,7 +719,7 @@ ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_v_luma_intra, 4,7,8 +cglobal deblock_v_luma_intra, 4,7,8,0-3*mmsize LUMA_INTRA_INIT 3 lea r4, [r1*4] lea r5, [r1*3] @@ -744,13 +739,12 @@ add r4, mmsize dec r6 jg .loop - ADD rsp, pad RET ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra, 4,7,8 +cglobal deblock_h_luma_intra, 4,7,8,0-8*mmsize LUMA_INTRA_INIT 8 %if mmsize == 8 lea r4, [r1*3] @@ -785,7 +779,6 @@ dec r6 %endif jg .loop - ADD rsp, pad RET %endmacro @@ -871,6 +864,19 @@ movh %8, m4 %endmacro +; in: 8 rows of 4 bytes in %9..%10 +; out: 8 rows of 4 bytes in %1..%8 +%macro STORE_8x4B 10 + movd %1, %9 + pextrd %2, %9, 1 + pextrd %3, %9, 2 + pextrd %4, %9, 3 + movd %5, %10 + pextrd %6, %10, 1 + pextrd %7, %10, 2 + pextrd %8, %10, 3 +%endmacro + %macro TRANSPOSE4x8B_LOAD 8 TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 %endmacro @@ -925,6 +931,45 @@ ; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] %macro TRANSPOSE6x8_MEM 9 RESET_MM_PERMUTATION +%if cpuflag(avx) + ; input: + ; _ABCDEF_ + ; _GHIJKL_ + ; _MNOPQR_ + ; _STUVWX_ + ; _YZabcd_ + ; _efghij_ + ; _klmnop_ + ; _qrstuv_ + + movh m0, %1 + movh m2, %2 + movh m1, %3 + movh m3, %4 + punpcklbw m0, m2 ; __ AG BH CI DJ EK FL __ + punpcklbw m1, m3 ; __ MS NT OU PV QW RX __ + movh m2, %5 + movh m3, %6 + punpcklbw m2, m3 ; __ Ye Zf ag bh ci dj __ + movh m3, %7 + movh m4, %8 + punpcklbw m3, m4 ; __ kq lr ms nt ou pv __ + + SBUTTERFLY wd, 0, 1, 4 ; __ __ AG MS BH NT CI OU + ; DJ PV EK QW FL RX __ __ + SBUTTERFLY wd, 2, 3, 4 ; __ __ Ye kq Zf lr ag ms + ; bh nt ci ou dj pv __ __ + SBUTTERFLY dq, 0, 2, 4 ; __ __ __ __ AG MS Ye kq + ; BH NT Zf lr CI FL OU RX + SBUTTERFLY dq, 1, 3, 4 ; DJ PV bh nt EK QW Zf lr + ; FL RX dj pv __ __ __ __ + movhps [%9+0x00], m0 + movh [%9+0x10], m2 + movhps [%9+0x20], m2 + movh [%9+0x30], m1 + movhps [%9+0x40], m1 + movh [%9+0x50], m3 +%else movq m0, %1 movq m1, %2 movq m2, %3 @@ -951,13 +996,41 @@ movq [%9+0x30], m1 movq [%9+0x40], m5 movq [%9+0x50], m3 +%endif RESET_MM_PERMUTATION %endmacro + ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 RESET_MM_PERMUTATION +%if cpuflag(avx) + movh m0, %1 + movh m4, %2 + movh m1, %3 + movh m5, %4 + movh m2, %5 + movh m3, %7 + punpcklbw m0, m4
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/mc-a.asm
Changed
@@ -34,7 +34,7 @@ SECTION_RODATA 32 -ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 +ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 ch_shuf_adj: times 8 db 0 times 8 db 2 times 8 db 4 @@ -49,10 +49,12 @@ cextern pw_8 cextern pw_32 cextern pw_64 +cextern pw_512 cextern pw_00ff cextern pw_pixel_max cextern sw_64 cextern pd_32 +cextern deinterleave_shufd ;============================================================================= ; implicit weighted biprediction @@ -141,8 +143,7 @@ movh m1, %2 punpcklbw m0, m1 pmaddubsw m0, m3 - paddw m0, m4 - psraw m0, 6 + pmulhrsw m0, m4 %endmacro %macro BIWEIGHT_START_SSSE3 0 @@ -151,9 +152,13 @@ sub t7d, t6d shl t7d, 8 add t6d, t7d - movd m3, t6d - mova m4, [pw_32] + mova m4, [pw_512] + movd xm3, t6d +%if cpuflag(avx2) + vpbroadcastw m3, xm3 +%else SPLATW m3, m3 ; weight_dst,src +%endif %endmacro %if HIGH_BIT_DEPTH @@ -244,6 +249,25 @@ INIT_XMM ssse3 AVG_WEIGHT 8, 7 AVG_WEIGHT 16, 7 + +INIT_YMM avx2 +cglobal pixel_avg_weight_w16 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu xm0, [t2] + movu xm1, [t4] + vinserti128 m0, m0, [t2+t3], 1 + vinserti128 m1, m1, [t4+t5], 1 + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], xm0 + vextracti128 [t0+t1], m0, 1 + AVG_END %endif ;HIGH_BIT_DEPTH ;============================================================================= @@ -274,7 +298,7 @@ %endmacro ; src, dst, width -%macro WEIGHT_TWO_ROW 3 +%macro WEIGHT_TWO_ROW 4 %assign x 0 %rep (%3+mmsize/2-1)/(mmsize/2) %if %3-x/2 <= 4 && mmsize == 16 @@ -298,16 +322,21 @@ %else ; !HIGH_BIT_DEPTH %macro WEIGHT_START 1 +%if cpuflag(avx2) + vbroadcasti128 m3, [r4] + vbroadcasti128 m4, [r4+16] +%else mova m3, [r4] mova m4, [r4+16] %if notcpuflag(ssse3) movd m5, [r4+32] %endif +%endif pxor m2, m2 %endmacro -; src1, src2, dst1, dst2 -%macro WEIGHT_ROWx2 4 +; src1, src2, dst1, dst2, fast +%macro WEIGHT_ROWx2 5 movh m0, [%1 ] movh m1, [%1+mmsize/2] movh m6, [%2 ] @@ -317,10 +346,12 @@ punpcklbw m6, m2 punpcklbw m7, m2 %if cpuflag(ssse3) +%if %5==0 psllw m0, 7 psllw m1, 7 psllw m6, 7 psllw m7, 7 +%endif pmulhrsw m0, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 @@ -349,15 +380,54 @@ mova [%4], m6 %endmacro -; src1, src2, dst1, dst2, width -%macro WEIGHT_COL 5 +; src1, src2, dst1, dst2, width, fast +%macro WEIGHT_COL 6 +%if cpuflag(avx2) +%if %5==16 + movu xm0, [%1] + vinserti128 m0, m0, [%2], 1 + punpckhbw m1, m0, m2 + punpcklbw m0, m0, m2 +%if %6==0 + psllw m0, 7 + psllw m1, 7 +%endif + pmulhrsw m0, m3 + pmulhrsw m1, m3 + paddw m0, m4 + paddw m1, m4 + packuswb m0, m1 + mova [%3], xm0 + vextracti128 [%4], m0, 1 +%else + movq xm0, [%1] + vinserti128 m0, m0, [%2], 1 + punpcklbw m0, m2 +%if %6==0 + psllw m0, 7 +%endif + pmulhrsw m0, m3 + paddw m0, m4 + packuswb m0, m0 + vextracti128 xm1, m0, 1 +%if %5 == 8 + movq [%3], xm0 + movq [%4], xm1 +%else + movd [%3], xm0 + movd [%4], xm1 +%endif +%endif +%else movh m0, [%1] movh m1, [%2] punpcklbw m0, m2 punpcklbw m1, m2 %if cpuflag(ssse3) +%if %6==0 psllw m0, 7 psllw m1, 7 +%endif pmulhrsw m0, m3 pmulhrsw m1, m3 paddw m0, m4 @@ -380,18 +450,22 @@ movd [%3], m0 ; width 2 can write garbage for the last 2 bytes movd [%4], m1 %endif +%endif %endmacro - ; src, dst, width -%macro WEIGHT_TWO_ROW 3 +%macro WEIGHT_TWO_ROW 4 %assign x 0 %rep %3 %if (%3-x) >= mmsize - WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x + WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4 %assign x (x+mmsize) %else - WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, %3-x - %exitrep + %assign w %3-x +%if w == 20 + 
%assign w 16 +%endif
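Both the biweight and the explicit-weight paths in this file are fixed-point versions of H.264 weighted prediction; the ssse3/avx2 code pre-shifts by 7 (or skips the psllw entirely when the new fifth "fast" argument is set) so pmulhrsw performs the multiply, round, and downshift in one op. The operation being vectorized, as a scalar sketch of the spec formula rather than the asm's exact fixed-point staging:

#include <stdint.h>

static uint8_t clip_u8( int x ) { return x < 0 ? 0 : x > 255 ? 255 : x; }

/* Explicit weighted prediction:
 * out = clip( ((in*scale + 2^(denom-1)) >> denom) + offset ) */
static uint8_t weight_px( uint8_t in, int scale, int denom, int offset )
{
    int v = denom ? (( in * scale + (1 << (denom - 1)) ) >> denom) + offset
                  : in * scale + offset;
    return clip_u8( v );
}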
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-a2.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/mc-a2.asm
Changed
@@ -30,13 +30,14 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 + +filt_mul20: times 32 db 20 +filt_mul15: times 16 db 1, -5 +filt_mul51: times 16 db -5, 1 +hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 +deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 -filt_mul20: times 16 db 20 -filt_mul15: times 8 db 1, -5 -filt_mul51: times 8 db -5, 1 -hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 -deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 %if HIGH_BIT_DEPTH deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 @@ -44,6 +45,7 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 %endif +pw_1024: times 16 dw 1024 pd_16: times 4 dd 16 pd_0f: times 4 dd 0xffff @@ -64,6 +66,7 @@ cextern pw_1 cextern pw_16 cextern pw_32 +cextern pw_512 cextern pw_00ff cextern pw_3fff cextern pw_pixel_max @@ -127,19 +130,24 @@ paddw %4, %6 %endmacro -%macro FILT_PACK 4-6 b - paddw %1, %4 - paddw %2, %4 -%if %0 == 6 - psubusw %1, %6 - psubusw %2, %6 - psrlw %1, %3 - psrlw %2, %3 +%macro FILT_PACK 3-5 +%if cpuflag(ssse3) + pmulhrsw %1, %3 + pmulhrsw %2, %3 +%else + paddw %1, %3 + paddw %2, %3 +%if %0 == 5 + psubusw %1, %5 + psubusw %2, %5 + psrlw %1, %4 + psrlw %2, %4 %else - psraw %1, %3 - psraw %2, %3 + psraw %1, %4 + psraw %2, %4 %endif -%ifnidn w, %5 +%endif +%if HIGH_BIT_DEPTH == 0 packuswb %1, %2 %endif %endmacro @@ -203,7 +211,7 @@ mova [r2+r4+mmsize], m4 paddw m1, s30 paddw m4, s30 - FILT_PACK m1, m4, 5, m6, w, s10 + FILT_PACK m1, m4, m6, 5, s10 CLIPW m1, m0, m7 CLIPW m4, m0, m7 mova [r0+r4], m1 @@ -295,7 +303,7 @@ FILT_H2 m1, m2, m3, m4, m5, m6 mova m7, [pw_1] pxor m2, m2 - FILT_PACK m1, m4, 1, m7, w + FILT_PACK m1, m4, m7, 1 CLIPW m1, m2, m0 CLIPW m4, m2, m0 mova [r0+r2], m1 @@ -349,17 +357,25 @@ paddw m4, m5 paddw m1, m3 paddw m4, m6 + mova m7, [pw_1024] %else LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1 LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1 LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0 LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1 FILT_V2 m1, m2, m3, m4, m5, m6 + mova m7, [pw_16] %endif - mova m7, [pw_16] +%if mmsize==32 + mova [r2+r4*2], xm1 + mova [r2+r4*2+mmsize/2], xm4 + vextracti128 [r2+r4*2+mmsize], m1, 1 + vextracti128 [r2+r4*2+mmsize*3/2], m4, 1 +%else mova [r2+r4*2], m1 mova [r2+r4*2+mmsize], m4 - FILT_PACK m1, m4, 5, m7 +%endif + FILT_PACK m1, m4, m7, 5 movnta [r0+r4], m1 add r1, mmsize add r5, mmsize @@ -371,8 +387,8 @@ ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- -INIT_MMX -cglobal hpel_filter_c_mmx2, 3,3 +INIT_MMX mmx2 +cglobal hpel_filter_c, 3,3 add r0, r2 lea r1, [r1+r2*2] neg r2 @@ -392,7 +408,7 @@ paddw m5, [src+12] ; b1 paddw m6, [src+10] ; c1 FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 6, m7 + FILT_PACK m1, m4, m7, 6 movntq [r0+r2], m1 add r2, 8 jl .loop @@ -401,7 +417,8 @@ ;----------------------------------------------------------------------------- ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- -cglobal hpel_filter_h_mmx2, 3,3 +INIT_MMX mmx2 +cglobal hpel_filter_h, 3,3 add r0, r2 add r1, r2 neg r2 @@ -436,14 +453,12 @@ paddw m6, m7 ; a1 movq m7, 
[pw_1] FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 1, m7 + FILT_PACK m1, m4, m7, 1 movntq [r0+r2], m1 add r2, 8 jl .loop RET -INIT_XMM - %macro HPEL_C 0 ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); @@ -454,29 +469,33 @@ neg r2 %define src r1+r2*2 %ifnidn cpuname, sse2 +%if cpuflag(ssse3) + mova m7, [pw_512] +%else mova m7, [pw_32] - %define tpw_32 m7 +%endif + %define pw_rnd m7 %elif ARCH_X86_64 mova m8, [pw_32] - %define tpw_32 m8 + %define pw_rnd m8 %else - %define tpw_32 [pw_32] + %define pw_rnd [pw_32] %endif ; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer... -%if cpuflag(misalign) +%if cpuflag(misalign) || mmsize==32 .loop: movu m4, [src-4] movu m5, [src-2] - mova m6, [src] - movu m3, [src+12] - movu m2, [src+14] - mova m1, [src+16] + mova m6, [src+0] + movu m3, [src-4+mmsize] + movu m2, [src-2+mmsize] + mova m1, [src+0+mmsize] paddw m4, [src+6] paddw m5, [src+4]
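The hpel_filter_* kernels implement the 6-tap [1,-5,20,20,-5,1] half-pel interpolation filter from the H.264 spec; the FILT_PACK rework above passes the rounding constant in a register and lets ssse3 and later replace the add-and-shift rounding with pmulhrsw here as well. The underlying per-sample filter, sketched for the 8-bit horizontal case:

#include <stdint.h>

static uint8_t clip_u8( int x ) { return x < 0 ? 0 : x > 255 ? 255 : x; }

/* p points between the 3rd and 4th of six neighboring samples:
 * taps p[-2..3], weights 1,-5,20,20,-5,1, rounded and clipped. */
static uint8_t hpel_tap6( const uint8_t *p )
{
    int v = p[-2] - 5*p[-1] + 20*p[0] + 20*p[1] - 5*p[2] + p[3];
    return clip_u8( (v + 16) >> 5 );
}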
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/mc-c.c -> x264-snapshot-20130723-2245.tar.bz2/common/x86/mc-c.c
Changed
@@ -35,7 +35,8 @@ #define DECL_SUF( func, args )\ void func##_mmx2 args;\ void func##_sse2 args;\ - void func##_ssse3 args; + void func##_ssse3 args;\ + void func##_avx2 args; DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) @@ -72,15 +73,20 @@ MC_WEIGHT( 12, ssse3 ) MC_WEIGHT( 16, ssse3 ) MC_WEIGHT( 20, ssse3 ) +MC_WEIGHT( 8, avx2 ) +MC_WEIGHT( 16, avx2 ) +MC_WEIGHT( 20, avx2 ) #undef MC_OFFSET #undef MC_WEIGHT -void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w16_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w16_sse2( pixel *, intptr_t, pixel *, intptr_t, int ); -void x264_mc_copy_w16_aligned_sse2( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w8_sse ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_mmx( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_sse( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_aligned_sse( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int ); +void x264_mc_copy_w16_aligned_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int ); void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); @@ -121,18 +127,23 @@ void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); -void *x264_memcpy_aligned_mmx ( void *dst, const void *src, size_t n ); -void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n ); -void x264_memzero_aligned_mmx ( void *dst, size_t n ); -void x264_memzero_aligned_sse2( void *dst, size_t n ); +void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n ); +void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n ); +void x264_memzero_aligned_mmx( void *dst, size_t n ); +void x264_memzero_aligned_sse( void *dst, size_t n ); +void x264_memzero_aligned_avx( void *dst, size_t n ); void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init8h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init4v_mmx ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); +void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride ); void 
x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride ); +void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride ); void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, @@ -151,7 +162,7 @@ MC_CHROMA(ssse3) MC_CHROMA(ssse3_cache64) MC_CHROMA(avx) -MC_CHROMA(avx_cache64) +MC_CHROMA(avx2) #define LOWRES(cpu)\ void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\ @@ -162,6 +173,7 @@ LOWRES(ssse3) LOWRES(avx) LOWRES(xop) +LOWRES(avx2) #define PIXEL_AVG_W(width,cpu)\ void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ); @@ -176,6 +188,7 @@ PIXEL_AVG_WALL(sse2) PIXEL_AVG_WALL(sse2_misalign) PIXEL_AVG_WALL(cache64_ssse3) +PIXEL_AVG_WALL(avx2) #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\ static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\ @@ -194,6 +207,8 @@ #define x264_pixel_avg2_w20_mmx2 x264_pixel_avg2_w18_mmx2 #define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w10_sse2 #define x264_pixel_avg2_w20_sse2 x264_pixel_avg2_w18_sse2 +#define x264_pixel_avg2_w12_avx2 x264_pixel_avg2_w16_avx2 +#define x264_pixel_avg2_w20_avx2 x264_pixel_avg2_w18_avx2 #else /* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */ #define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3 @@ -205,6 +220,7 @@ PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2) #if HIGH_BIT_DEPTH PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2) +PIXEL_AVG_WTAB(avx2, mmx2, sse2, avx2, avx2, avx2) #else // !HIGH_BIT_DEPTH #if ARCH_X86 PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2) @@ -214,6 +230,8 @@ PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign) PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2) PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2) +PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2) +PIXEL_AVG_WTAB(avx2, mmx2, mmx2, sse2, sse2, avx2) #endif // HIGH_BIT_DEPTH #define MC_COPY_WTAB(instr, name1, name2, name3)\ @@ -228,9 +246,10 @@ MC_COPY_WTAB(mmx,mmx,mmx,mmx) #if HIGH_BIT_DEPTH -MC_COPY_WTAB(sse2,mmx,sse2,sse2) +MC_COPY_WTAB(sse,mmx,sse,sse) +MC_COPY_WTAB(avx,mmx,sse,avx) #else -MC_COPY_WTAB(sse2,mmx,mmx,sse2) +MC_COPY_WTAB(sse,mmx,mmx,sse) #endif #define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\ @@ -282,6 +301,7 @@ MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16) MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16) MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16) +MC_WEIGHT_WTAB(weight,avx2,ssse3,avx2,16) static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w ) { @@ -357,14 +377,17 @@ } MC_LUMA(mmx2,mmx2,mmx) -MC_LUMA(sse2,sse2,sse2) -#if !HIGH_BIT_DEPTH +MC_LUMA(sse2,sse2,sse) +#if HIGH_BIT_DEPTH +MC_LUMA(avx2,avx2,avx) +#else #if ARCH_X86 MC_LUMA(cache32_mmx2,cache32_mmx2,mmx) MC_LUMA(cache64_mmx2,cache64_mmx2,mmx) #endif -MC_LUMA(cache64_sse2,cache64_sse2,sse2) -MC_LUMA(cache64_ssse3,cache64_ssse3,sse2) +MC_LUMA(cache64_sse2,cache64_sse2,sse) +MC_LUMA(cache64_ssse3,cache64_ssse3,sse) +MC_LUMA(cache64_ssse3_atom,cache64_ssse3_atom,sse) #endif // !HIGH_BIT_DEPTH #define GET_REF(name)\ @@ -400,6 +423,7 @@ 
GET_REF(mmx2) GET_REF(sse2) +GET_REF(avx2) #if !HIGH_BIT_DEPTH #if ARCH_X86 GET_REF(cache32_mmx2) @@ -408,6 +432,7 @@ GET_REF(sse2_misalign) GET_REF(cache64_sse2) GET_REF(cache64_ssse3) +GET_REF(cache64_ssse3_atom) #endif // !HIGH_BIT_DEPTH #define HPEL(align, cpu, cpuv, cpuc, cpuh)\ @@ -425,8 +450,8 @@ width += realign;\ while( height-- )\ {\ - x264_hpel_filter_v_##cpuv( dstv, src, buf+8, stride, width );\ - x264_hpel_filter_c_##cpuc( dstc, buf+8, width );\ + x264_hpel_filter_v_##cpuv( dstv, src, buf+16, stride, width );\ + x264_hpel_filter_c_##cpuc( dstc, buf+16, width );\ x264_hpel_filter_h_##cpuh( dsth, src, width );\ dsth += stride;\ dstv += stride;\ @@ -445,10 +470,12 @@ void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); void x264_hpel_filter_avx ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); +void x264_hpel_filter_avx2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); #else HPEL(16, sse2, sse2, sse2, sse2) HPEL(16, ssse3, ssse3, ssse3, ssse3) HPEL(16, avx, avx, avx, avx) +HPEL(32, avx2, avx2, avx2, avx2) #endif HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2) #endif // HIGH_BIT_DEPTH @@ -545,6 +572,12 @@
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/pixel-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/pixel-a.asm
Changed
@@ -32,8 +32,17 @@ %include "x86util.asm" SECTION_RODATA 32 +hmul_16p: times 16 db 1 + times 8 db 1, -1 +hmul_8p: times 8 db 1 + times 4 db 1, -1 + times 8 db 1 + times 4 db 1, -1 mask_ff: times 16 db 0xff times 16 db 0 +mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1 +mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1 +mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1 %if BIT_DEPTH == 10 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63 @@ -46,12 +55,7 @@ ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 %endif -mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1 -mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1 -mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1 hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 -hmul_8p: times 8 db 1 - times 4 db 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 pb_pppm: times 4 db 1,1,1,-1 @@ -85,6 +89,7 @@ intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1 intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0 +ALIGN 32 intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5 intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4 intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1 @@ -120,9 +125,29 @@ transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 sw_f0: dq 0xfff0, 0 -sq_0f: dq 0xffffffff, 0 pd_f0: times 4 dd 0xffff0000 +pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7 + +ads_mvs_shuffle: +%macro ADS_MVS_SHUFFLE 8 + %assign y x + %rep 8 + %rep 7 + %rotate (~y)&1 + %assign y y>>((~y)&1) + %endrep + db %1*2, %1*2+1 + %rotate 1 + %assign y y>>1 + %endrep +%endmacro +%assign x 0 +%rep 256 + ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7 +%assign x x+1 +%endrep + SECTION .text cextern pb_0 @@ -136,7 +161,9 @@ cextern pw_ppmmppmm cextern pw_pmpmpmpm cextern pw_pmmpzzzz +cextern pd_1 cextern hsub_mul +cextern popcnt_table ;============================================================================= ; SSD @@ -144,69 +171,67 @@ %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; int pixel_ssd_MxN( uint16_t *, intptr_t, uint16_t *, intptr_t ) +; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SSD_ONE 2 -cglobal pixel_ssd_%1x%2, 4,5,6 - mov r4, %1*%2/mmsize +cglobal pixel_ssd_%1x%2, 4,7,6 + FIX_STRIDES r1, r3 +%if mmsize == %1*2 + %define offset0_1 r1 + %define offset0_2 r1*2 + %define offset0_3 r5 + %define offset1_1 r3 + %define offset1_2 r3*2 + %define offset1_3 r6 + lea r5, [3*r1] + lea r6, [3*r3] +%elif mmsize == %1 + %define offset0_1 mmsize + %define offset0_2 r1 + %define offset0_3 r1+mmsize + %define offset1_1 mmsize + %define offset1_2 r3 + %define offset1_3 r3+mmsize +%elif mmsize == %1/2 + %define offset0_1 mmsize + %define offset0_2 mmsize*2 + %define offset0_3 mmsize*3 + %define offset1_1 mmsize + %define offset1_2 mmsize*2 + %define offset1_3 mmsize*3 +%endif + %assign %%n %2/(2*mmsize/%1) +%if %%n > 1 + mov r4d, %%n +%endif pxor m0, m0 .loop mova m1, [r0] -%if %1 <= mmsize/2 - mova m3, [r0+r1*2] - %define offset r3*2 - %define num_rows 2 -%else - mova m3, [r0+mmsize] - %define offset mmsize - %define num_rows 1 -%endif - lea r0, [r0+r1*2*num_rows] + mova m2, [r0+offset0_1] + mova m3, [r0+offset0_2] + mova m4, [r0+offset0_3] psubw m1, [r2] - psubw m3, [r2+offset] - lea r2, [r2+r3*2*num_rows] + psubw m2, [r2+offset1_1] + psubw m3, [r2+offset1_2] + psubw m4, 
[r2+offset1_3] +%if %%n > 1 + lea r0, [r0+r1*(%2/%%n)] + lea r2, [r2+r3*(%2/%%n)] +%endif pmaddwd m1, m1 + pmaddwd m2, m2 pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 paddd m0, m1 paddd m0, m3 - dec r4 +%if %%n > 1 + dec r4d jg .loop +%endif HADDD m0, m5 - movd eax, m0 - RET -%endmacro - -%macro SSD_16_MMX 2 -cglobal pixel_ssd_%1x%2, 4,5 - mov r4, %1*%2/mmsize/2 - pxor m0, m0 -.loop - mova m1, [r0] - mova m2, [r2] - mova m3, [r0+mmsize] - mova m4, [r2+mmsize] - mova m5, [r0+mmsize*2] - mova m6, [r2+mmsize*2] - mova m7, [r0+mmsize*3] - psubw m1, m2 - psubw m3, m4 - mova m2, [r2+mmsize*3] - psubw m5, m6 - pmaddwd m1, m1 - psubw m7, m2 - pmaddwd m3, m3 - pmaddwd m5, m5 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - pmaddwd m7, m7 - paddd m1, m3 - paddd m5, m7 - paddd m0, m1 - paddd m0, m5 - dec r4 - jg .loop - HADDD m0, m7 - movd eax, m0 + movd eax, xm0 RET %endmacro @@ -217,14 +242,17 @@
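SSD_ONE is rewritten to process four loads per iteration with the offset0_*/offset1_* defines picking the addressing scheme for each vector width, but the function contract is unchanged. A scalar reference for what pixel_ssd_WxH returns, in the HIGH_BIT_DEPTH (uint16_t pixel) flavor shown:

#include <stdint.h>
#include <stddef.h>

static int ssd_wxh( const uint16_t *pix1, ptrdiff_t i_pix1,
                    const uint16_t *pix2, ptrdiff_t i_pix2, int w, int h )
{
    int ssd = 0;
    for( int y = 0; y < h; y++ )
    {
        for( int x = 0; x < w; x++ )
        {
            int d = pix1[x] - pix2[x];
            ssd += d * d;   /* sum of squared differences */
        }
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
    return ssd;
}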
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/pixel.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/pixel.h
Changed
@@ -52,10 +52,12 @@ DECL_X1( sad, sse2_aligned ) DECL_X1( sad, ssse3 ) DECL_X1( sad, ssse3_aligned ) +DECL_X1( sad, avx2 ) DECL_X4( sad, mmx2 ) DECL_X4( sad, sse2 ) DECL_X4( sad, sse3 ) DECL_X4( sad, ssse3 ) +DECL_X4( sad, avx2 ) DECL_X1( ssd, mmx ) DECL_X1( ssd, mmx2 ) DECL_X1( ssd, sse2slow ) @@ -63,18 +65,23 @@ DECL_X1( ssd, ssse3 ) DECL_X1( ssd, avx ) DECL_X1( ssd, xop ) +DECL_X1( ssd, avx2 ) DECL_X1( satd, mmx2 ) DECL_X1( satd, sse2 ) DECL_X1( satd, ssse3 ) +DECL_X1( satd, ssse3_atom ) DECL_X1( satd, sse4 ) DECL_X1( satd, avx ) DECL_X1( satd, xop ) +DECL_X1( satd, avx2 ) DECL_X1( sa8d, mmx2 ) DECL_X1( sa8d, sse2 ) DECL_X1( sa8d, ssse3 ) +DECL_X1( sa8d, ssse3_atom ) DECL_X1( sa8d, sse4 ) DECL_X1( sa8d, avx ) DECL_X1( sa8d, xop ) +DECL_X1( sa8d, avx2 ) DECL_X1( sad, cache32_mmx2 ); DECL_X1( sad, cache64_mmx2 ); DECL_X1( sad, cache64_sse2 ); @@ -88,12 +95,15 @@ DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, ssse3_atom, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, hadamard_ac, avx2, ( pixel *pix, intptr_t i_stride )) void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * ); @@ -106,16 +116,19 @@ void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8c_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8c_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8c_avx2 ( pixel *, pixel *, int * ); void x264_intra_satd_x3_16x16_mmx2 ( pixel *, pixel *, int * ); void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * ); void x264_intra_sad_x3_16x16_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_16x16_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_16x16_avx2 ( pixel *, pixel *, int * ); void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * ); void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * ); void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * ); +void x264_intra_sad_x3_8x8_avx2 ( uint16_t*, uint16_t*, int * ); int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * ); int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * ); @@ -129,6 +142,7 @@ int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); +int x264_intra_sad_x9_8x8_avx2 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, 
intptr_t stride2, int width, @@ -139,6 +153,9 @@ void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); +void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1, + pixel *pixuv2, intptr_t stride2, int width, + int height, uint64_t *ssd_u, uint64_t *ssd_v ); void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, intptr_t stride1, const uint8_t *pix2, intptr_t stride2, int sums[2][4] ); void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, intptr_t stride1, @@ -151,17 +168,28 @@ int x264_pixel_var2_8x8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_var2_8x8_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x8_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_var2_8x16_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x16_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_xop ( pixel *src, intptr_t stride, int height ); +int x264_pixel_vsad_avx2 ( uint16_t *src, intptr_t stride, int height ); int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); int x264_pixel_asd8_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); +uint64_t x264_pixel_sa8d_satd_16x16_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_ssse3 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_ssse3_atom( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_avx ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); +uint64_t x264_pixel_sa8d_satd_16x16_avx2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); + #define DECL_ADS( size, suffix ) \ int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\ @@ -178,6 +206,9 @@ DECL_ADS( 4, avx ) DECL_ADS( 2, avx ) DECL_ADS( 1, avx ) +DECL_ADS( 4, avx2 ) +DECL_ADS( 2, avx2 ) +DECL_ADS( 1, avx2 ) #undef DECL_PIXELS #undef DECL_X1
View file
x264-snapshot-20130224-2245.tar.bz2/common/x86/predict-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/predict-a.asm
Changed
@@ -6,6 +6,7 @@ ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org> ;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -28,13 +29,12 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 -pw_76543210: -pw_3210: dw 0, 1, 2, 3, 4, 5, 6, 7 -pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4 -pw_m3: times 8 dw -3 -pw_m7: times 8 dw -7 +pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4 +pw_m3: times 16 dw -3 +pw_m7: times 16 dw -7 pb_00s_ff: times 8 db 0 pb_0s_ff: times 7 db 0 db 0xff @@ -57,109 +57,106 @@ cextern pw_00ff cextern pw_pixel_max -%macro STORE8x8 2-4 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %2 - mova [r0 + 1*FDEC_STRIDEB], %2 - mova [r0 + 2*FDEC_STRIDEB], %2 - mova [r0 + 3*FDEC_STRIDEB], %2 +%macro STORE8 1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + mova [r0+2*FDEC_STRIDEB], %1 + mova [r0+3*FDEC_STRIDEB], %1 %endmacro -%macro STORE8x16 4 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %2 - mova [r0 + -3*FDEC_STRIDEB], %2 - mova [r0 + -2*FDEC_STRIDEB], %2 - mova [r0 + -1*FDEC_STRIDEB], %2 - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %3 - mova [r0 + -3*FDEC_STRIDEB], %3 - mova [r0 + -2*FDEC_STRIDEB], %3 - mova [r0 + -1*FDEC_STRIDEB], %3 - mova [r0 + 0*FDEC_STRIDEB], %4 - mova [r0 + 1*FDEC_STRIDEB], %4 - mova [r0 + 2*FDEC_STRIDEB], %4 - mova [r0 + 3*FDEC_STRIDEB], %4 +%macro STORE16 1-4 +%if %0 > 1 + mov r1d, 2*%0 +.loop: + mova [r0+0*FDEC_STRIDEB+0*mmsize], %1 + mova [r0+0*FDEC_STRIDEB+1*mmsize], %2 + mova [r0+1*FDEC_STRIDEB+0*mmsize], %1 + mova [r0+1*FDEC_STRIDEB+1*mmsize], %2 +%ifidn %0, 4 + mova [r0+0*FDEC_STRIDEB+2*mmsize], %3 + mova [r0+0*FDEC_STRIDEB+3*mmsize], %4 + mova [r0+1*FDEC_STRIDEB+2*mmsize], %3 + mova [r0+1*FDEC_STRIDEB+3*mmsize], %4 + add r0, 2*FDEC_STRIDEB +%else ; %0 == 2 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB+0*mmsize], %1 + mova [r0-2*FDEC_STRIDEB+1*mmsize], %2 + mova [r0-1*FDEC_STRIDEB+0*mmsize], %1 + mova [r0-1*FDEC_STRIDEB+1*mmsize], %2 +%endif + dec r1d + jg .loop +%else ; %0 == 1 + STORE8 %1 +%if HIGH_BIT_DEPTH ; Different code paths to reduce code size + add r0, 6*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 + add r0, 4*FDEC_STRIDEB + mova [r0-2*FDEC_STRIDEB], %1 + mova [r0-1*FDEC_STRIDEB], %1 + mova [r0+0*FDEC_STRIDEB], %1 + mova [r0+1*FDEC_STRIDEB], %1 +%else + add r0, 8*FDEC_STRIDE + mova [r0-4*FDEC_STRIDE], %1 + mova [r0-3*FDEC_STRIDE], %1 + mova [r0-2*FDEC_STRIDE], %1 + mova [r0-1*FDEC_STRIDE], %1 + mova [r0+0*FDEC_STRIDE], %1 + mova [r0+1*FDEC_STRIDE], %1 + mova [r0+2*FDEC_STRIDE], %1 + mova [r0+3*FDEC_STRIDE], %1 +%endif ; HIGH_BIT_DEPTH +%endif %endmacro -%macro STORE16x16 2-4 -%ifidn %0, 4 - mov r1d, 8 -.loop: - mova [r0 + 0*FDEC_STRIDEB + 0], %1 - 
mova [r0 + 1*FDEC_STRIDEB + 0], %1 - mova [r0 + 0*FDEC_STRIDEB + 8], %2 - mova [r0 + 1*FDEC_STRIDEB + 8], %2 - mova [r0 + 0*FDEC_STRIDEB +16], %3 - mova [r0 + 1*FDEC_STRIDEB +16], %3 - mova [r0 + 0*FDEC_STRIDEB +24], %4 - mova [r0 + 1*FDEC_STRIDEB +24], %4 - add r0, 2*FDEC_STRIDEB - dec r1d - jg .loop +%macro PRED_H_LOAD 2 ; reg, offset +%if cpuflag(avx2) + vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL] +%elif HIGH_BIT_DEPTH + movd %1, [r0+(%2)*FDEC_STRIDEB-4] + SPLATW %1, %1, 1 %else - mov r1d, 4 -.loop: - mova [r0 + 0*FDEC_STRIDE], %1 - mova [r0 + 1*FDEC_STRIDE], %1 - mova [r0 + 2*FDEC_STRIDE], %1 - mova [r0 + 3*FDEC_STRIDE], %1 - mova [r0 + 0*FDEC_STRIDE + 8], %2 - mova [r0 + 1*FDEC_STRIDE + 8], %2 - mova [r0 + 2*FDEC_STRIDE + 8], %2 - mova [r0 + 3*FDEC_STRIDE + 8], %2 - add r0, 4*FDEC_STRIDE - dec r1d - jg .loop + SPLATB_LOAD %1, r0+(%2)*FDEC_STRIDE-1, m2 %endif %endmacro -%macro STORE16x16_SSE2 1-2 -%ifidn %0,2 - mov r1d, 4 -.loop - mova [r0+0*FDEC_STRIDEB+ 0], %1 - mova [r0+0*FDEC_STRIDEB+16], %2 - mova [r0+1*FDEC_STRIDEB+ 0], %1 - mova [r0+1*FDEC_STRIDEB+16], %2 - mova [r0+2*FDEC_STRIDEB+ 0], %1 - mova [r0+2*FDEC_STRIDEB+16], %2 - mova [r0+3*FDEC_STRIDEB+ 0], %1 - mova [r0+3*FDEC_STRIDEB+16], %2 - add r0, 4*FDEC_STRIDEB - dec r1d - jg .loop +%macro PRED_H_STORE 3 ; reg, offset, width +%assign %%w %3*SIZEOF_PIXEL +%if %%w == 8 + movq [r0+(%2)*FDEC_STRIDEB], %1 %else - add r0, 4*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %1 - mova [r0 + 1*FDEC_STRIDEB], %1 - mova [r0 + 2*FDEC_STRIDEB], %1 - mova [r0 + 3*FDEC_STRIDEB], %1 - add r0, 8*FDEC_STRIDEB - mova [r0 + -4*FDEC_STRIDEB], %1 - mova [r0 + -3*FDEC_STRIDEB], %1 - mova [r0 + -2*FDEC_STRIDEB], %1 - mova [r0 + -1*FDEC_STRIDEB], %1 - mova [r0 + 0*FDEC_STRIDEB], %1 - mova [r0 + 1*FDEC_STRIDEB], %1 - mova [r0 + 2*FDEC_STRIDEB], %1 - mova [r0 + 3*FDEC_STRIDEB], %1
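Context for the predict-a.asm hunk above: the old STORE8x8/STORE8x16/STORE16x16 triplet collapses into generic STORE8/STORE16 macros, and PRED_H_LOAD abstracts the per-row left-pixel broadcast (vpbroadcastb/w on AVX2, movd plus SPLATW at high bit depth, SPLATB_LOAD otherwise). A minimal C model of the horizontal predictor these macros vectorize; the function name, 8-bit pixel type and explicit stride are illustrative, not actual x264 helpers:

    #include <stdint.h>

    /* Horizontal intra prediction: every pixel of row y is a copy of
     * the reconstructed neighbor at src[y*stride - 1]. */
    static void predict_h_ref( uint8_t *src, int stride, int width, int height )
    {
        for( int y = 0; y < height; y++ )
        {
            uint8_t left = src[y*stride - 1];  /* pixel left of this row */
            for( int x = 0; x < width; x++ )
                src[y*stride + x] = left;      /* splat across the row */
        }
    }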
x264-snapshot-20130224-2245.tar.bz2/common/x86/predict-c.c -> x264-snapshot-20130723-2245.tar.bz2/common/x86/predict-c.c
Changed
@@ -43,6 +43,7 @@ PREDICT_16x16_DC( mmx2 ) PREDICT_16x16_DC( sse2 ) +PREDICT_16x16_DC( avx2 ) #define PREDICT_16x16_DC_LEFT(name)\ static void x264_predict_16x16_dc_left_##name( pixel *src )\ @@ -58,10 +59,11 @@ PREDICT_16x16_DC_LEFT( mmx2 ) PREDICT_16x16_DC_LEFT( sse2 ) +PREDICT_16x16_DC_LEFT( avx2 ) #define PREDICT_P_SUM(j,i)\ H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\ - V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );\ + V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] ); ALIGNED_16( static const int16_t pw_12345678[8] ) = {1,2,3,4,5,6,7,8}; ALIGNED_16( static const int16_t pw_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1}; @@ -70,178 +72,181 @@ ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1}; ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4}; -#if !HIGH_BIT_DEPTH -#define PREDICT_16x16_P(name)\ -static void x264_predict_16x16_p_##name( pixel *src )\ -{\ - int a, b, c;\ +#define PREDICT_16x16_P_CORE\ int H = 0;\ int V = 0;\ - int i00;\ - PREDICT_P_SUM(7,1) \ - PREDICT_P_SUM(7,2) \ - PREDICT_P_SUM(7,3) \ - PREDICT_P_SUM(7,4) \ - PREDICT_P_SUM(7,5) \ - PREDICT_P_SUM(7,6) \ - PREDICT_P_SUM(7,7) \ - PREDICT_P_SUM(7,8) \ - a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\ - b = ( 5 * H + 32 ) >> 6;\ - c = ( 5 * V + 32 ) >> 6;\ - i00 = a - b * 7 - c * 7 + 16;\ - x264_predict_16x16_p_core_##name( src, i00, b, c );\ -} -#ifndef ARCH_X86_64 -PREDICT_16x16_P( mmx2 ) -#endif -PREDICT_16x16_P( sse2 ) -PREDICT_16x16_P( avx ) -#endif //!HIGH_BIT_DEPTH + PREDICT_P_SUM(7,1)\ + PREDICT_P_SUM(7,2)\ + PREDICT_P_SUM(7,3)\ + PREDICT_P_SUM(7,4)\ + PREDICT_P_SUM(7,5)\ + PREDICT_P_SUM(7,6)\ + PREDICT_P_SUM(7,7)\ + PREDICT_P_SUM(7,8) -#define PREDICT_8x16C_P_CORE \ - int H = 0, V = 0;\ - for( int i = 0; i < 4; i++ )\ - H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\ - for( int i = 0; i < 8; i++ )\ - V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );\ - int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\ - int b = ( 17 * H + 16 ) >> 5;\ - int c = ( 5 * V + 32 ) >> 6; - -#if HIGH_BIT_DEPTH -#define PREDICT_8x16_P(name)\ -static void x264_predict_8x16c_p_##name( uint16_t *src )\ -{\ - PREDICT_8x16C_P_CORE \ - x264_predict_8x16c_p_core_##name( src, a, b, c );\ -} +#define PREDICT_16x16_P_END(name)\ + int a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\ + int b = ( 5 * H + 32 ) >> 6;\ + int c = ( 5 * V + 32 ) >> 6;\ + int i00 = a - b * 7 - c * 7 + 16;\ + /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case + * than to try to consider it in the asm. 
*/\ + if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )\ + x264_predict_16x16_p_c( src );\ + else\ + x264_predict_16x16_p_core_##name( src, i00, b, c ); -PREDICT_8x16_P(sse2) -PREDICT_8x16_P(avx) -#else -#define PREDICT_8x16_P(name)\ -static void x264_predict_8x16c_p_##name( uint8_t *src )\ +#define PREDICT_16x16_P(name, name2)\ +static void x264_predict_16x16_p_##name( pixel *src )\ {\ - PREDICT_8x16C_P_CORE \ - int i00 = a -3*b -7*c + 16;\ - x264_predict_8x16c_p_core_##name( src, i00, b, c );\ + PREDICT_16x16_P_CORE\ + PREDICT_16x16_P_END(name2)\ } -#ifndef ARCH_X86_64 -PREDICT_8x16_P(mmx2) -#endif -PREDICT_8x16_P(sse2) -PREDICT_8x16_P(avx) -#endif #if HAVE_X86_INLINE_ASM #if HIGH_BIT_DEPTH -static void x264_predict_16x16_p_sse2( uint16_t *src ) -#else -static void x264_predict_16x16_p_ssse3( uint8_t *src ) -#endif -{ - int a, b, c, i00; - int H, V; -#if HIGH_BIT_DEPTH - asm ( - "movdqu %1, %%xmm1 \n" - "movdqa %2, %%xmm0 \n" - "pmaddwd %3, %%xmm0 \n" - "pmaddwd %4, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "movhlps %%xmm0, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "pshuflw $14, %%xmm0, %%xmm1 \n" - "paddd %%xmm1, %%xmm0 \n" - "movd %%xmm0, %0 \n" - :"=r"(H) - :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]), - "m"(*pw_12345678), "m"(*pw_m87654321) +#define PREDICT_16x16_P_ASM\ + asm (\ + "movdqu %1, %%xmm1 \n"\ + "movdqa %2, %%xmm0 \n"\ + "pmaddwd %3, %%xmm0 \n"\ + "pmaddwd %4, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "movhlps %%xmm0, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "pshuflw $14, %%xmm0, %%xmm1 \n"\ + "paddd %%xmm1, %%xmm0 \n"\ + "movd %%xmm0, %0 \n"\ + :"=r"(H)\ + :"m"(src[-FDEC_STRIDE-1]), "m"(src[-FDEC_STRIDE+8]),\ + "m"(*pw_12345678), "m"(*pw_m87654321)\ ); -#else - asm ( - "movq %1, %%mm1 \n" - "movq %2, %%mm0 \n" - "palignr $7, %3, %%mm1 \n" - "pmaddubsw %4, %%mm0 \n" - "pmaddubsw %5, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "pshufw $14, %%mm0, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "pshufw $1, %%mm0, %%mm1 \n" - "paddw %%mm1, %%mm0 \n" - "movd %%mm0, %0 \n" - "movswl %w0, %0 \n" - :"=r"(H) - :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]), - "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321) +#else // !HIGH_BIT_DEPTH +#define PREDICT_16x16_P_ASM\ + asm (\ + "movq %1, %%mm1 \n"\ + "movq %2, %%mm0 \n"\ + "palignr $7, %3, %%mm1 \n"\ + "pmaddubsw %4, %%mm0 \n"\ + "pmaddubsw %5, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "pshufw $14, %%mm0, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "pshufw $1, %%mm0, %%mm1 \n"\ + "paddw %%mm1, %%mm0 \n"\ + "movd %%mm0, %0 \n"\ + "movswl %w0, %0 \n"\ + :"=r"(H)\ + :"m"(src[-FDEC_STRIDE]), "m"(src[-FDEC_STRIDE+8]),\ + "m"(src[-FDEC_STRIDE-8]), "m"(*pb_12345678), "m"(*pb_m87654321)\ ); -#endif - V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] ) - + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] ) - + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] ) - + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] ) - + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] ) - + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] ) - + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] ) +#endif // HIGH_BIT_DEPTH + +#define PREDICT_16x16_P_CORE_INLINE\
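On the overflow guard added above: the plane predictor's per-pixel value is (i00 + b*x + c*y) >> 5 with x,y in 0..15, and the SIMD core evaluates it in signed 16-bit lanes. The 1092 bound is presumably floor(32767/30): it keeps the combined b and c contributions, up to 15 steps each, inside signed 16-bit range (1092*30 = 32760), and inputs failing that or with i00 > 0x7fff branch to the plain C path at >8-bit depth instead. A C statement of the semantics both paths must produce, with illustrative names and a high-bit-depth uint16_t pixel assumed:

    #include <stdint.h>

    /* pixel_max is (1 << BIT_DEPTH) - 1. */
    static void predict_16x16_p_ref( uint16_t *src, int stride,
                                     int i00, int b, int c, int pixel_max )
    {
        for( int y = 0; y < 16; y++ )
            for( int x = 0; x < 16; x++ )
            {
                int v = (i00 + b*x + c*y) >> 5;
                src[y*stride + x] = v < 0 ? 0 : v > pixel_max ? pixel_max : v;
            }
    }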
x264-snapshot-20130224-2245.tar.bz2/common/x86/predict.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/predict.h
Changed
@@ -34,48 +34,57 @@ void x264_predict_8x8_init_mmx ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ); void x264_predict_16x16_v_mmx2( pixel *src ); -void x264_predict_16x16_v_sse2( pixel *src ); +void x264_predict_16x16_v_sse ( pixel *src ); +void x264_predict_16x16_v_avx ( uint16_t *src ); void x264_predict_16x16_h_mmx2( pixel *src ); void x264_predict_16x16_h_sse2( uint16_t *src ); void x264_predict_16x16_h_ssse3( uint8_t *src ); +void x264_predict_16x16_h_avx2( uint16_t *src ); void x264_predict_16x16_dc_mmx2( pixel *src ); void x264_predict_16x16_dc_sse2( pixel *src ); void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left ); +void x264_predict_16x16_dc_core_avx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left ); +void x264_predict_16x16_dc_left_core_avx2( pixel *src, int i_dc_left ); void x264_predict_16x16_dc_top_mmx2( pixel *src ); void x264_predict_16x16_dc_top_sse2( pixel *src ); -void x264_predict_16x16_dc_top_ssse3( uint16_t *src ); +void x264_predict_16x16_dc_top_avx2( pixel *src ); void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c ); void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_16x16_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x16c_dc_mmx2( pixel *src ); void x264_predict_8x16c_dc_sse2( uint16_t *src ); void x264_predict_8x16c_dc_top_mmx2( uint8_t *src ); void x264_predict_8x16c_dc_top_sse2( uint16_t *src ); void x264_predict_8x16c_v_mmx( uint8_t *src ); -void x264_predict_8x16c_v_sse2( uint16_t *src ); +void x264_predict_8x16c_v_sse( uint16_t *src ); void x264_predict_8x16c_h_mmx2( pixel *src ); -void x264_predict_8x16c_h_sse2( pixel *src ); +void x264_predict_8x16c_h_sse2( uint16_t *src ); void x264_predict_8x16c_h_ssse3( uint8_t *src ); +void x264_predict_8x16c_h_avx2( uint16_t *src ); void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c ); -void x264_predict_8x16c_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_8x16c_p_core_avx ( pixel *src, int i00, int b, int c ); +void x264_predict_8x16c_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c ); -void x264_predict_8x8c_p_core_avx( pixel *src, int i00, int b, int c ); +void x264_predict_8x8c_p_core_avx ( pixel *src, int i00, int b, int c ); +void x264_predict_8x8c_p_core_avx2( pixel *src, int i00, int b, int c ); void x264_predict_8x8c_dc_mmx2( pixel *src ); void x264_predict_8x8c_dc_sse2( uint16_t *src ); void x264_predict_8x8c_dc_top_mmx2( uint8_t *src ); void x264_predict_8x8c_dc_top_sse2( uint16_t *src ); void x264_predict_8x8c_v_mmx( pixel *src ); -void x264_predict_8x8c_v_sse2( uint16_t *src ); +void x264_predict_8x8c_v_sse( uint16_t *src ); void x264_predict_8x8c_h_mmx2( pixel *src ); -void x264_predict_8x8c_h_sse2( pixel *src ); +void x264_predict_8x8c_h_sse2( uint16_t *src ); void x264_predict_8x8c_h_ssse3( uint8_t *src ); +void x264_predict_8x8c_h_avx2( uint16_t *src ); void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_v_sse2( 
uint16_t *src, uint16_t edge[36] ); +void x264_predict_8x8_v_sse ( uint16_t *src, uint16_t edge[36] ); void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] ); void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] ); @@ -114,6 +123,7 @@ void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters ); void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[36], int i_neighbor, int i_filters ); void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters ); +void x264_predict_4x4_h_avx2( uint16_t *src ); void x264_predict_4x4_ddl_mmx2( pixel *src ); void x264_predict_4x4_ddl_sse2( uint16_t *src ); void x264_predict_4x4_ddl_avx( uint16_t *src );
x264-snapshot-20130224-2245.tar.bz2/common/x86/quant-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/quant-a.asm
Changed
@@ -7,7 +7,7 @@ ;* Jason Garrett-Glaser <darkshikari@gmail.com> ;* Christian Heine <sennindemokrit@gmx.net> ;* Oskar Arvidsson <oskar@irock.se> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -30,7 +30,7 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 %macro DQM4 3 dw %1, %2, %1, %2, %2, %3, %2, %3 @@ -39,8 +39,7 @@ dw %1, %4, %5, %4, %1, %4, %5, %4 dw %4, %2, %6, %2, %4, %2, %6, %2 dw %5, %6, %3, %6, %5, %6, %3, %6 - ; last line not used, just padding for power-of-2 stride - times 8 dw 0 + dw %4, %2, %6, %2, %4, %2, %6, %2 %endmacro dequant4_scale: @@ -75,27 +74,55 @@ chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1 chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1 +%if HIGH_BIT_DEPTH==0 +dct_coef_shuffle: +%macro DCT_COEF_SHUFFLE 8 + %assign y x + %rep 8 + %rep 7 + %rotate (~(y>>7))&1 + %assign y y<<((~(y>>7))&1) + %endrep + db %1*2 + %rotate 1 + %assign y y<<1 + %endrep +%endmacro +%assign x 0 +%rep 256 + DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0 +%assign x x+1 +%endrep +%endif + SECTION .text cextern pb_1 cextern pw_1 +cextern pw_2 +cextern pw_256 cextern pd_1 cextern pb_01 cextern pd_1024 - -%macro QUANT_DC_START 0 - movd m6, r1m ; mf - movd m7, r2m ; bias -%if HIGH_BIT_DEPTH - SPLATD m6, m6 - SPLATD m7, m7 +cextern deinterleave_shufd +cextern popcnt_table + +%macro QUANT_DC_START 2 + movd xm%1, r1m ; mf + movd xm%2, r2m ; bias +%if cpuflag(avx2) + vpbroadcastdct m%1, xm%1 + vpbroadcastdct m%2, xm%2 +%elif HIGH_BIT_DEPTH + SPLATD m%1, m%1 + SPLATD m%2, m%2 %elif cpuflag(sse4) ; ssse3, but not faster on conroe mova m5, [pb_01] - pshufb m6, m5 - pshufb m7, m5 + pshufb m%1, m5 + pshufb m%2, m5 %else - SPLATW m6, m6 - SPLATW m7, m7 + SPLATW m%1, m%1 + SPLATW m%2, m%2 %endif %endmacro @@ -175,7 +202,7 @@ %endif ; cpuflag %endmacro -%macro QUANT_ONE_AC_MMX 4 +%macro QUANT_ONE_AC_MMX 5 mova m0, [%1] mova m2, [%2] ABSD m1, m0 @@ -191,10 +218,10 @@ psrad m1, 16 PSIGND m1, m0 mova [%1], m1 - ACCUM por, 5, 1, %4 + ACCUM por, %5, 1, %4 %endmacro -%macro QUANT_TWO_AC 4 +%macro QUANT_TWO_AC 5 %if cpuflag(sse4) mova m0, [%1 ] mova m1, [%1+mmsize] @@ -210,11 +237,11 @@ PSIGND m3, m1 mova [%1 ], m2 mova [%1+mmsize], m3 - ACCUM por, 5, 2, %4 - por m5, m3 + ACCUM por, %5, 2, %4 + por m%5, m3 %else ; !sse4 - QUANT_ONE_AC_MMX %1, %2, %3, %4 - QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize + QUANT_ONE_AC_MMX %1, %2, %3, %4, %5 + QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, 1, %5 %endif ; cpuflag %endmacro @@ -223,7 +250,7 @@ ;----------------------------------------------------------------------------- %macro QUANT_DC 2 cglobal quant_%1x%2_dc, 3,3,8 - QUANT_DC_START + QUANT_DC_START 6,7 %if %1*%2 <= mmsize/4 QUANT_ONE_DC r0, m6, m7, 0 %else @@ -244,35 +271,87 @@ cglobal quant_%1x%2, 3,3,8 %assign x 0 %rep %1*%2/(mmsize/2) - QUANT_TWO_AC r0+x, r1+x, r2+x, x + QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5 %assign x x+mmsize*2 %endrep QUANT_END RET %endmacro +%macro QUANT_4x4 2 + QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, 0, %2 + QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, 1, %2 +%endmacro + +%macro QUANT_4x4x4 0 +cglobal quant_4x4x4, 3,3,8 + QUANT_4x4 0, 5 + QUANT_4x4 64, 6 + add r0, 128 + packssdw m5, m6 + QUANT_4x4 0, 6 + QUANT_4x4 64, 7 + packssdw m6, m7 + packssdw m5, m6 + packssdw m5, m5 ; AA BB CC DD + packsswb m5, m5 ; A B C D + pxor m4, m4 + pcmpeqb m5, m4 + 
pmovmskb eax, m5 + not eax + and eax, 0xf + RET +%endmacro + INIT_XMM sse2 QUANT_DC 2, 2 QUANT_DC 4, 4 QUANT_AC 4, 4 QUANT_AC 8, 8 +QUANT_4x4x4 INIT_XMM ssse3 QUANT_DC 2, 2 QUANT_DC 4, 4 QUANT_AC 4, 4 QUANT_AC 8, 8 +QUANT_4x4x4 INIT_XMM sse4 QUANT_DC 2, 2 QUANT_DC 4, 4 QUANT_AC 4, 4 QUANT_AC 8, 8 +QUANT_4x4x4 +
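The new QUANT_4x4x4 entry point quantizes the four 4x4 blocks of an 8x8 region in one call and returns a 4-bit nonzero mask, bit i set iff block i kept any coefficient, which the packssdw/packsswb/pmovmskb tail computes branchlessly. A hedged C reference for the 8-bit build; the per-coefficient formula matches x264's scalar QUANT_ONE:

    #include <stdint.h>
    #include <stdlib.h>

    typedef int16_t  dctcoef;   /* 8-bit-depth build */
    typedef uint16_t udctcoef;

    static int quant_4x4x4_ref( dctcoef dct[4][16],
                                udctcoef mf[16], udctcoef bias[16] )
    {
        int nz_mask = 0;
        for( int i = 0; i < 4; i++ )
        {
            int nz = 0;
            for( int j = 0; j < 16; j++ )
            {
                int coef = dct[i][j];
                int q = (int)(((uint32_t)(abs( coef ) + bias[j]) * mf[j]) >> 16);
                dct[i][j] = coef < 0 ? -q : q;   /* restore the sign */
                nz |= q;
            }
            nz_mask |= (nz != 0) << i;
        }
        return nz_mask;
    }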
x264-snapshot-20130224-2245.tar.bz2/common/x86/quant.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/quant.h
Changed
@@ -31,19 +31,27 @@ int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias ); int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_mmx( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_sse2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_sse2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_ssse3( dctcoef dct[4], int mf, int bias ); int x264_quant_4x4_dc_ssse3( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_ssse3( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_ssse3( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_ssse3( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_sse4( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_dc_sse4( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4x4_sse4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); +int x264_quant_4x4_avx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); +int x264_quant_4x4_dc_avx2( dctcoef dct[16], int mf, int bias ); +int x264_quant_8x8_avx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); +int x264_quant_4x4x4_avx2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); @@ -56,10 +64,15 @@ void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf ); int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf ); int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf ); @@ -68,21 +81,17 @@ void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_avx ( dctcoef 
*dct, uint32_t *sum, udctcoef *offset, int size ); +void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); int x264_decimate_score15_mmx2( dctcoef *dct ); int x264_decimate_score15_sse2( dctcoef *dct ); int x264_decimate_score15_ssse3( dctcoef *dct ); int x264_decimate_score16_mmx2( dctcoef *dct ); int x264_decimate_score16_sse2( dctcoef *dct ); int x264_decimate_score16_ssse3( dctcoef *dct ); -int x264_decimate_score15_mmx2_slowctz( dctcoef *dct ); -int x264_decimate_score15_sse2_slowctz( dctcoef *dct ); -int x264_decimate_score15_ssse3_slowctz( dctcoef *dct ); -int x264_decimate_score16_mmx2_slowctz( dctcoef *dct ); -int x264_decimate_score16_sse2_slowctz( dctcoef *dct ); -int x264_decimate_score16_ssse3_slowctz( dctcoef *dct ); int x264_decimate_score64_mmx2( dctcoef *dct ); int x264_decimate_score64_sse2( dctcoef *dct ); int x264_decimate_score64_ssse3( dctcoef *dct ); +int x264_decimate_score64_avx2( int16_t *dct ); int x264_coeff_last4_mmx2( dctcoef *dct ); int x264_coeff_last8_mmx2( dctcoef *dct ); int x264_coeff_last15_mmx2( dctcoef *dct ); @@ -98,18 +107,29 @@ int x264_coeff_last15_sse2_lzcnt( dctcoef *dct ); int x264_coeff_last16_sse2_lzcnt( dctcoef *dct ); int x264_coeff_last64_sse2_lzcnt( dctcoef *dct ); +int x264_coeff_last64_avx2_lzcnt( dctcoef *dct ); int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac ); int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac ); int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced );
x264-snapshot-20130224-2245.tar.bz2/common/x86/sad-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/sad-a.asm
Changed
@@ -29,6 +29,12 @@ %include "x86inc.asm" %include "x86util.asm" +SECTION_RODATA 32 + +pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1 +deinterleave_sadx4: dd 0,4,2,6 +hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11 + SECTION .text cextern pb_3 @@ -556,6 +562,65 @@ INIT_MMX ssse3 INTRA_SAD_8x8C +INIT_YMM avx2 +cglobal intra_sad_x3_8x8c, 3,3,7 + vpbroadcastq m2, [r1 - FDEC_STRIDE] ; V pred + add r1, FDEC_STRIDE*4-1 + pxor xm5, xm5 + punpckldq xm3, xm2, xm5 ; V0 _ V1 _ + movd xm0, [r1 + FDEC_STRIDE*-1 - 3] + movd xm1, [r1 + FDEC_STRIDE* 3 - 3] + pinsrb xm0, [r1 + FDEC_STRIDE*-4], 0 + pinsrb xm1, [r1 + FDEC_STRIDE* 0], 0 + pinsrb xm0, [r1 + FDEC_STRIDE*-3], 1 + pinsrb xm1, [r1 + FDEC_STRIDE* 1], 1 + pinsrb xm0, [r1 + FDEC_STRIDE*-2], 2 + pinsrb xm1, [r1 + FDEC_STRIDE* 2], 2 + punpcklqdq xm0, xm1 ; H0 _ H1 _ + vinserti128 m3, m3, xm0, 1 ; V0 V1 H0 H1 + pshufb xm0, [hpred_shuf] ; H00224466 H11335577 + psadbw m3, m5 ; s0 s1 s2 s3 + vpermq m4, m3, q3312 ; s2 s1 s3 s3 + vpermq m3, m3, q1310 ; s0 s1 s3 s1 + paddw m3, m4 + psrlw m3, 2 + pavgw m3, m5 ; s0+s2 s1 s3 s1+s3 + pshufb m3, [pb_shuf8x8c2] ; DC0 _ DC1 _ + vpblendd m3, m3, m2, 11001100b ; DC0 V DC1 V + vinserti128 m1, m3, xm3, 1 ; DC0 V DC0 V + vperm2i128 m6, m3, m3, q0101 ; DC1 V DC1 V + vpermq m0, m0, q3120 ; H00224466 _ H11335577 _ + movddup m2, [r0+FENC_STRIDE*0] + movddup m4, [r0+FENC_STRIDE*2] + pshuflw m3, m0, q0000 + psadbw m3, m2 + psadbw m2, m1 + pshuflw m5, m0, q1111 + psadbw m5, m4 + psadbw m4, m1 + paddw m2, m4 + paddw m3, m5 + movddup m4, [r0+FENC_STRIDE*4] + pshuflw m5, m0, q2222 + psadbw m5, m4 + psadbw m4, m6 + paddw m2, m4 + paddw m3, m5 + movddup m4, [r0+FENC_STRIDE*6] + pshuflw m5, m0, q3333 + psadbw m5, m4 + psadbw m4, m6 + paddw m2, m4 + paddw m3, m5 + vextracti128 xm0, m2, 1 + vextracti128 xm1, m3, 1 + paddw xm2, xm0 ; DC V + paddw xm3, xm1 ; H + pextrd [r2+8], xm2, 2 ; V + movd [r2+4], xm3 ; H + movd [r2+0], xm2 ; DC + RET + ;----------------------------------------------------------------------------- ; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] ); @@ -648,7 +713,50 @@ INIT_XMM ssse3 INTRA_SAD16 - +INIT_YMM avx2 +cglobal intra_sad_x3_16x16, 3,5,6 + pxor xm0, xm0 + psadbw xm0, [r1-FDEC_STRIDE] + movhlps xm1, xm0 + paddw xm0, xm1 + movd r3d, xm0 +%assign x 0 +%rep 16 + movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)] +%if (x&3)==3 && x!=15 + add r1, FDEC_STRIDE*4 +%endif + add r3d, r4d +%assign x x+1 +%endrep + sub r1, FDEC_STRIDE*12 + add r3d, 16 + shr r3d, 5 + movd xm5, r3d + vpbroadcastb xm5, xm5 + vinserti128 m5, m5, [r1-FDEC_STRIDE], 1 ; m5 contains DC and V prediction + + pxor m4, m4 ; DC / V accumulator + pxor xm3, xm3 ; H accumulator + mov r3d, 15*FENC_STRIDE +.vloop: + vpbroadcastb xm2, [r1+r3*2-1] + vbroadcasti128 m0, [r0+r3] + psadbw m1, m0, m5 + psadbw xm0, xm2 + paddw m4, m1 + paddw xm3, xm0 + add r3d, -FENC_STRIDE + jge .vloop + punpckhqdq m5, m4, m4 + movhlps xm2, xm3 + paddw m4, m5 ; DC / V + paddw xm3, xm2 ; H + vextracti128 xm2, m4, 1 + movd [r2+0], xm2 + movd [r2+4], xm3 + movd [r2+8], xm4 + RET ;============================================================================= ; SAD x3/x4 MMX @@ -944,17 +1052,27 @@ %endif %endmacro -%macro SAD_X3_2x16P_SSE2 1 -%if %1 +%macro SAD_X3_4x16P_SSE2 2 +%if %1==0 +%if UNIX64 + mov r6, r5 +%endif + lea r5, [r4*3] SAD_X3_START_1x16P_SSE2 %else - SAD_X3_1x16P_SSE2 0, 0 + SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0 +%endif + SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1 + SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2 + 
SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r5 +%if %1 != %2-1 +%if (%1&1) != 0 + add r0, 8*FENC_STRIDE +%endif + lea r1, [r1+4*r4] + lea r2, [r2+4*r4] + lea r3, [r3+4*r4] %endif - SAD_X3_1x16P_SSE2 FENC_STRIDE, r4 - add r0, 2*FENC_STRIDE - lea r1, [r1+2*r4] - lea r2, [r2+2*r4] - lea r3, [r3+2*r4] %endmacro %macro SAD_X3_START_2x8P_SSE2 0 @@ -971,15 +1089,15 @@ psadbw xmm2, xmm7 %endmacro -%macro SAD_X3_2x8P_SSE2 0 - movq xmm7, [r0] - movq xmm3, [r1] - movq xmm4, [r2] - movq xmm5, [r3] - movhps xmm7, [r0+FENC_STRIDE] - movhps xmm3, [r1+r4] - movhps xmm4, [r2+r4] - movhps xmm5, [r3+r4] +%macro SAD_X3_2x8P_SSE2 4 + movq xmm7, [r0+%1] + movq xmm3, [r1+%2] + movq xmm4, [r2+%2] + movq xmm5, [r3+%2] + movhps xmm7, [r0+%3] + movhps xmm3, [r1+%4] + movhps xmm4, [r2+%4] + movhps xmm5, [r3+%4] psadbw xmm3, xmm7 psadbw xmm4, xmm7 psadbw xmm5, xmm7 @@ -1005,18 +1123,18 @@ psadbw xmm3, xmm7 %endmacro -%macro SAD_X4_2x8P_SSE2 0 - movq xmm7, [r0] - movq xmm4, [r1] - movq xmm5, [r2]
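Both new AVX2 kernels above implement the same contract as their SSE predecessors: one pass over the encoded block yielding its SADs against the three whole-block intra predictions. For 16x16 luma the result order is V, H, DC; for the 8x8 chroma kernel it is DC, H, V, as the final stores show. A plain-C statement of the 16x16 contract, 8-bit pixels and x264's fixed buffer strides assumed:

    #include <stdint.h>
    #include <stdlib.h>

    #define FENC_STRIDE 16
    #define FDEC_STRIDE 32

    static void intra_sad_x3_16x16_ref( uint8_t *fenc, uint8_t *fdec, int res[3] )
    {
        int dc = 16;                                   /* rounding term */
        for( int i = 0; i < 16; i++ )
            dc += fdec[i - FDEC_STRIDE] + fdec[i*FDEC_STRIDE - 1];
        dc >>= 5;

        res[0] = res[1] = res[2] = 0;
        for( int y = 0; y < 16; y++ )
            for( int x = 0; x < 16; x++ )
            {
                int e = fenc[y*FENC_STRIDE + x];
                res[0] += abs( e - fdec[x - FDEC_STRIDE] );    /* V: top row  */
                res[1] += abs( e - fdec[y*FDEC_STRIDE - 1] );  /* H: left col */
                res[2] += abs( e - dc );                       /* DC: average */
            }
    }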
x264-snapshot-20130224-2245.tar.bz2/common/x86/sad16-a.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/sad16-a.asm
Changed
@@ -4,6 +4,7 @@ ;* Copyright (C) 2010-2013 x264 project ;* ;* Authors: Oskar Arvidsson <oskar@irock.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -90,11 +91,18 @@ ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SAD_MMX 3 -cglobal pixel_sad_%1x%2, 4,4 +cglobal pixel_sad_%1x%2, 4,5-(%2&4/4) pxor m0, m0 -%rep %2/%3 +%if %2 == 4 SAD_INC_%3x%1P_MMX -%endrep + SAD_INC_%3x%1P_MMX +%else + mov r4d, %2/%3 +.loop: + SAD_INC_%3x%1P_MMX + dec r4d + jg .loop +%endif %if %1*%2 == 256 HADDUW m0, m1 %else @@ -120,7 +128,8 @@ ; SAD XMM ;============================================================================= -%macro SAD_INC_2x16P_XMM 0 +%macro SAD_INC_2ROW 1 +%if 2*%1 > mmsize movu m1, [r2+ 0] movu m2, [r2+16] movu m3, [r2+2*r3+ 0] @@ -137,9 +146,7 @@ paddw m3, m4 paddw m0, m1 paddw m0, m3 -%endmacro - -%macro SAD_INC_2x8P_XMM 0 +%else movu m1, [r2] movu m2, [r2+2*r3] psubw m1, [r0] @@ -149,44 +156,55 @@ lea r2, [r2+4*r3] paddw m0, m1 paddw m0, m2 +%endif %endmacro ;----------------------------------------------------------------------------- ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- -%macro SAD_XMM 2 -cglobal pixel_sad_%1x%2, 4,4,8 +%macro SAD 2 +cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize) pxor m0, m0 -%rep %2/2 - SAD_INC_2x%1P_XMM -%endrep +%if %2 == 4 + SAD_INC_2ROW %1 + SAD_INC_2ROW %1 +%else + mov r4d, %2/2 +.loop: + SAD_INC_2ROW %1 + dec r4d + jg .loop +%endif HADDW m0, m1 - movd eax, m0 + movd eax, xm0 RET %endmacro INIT_XMM sse2 -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 -SAD_XMM 8, 4 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +SAD 8, 4 INIT_XMM sse2, aligned -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 INIT_XMM ssse3 -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 -SAD_XMM 8, 4 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +SAD 8, 4 INIT_XMM ssse3, aligned -SAD_XMM 16, 16 -SAD_XMM 16, 8 -SAD_XMM 8, 16 -SAD_XMM 8, 8 +SAD 16, 16 +SAD 16, 8 +SAD 8, 16 +SAD 8, 8 +INIT_YMM avx2 +SAD 16, 16 +SAD 16, 8 ;============================================================================= ; SAD x3/x4 @@ -237,14 +255,14 @@ HADDW m2, m5 %endif %if UNIX64 - movd [r5+0], m0 - movd [r5+4], m1 - movd [r5+8], m2 + movd [r5+0], xm0 + movd [r5+4], xm1 + movd [r5+8], xm2 %else mov r0, r5mp - movd [r0+0], m0 - movd [r0+4], m1 - movd [r0+8], m2 + movd [r0+0], xm0 + movd [r0+4], xm1 + movd [r0+8], xm2 %endif RET %endmacro @@ -333,10 +351,10 @@ HADDW m3, m7 %endif mov r0, r6mp - movd [r0+ 0], m0 - movd [r0+ 4], m1 - movd [r0+ 8], m2 - movd [r0+12], m3 + movd [r0+ 0], xm0 + movd [r0+ 4], xm1 + movd [r0+ 8], xm2 + movd [r0+12], xm3 RET %endmacro @@ -400,8 +418,39 @@ INIT_XMM xop PIXEL_VSAD +INIT_YMM avx2 +cglobal pixel_vsad, 3,3 + mova m0, [r0] + mova m1, [r0+2*r1] + lea r0, [r0+4*r1] + psubw m0, m1 + pabsw m0, m0 + sub r2d, 2 + je .end +.loop: + mova m2, [r0] + mova m3, [r0+2*r1] + lea r0, [r0+4*r1] + psubw m1, m2 + psubw m2, m3 + pabsw m1, m1 + pabsw m2, m2 + paddw m0, m1 + paddw m0, m2 + mova m1, m3 + sub r2d, 2 + jg .loop +.end: +%if BIT_DEPTH == 9 + HADDW m0, m1 +%else + HADDUW m0, m1 +%endif + movd eax, xm0
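The restructuring above folds the per-size SAD_XMM macros into one SAD macro: the height-4 case is fully unrolled (which is what frees a register in the 5-(%2&4/4) register count), everything else loops two rows per iteration, and an AVX2 instantiation covers the 16-wide sizes. In C terms the kernel is simply:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute differences over a width x height block of
     * 16-bit pixels; strides are in pixels, as in x264's
     * high-bit-depth build. */
    static int pixel_sad_ref( const uint16_t *pix1, intptr_t stride1,
                              const uint16_t *pix2, intptr_t stride2,
                              int width, int height )
    {
        int sad = 0;
        for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
            for( int x = 0; x < width; x++ )
                sad += abs( pix1[x] - pix2[x] );
        return sad;
    }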
x264-snapshot-20130224-2245.tar.bz2/common/x86/trellis-64.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/trellis-64.asm
Changed
@@ -96,6 +96,15 @@ %endif %endmacro +%macro LOAD_DUP 2 ; dst, src +%if cpuflag(ssse3) + movddup %1, %2 +%else + movd %1, %2 + punpcklqdq %1, %1 +%endif +%endmacro + ;----------------------------------------------------------------------------- ; int trellis_cabac_4x4_psy( ; const int *unquant_mf, const uint8_t *zigzag, int lambda2, @@ -186,12 +195,11 @@ mov dword levelgt1_ctxm, 9 %endif %if psy - movd m6, psy_trellism + LOAD_DUP m6, psy_trellism %define psy_trellis m6 %elif dc - movd m6, [unquant_mfq] + LOAD_DUP m6, [unquant_mfq] paddd m6, m6 - punpcklqdq m6, m6 %define unquant_mf m6 %endif %ifdef PIC @@ -333,13 +341,12 @@ movd m0, abs_leveld mov r6, orig_coefsm %if HIGH_BIT_DEPTH - movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF] + LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF] %else - movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] - psrad m1, 16 + LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] + psrad m1, 16 ; sign_coef %endif punpcklqdq m0, m0 ; quant_coef - punpcklqdq m1, m1 ; sign_coef %if cpuflag(ssse3) pabsd m0, m0 pabsd m2, m1 ; abs_coef @@ -403,11 +410,10 @@ %else %ifdef PIC mov r10, unquant_mfm - movd m3, [r10 + zigzagiq*4] + LOAD_DUP m3, [r10 + zigzagiq*4] %else - movd m3, [unquant_mfq + zigzagiq*4] + LOAD_DUP m3, [unquant_mfq + zigzagiq*4] %endif - punpcklqdq m3, m3 pmuludq m0, m3 %endif paddd m0, [pq_128] @@ -420,8 +426,7 @@ %if dc psllq m0, 8 %else - movd m5, [dct_weight2_tab + zigzagiq*4 GLOBAL] - punpcklqdq m5, m5 + LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL] pmuludq m0, m5 %endif @@ -434,12 +439,11 @@ ; ssd1[k] -= psy_weight * psy_value; mov r6, fenc_dctm %if HIGH_BIT_DEPTH - movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF] + LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF] %else - movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] + LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] psrad m3, 16 ; orig_coef %endif - punpcklqdq m3, m3 %if cpuflag(ssse3) psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef) %else @@ -453,9 +457,8 @@ ABSD m3, m4 SWAP 4, 3 %endif - movd m1, [dct_weight1_tab + zigzagiq*4 GLOBAL] + LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL] pmuludq m1, psy_trellis - punpcklqdq m1, m1 pmuludq m4, m1 psubq m0, m4 %if %1
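LOAD_DUP factors out a recurring pattern in this file: place one 32-bit value in dwords 0 and 2 of an xmm register, the only lanes pmuludq reads. Under cpuflag(ssse3) a single movddup does it (it loads 8 bytes, so the wanted dword lands in lanes 0 and 2 with a don't-care neighbor in lanes 1 and 3); older targets need movd plus punpcklqdq. An intrinsics sketch of the same idea; the runtime flag is illustrative, since the real macro selects at assembly time:

    #include <stdint.h>
    #include <emmintrin.h>   /* SSE2 */
    #include <pmmintrin.h>   /* SSE3 movddup */

    static __m128i load_dup( const int32_t *p, int have_sse3 )
    {
        if( have_sse3 )      /* assumes 8 readable bytes at p, as the asm does */
            return _mm_castpd_si128( _mm_loaddup_pd( (const double*)p ) );
        __m128i v = _mm_cvtsi32_si128( *p );  /* [v 0 0 0] */
        return _mm_unpacklo_epi64( v, v );    /* [v 0 v 0] */
    }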
x264-snapshot-20130224-2245.tar.bz2/common/x86/util.h -> x264-snapshot-20130723-2245.tar.bz2/common/x86/util.h
Changed
@@ -121,42 +121,132 @@ return amvd; } +#define x264_predictor_clip x264_predictor_clip_mmx2 +static int ALWAYS_INLINE x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) +{ + static const uint32_t pd_32 = 0x20; + intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; + + asm( + "movq (%2), %%mm5 \n" + "movd %6, %%mm3 \n" + "psllw $2, %%mm5 \n" // Convert to subpel + "pshufw $0xEE, %%mm5, %%mm6 \n" + "dec %k3 \n" + "jz 2f \n" // if( i_mvc == 1 ) {do the last iteration} + "punpckldq %%mm3, %%mm3 \n" + "punpckldq %%mm5, %%mm5 \n" + "movd %7, %%mm4 \n" + "lea (%0,%3,4), %3 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "add $8, %0 \n" + "movq %%mm3, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm1 \n" // mv == pmv + "pcmpeqd %%mm0, %%mm2 \n" // mv == 0 + "por %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1 + "pmovmskb %%mm2, %k2 \n" // (mv == pmv || mv == 0) * 0xf + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "pand %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32 + "psrlq %%mm2, %%mm0 \n" // drop mv0 if it's skipped + "movq %%mm0, (%5,%4,4) \n" + "and $24, %k2 \n" + "add $2, %4 \n" + "add $8, %k2 \n" + "shr $4, %k2 \n" // (4-val)>>1 + "sub %2, %4 \n" // +1 for each valid motion vector + "cmp %3, %0 \n" + "jl 1b \n" + "jg 3f \n" // if( i == i_mvc - 1 ) {do the last iteration} + + /* Do the last iteration */ + "2: \n" + "movd (%0), %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm3 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm3, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "movd %%mm0, (%5,%4,4) \n" + "inc %4 \n" + "and $1, %k2 \n" + "sub %2, %4 \n" // output += !(mv == pmv || mv == 0) + "3: \n" + :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i) + :"r"(dst), "g"(pmv), "m"(pd_32), "m"(M64( mvc )) + ); + return i; +} + +/* Same as the above, except we do (mv + 2) >> 2 on the input. 
*/ #define x264_predictor_roundclip x264_predictor_roundclip_mmx2 -static void ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max ) +static int ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) { - uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min ); - uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max ); static const uint64_t pw_2 = 0x0002000200020002ULL; - intptr_t i = i_mvc; + static const uint32_t pd_32 = 0x20; + intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; + asm( - "movd %2, %%mm5 \n" - "movd %3, %%mm6 \n" - "movq %4, %%mm7 \n" - "punpckldq %%mm5, %%mm5 \n" - "punpckldq %%mm6, %%mm6 \n" - "test $1, %0 \n" - "jz 1f \n" - "movd -4(%6,%0,4), %%mm0 \n" - "paddw %%mm7, %%mm0 \n" - "psraw $2, %%mm0 \n" - "pmaxsw %%mm5, %%mm0 \n" - "pminsw %%mm6, %%mm0 \n" - "movd %%mm0, -4(%5,%0,4) \n" - "dec %0 \n" - "jz 2f \n" - "1: \n" - "movq -8(%6,%0,4), %%mm0 \n" - "paddw %%mm7, %%mm0 \n" - "psraw $2, %%mm0 \n" - "pmaxsw %%mm5, %%mm0 \n" - "pminsw %%mm6, %%mm0 \n" - "movq %%mm0, -8(%5,%0,4) \n" - "sub $2, %0 \n" - "jnz 1b \n" - "2: \n" - :"+r"(i), "=m"(M64( dst )) - :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(dst), "r"(mvc), "m"(M64( mvc )) + "movq (%2), %%mm5 \n" + "movq %6, %%mm7 \n" + "movd %7, %%mm3 \n" + "pshufw $0xEE, %%mm5, %%mm6 \n" + "dec %k3 \n" + "jz 2f \n" + "punpckldq %%mm3, %%mm3 \n" + "punpckldq %%mm5, %%mm5 \n" + "movd %8, %%mm4 \n" + "lea (%0,%3,4), %3 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "add $8, %0 \n" + "paddw %%mm7, %%mm0 \n" + "psraw $2, %%mm0 \n" + "movq %%mm3, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm1 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm1, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "pand %%mm4, %%mm2 \n" + "psrlq %%mm2, %%mm0 \n" + "movq %%mm0, (%5,%4,4) \n" + "and $24, %k2 \n" + "add $2, %4 \n" + "add $8, %k2 \n" + "shr $4, %k2 \n" + "sub %2, %4 \n" + "cmp %3, %0 \n" + "jl 1b \n" + "jg 3f \n" + + /* Do the last iteration */ + "2: \n" + "movd (%0), %%mm0 \n" + "paddw %%mm7, %%mm0 \n" + "psraw $2, %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm3 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm3, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "movd %%mm0, (%5,%4,4) \n" + "inc %4 \n" + "and $1, %k2 \n" + "sub %2, %4 \n" + "3: \n" + :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i) + :"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(M64( mvc )) ); + return i; } #endif
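The rewritten helpers change contract: both now return how many candidates they kept, take the fullpel limits as a packed int16_t[2][2], and drop candidates equal to the predicted MV or to zero while clamping; the roundclip variant additionally rounds each candidate to fullpel with (mv+2)>>2 before comparing and clamping. A hedged C model of predictor_clip's observable behaviour (the SIMD code may scribble one extra clamped entry past the returned count, which callers never read):

    #include <stdint.h>

    static int predictor_clip_ref( int16_t (*dst)[2], int16_t (*mvc)[2],
                                   int i_mvc, int16_t mv_limit[2][2],
                                   uint32_t pmv )
    {
        int count = 0;
        for( int i = 0; i < i_mvc; i++ )
        {
            uint32_t mv = (uint32_t)(uint16_t)mvc[i][1] << 16 | (uint16_t)mvc[i][0];
            if( mv == pmv || mv == 0 )          /* redundant candidate */
                continue;
            for( int c = 0; c < 2; c++ )        /* clamp to subpel range */
            {
                int v  = mvc[i][c];
                int lo = mv_limit[0][c] * 4;    /* the psllw $2 in the asm */
                int hi = mv_limit[1][c] * 4;
                dst[count][c] = v < lo ? lo : v > hi ? hi : v;
            }
            count++;
        }
        return count;
    }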
x264-snapshot-20130224-2245.tar.bz2/common/x86/x86inc.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/x86inc.asm
Changed
@@ -6,7 +6,7 @@ ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Anton Mitrofanov <BugMaster@narod.ru> ;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above @@ -34,8 +34,12 @@ ; as this feature might be useful for others as well. Send patches or ideas ; to x264-devel@videolan.org . -%ifndef program_name - %define program_name x264 +%ifndef private_prefix + %define private_prefix x264 +%endif + +%ifndef public_prefix + %define public_prefix private_prefix %endif %define WIN64 0 @@ -56,29 +60,12 @@ %define mangle(x) x %endif -; Name of the .rodata section. -; Kludge: Something on OS X fails to align .rodata even given an align attribute, -; so use a different read-only section. %macro SECTION_RODATA 0-1 16 - %ifidn __OUTPUT_FORMAT__,macho64 - SECTION .text align=%1 - %elifidn __OUTPUT_FORMAT__,macho - SECTION .text align=%1 - fakegot: - %elifidn __OUTPUT_FORMAT__,aout - section .text - %else - SECTION .rodata align=%1 - %endif + SECTION .rodata align=%1 %endmacro -; aout does not support align= %macro SECTION_TEXT 0-1 16 - %ifidn __OUTPUT_FORMAT__,aout - SECTION .text - %else - SECTION .text align=%1 - %endif + SECTION .text align=%1 %endmacro %if WIN64 @@ -323,14 +310,18 @@ %if stack_size < 0 %assign stack_size -stack_size %endif - %if mmsize != 8 - %assign xmm_regs_used %2 + %assign stack_size_padded stack_size + %if WIN64 + %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 + %endif + %endif %endif %if mmsize <= 16 && HAVE_ALIGNED_STACK - %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) - %if xmm_regs_used > 6 - %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16 - %endif + %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) SUB rsp, stack_size_padded %else %assign %%reg_num (regs_used - 1) @@ -340,14 +331,6 @@ ; stack in a single instruction (i.e. mov rsp, rstk or mov ; rsp, [rsp+stack_size_padded]) mov rstk, rsp - %assign stack_size_padded stack_size - %if xmm_regs_used > 6 - %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16 - %if mmsize == 32 && xmm_regs_used & 1 - ; re-align to 32 bytes - %assign stack_size_padded (stack_size_padded + 16) - %endif - %endif %if %1 < 0 ; need to store rsp on stack sub rsp, gprsize+stack_size_padded and rsp, ~(%%stack_alignment-1) @@ -359,9 +342,7 @@ %xdefine rstkm rstk %endif %endif - %if xmm_regs_used > 6 - WIN64_PUSH_XMM - %endif + WIN64_PUSH_XMM %endif %endif %endmacro @@ -422,40 +403,55 @@ %endmacro %macro WIN64_PUSH_XMM 0 - %assign %%i xmm_regs_used - %rep (xmm_regs_used-6) - %assign %%i %%i-1 - movdqa [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i - %endrep + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
+ %if xmm_regs_used > 6 + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + movaps [rstk + stack_offset + 24], xmm7 + %endif + %if xmm_regs_used > 8 + %assign %%i 8 + %rep xmm_regs_used-8 + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif %endmacro %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 6 - SUB rsp, (xmm_regs_used-6)*16+16 - WIN64_PUSH_XMM + %if xmm_regs_used > 8 + %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 + SUB rsp, stack_size_padded %endif + WIN64_PUSH_XMM %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 1 - %if xmm_regs_used > 6 + %assign %%pad_size 0 + %if xmm_regs_used > 8 %assign %%i xmm_regs_used - %rep (xmm_regs_used-6) + %rep xmm_regs_used-8 %assign %%i %%i-1 - movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)] + movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] %endrep - %if stack_size_padded == 0 - add %1, (xmm_regs_used-6)*16+16 - %endif %endif %if stack_size_padded > 0 %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) mov rsp, rstkm %else add %1, stack_size_padded + %assign %%pad_size stack_size_padded %endif %endif + %if xmm_regs_used > 7 + movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %endif %endmacro %macro WIN64_RESTORE_XMM 1 @@ -643,38 +639,48 @@ ; Applies any symbol mangling needed for C linkage, and sets up a define such that ; subsequent uses of the function name automatically refer to the mangled version. ; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). %macro cglobal 1-2+ "" ; name, [PROLOGUE args] - ; the "" is a workaround for nasm, which fails if SUFFIX is empty - ; and we call cglobal_internal with just %1 %+ SUFFIX (without %2) - cglobal_internal %1 %+ SUFFIX, %2 + cglobal_internal 1, %1 %+ SUFFIX, %2 %endmacro -%macro cglobal_internal 1-2+ - %ifndef cglobaled_%1 - %xdefine %1 mangle(program_name %+ _ %+ %1) - %xdefine %1.skip_prologue %1 %+ .skip_prologue - CAT_XDEFINE cglobaled_, %1, 1 +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix
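Context for the WIN64_PUSH_XMM rewrite: the Win64 ABI reserves a 32-byte shadow space directly above the return address for the callee's free use. Relative to the return address at rstk + stack_offset, offsets +8 and +24 fall inside that space and happen to be 16-byte aligned, so xmm6 and xmm7 can be saved with movaps at zero allocation cost; only xmm8 and up still need explicit stack, and the separate "+ 32" in ALLOC_STACK reserves shadow space for this function's own callees. Frame sketch, high addresses first:

    rstk + stack_offset + 24   xmm7 save slot  \  caller-provided
    rstk + stack_offset + 8    xmm6 save slot  /  32-byte shadow space
    rstk + stack_offset + 0    return address
    rsp  + stack_size + 32...  xmm8..xmmN-1 saves (explicitly allocated)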
x264-snapshot-20130224-2245.tar.bz2/common/x86/x86util.asm -> x264-snapshot-20130723-2245.tar.bz2/common/x86/x86util.asm
Changed
@@ -30,10 +30,14 @@ %assign SIZEOF_PIXEL 1 %assign SIZEOF_DCTCOEF 2 %define pixel byte +%define vpbroadcastdct vpbroadcastw +%define vpbroadcastpix vpbroadcastb %if HIGH_BIT_DEPTH %assign SIZEOF_PIXEL 2 %assign SIZEOF_DCTCOEF 4 %define pixel word + %define vpbroadcastdct vpbroadcastd + %define vpbroadcastpix vpbroadcastw %endif %assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE @@ -52,7 +56,10 @@ %macro SBUTTERFLY 4 -%if avx_enabled && mmsize == 16 +%ifidn %1, dqqq + vperm2i128 m%4, m%2, m%3, q0301 ; punpckh + vinserti128 m%2, m%2, xm%3, 1 ; punpckl +%elif avx_enabled && mmsize >= 16 punpckh%1 m%4, m%2, m%3 punpckl%1 m%2, m%3 %else @@ -214,15 +221,20 @@ %endif %endmacro -%macro ABSD 2 +%macro ABSD 2-3 %if cpuflag(ssse3) pabsd %1, %2 %else - pxor %1, %1 - pcmpgtd %1, %2 - pxor %2, %1 - psubd %2, %1 - SWAP %1, %2 + %define %%s %2 +%if %0 == 3 + mova %3, %2 + %define %%s %3 +%endif + pxor %1, %1 + pcmpgtd %1, %%s + pxor %%s, %1 + psubd %%s, %1 + SWAP %1, %%s %endif %endmacro @@ -255,9 +267,13 @@ %endmacro %imacro SPLATW 2-3 0 - PSHUFLW %1, %2, (%3)*q1111 +%if cpuflag(avx2) && %3 == 0 + vpbroadcastw %1, %2 +%else + PSHUFLW %1, %2, (%3)*q1111 %if mmsize == 16 - punpcklqdq %1, %1 + punpcklqdq %1, %1 +%endif %endif %endmacro @@ -275,16 +291,24 @@ %endmacro %macro HADDD 2 ; sum junk -%if mmsize == 16 +%if sizeof%1 == 32 +%define %2 xmm%2 + vextracti128 %2, %1, 1 +%define %1 xmm%1 + paddd %1, %2 +%endif +%if mmsize >= 16 movhlps %2, %1 paddd %1, %2 %endif PSHUFLW %2, %1, q0032 paddd %1, %2 +%undef %1 +%undef %2 %endmacro %macro HADDW 2 ; reg, tmp -%if cpuflag(xop) && mmsize == 16 +%if cpuflag(xop) && sizeof%1 == 16 vphaddwq %1, %1 movhlps %2, %1 paddd %1, %2 @@ -294,22 +318,41 @@ %endif %endmacro -%macro HADDUW 2 -%if cpuflag(xop) && mmsize == 16 - vphadduwq %1, %1 - movhlps %2, %1 - paddd %1, %2 +%macro HADDUWD 2 +%if cpuflag(xop) && sizeof%1 == 16 + vphadduwd %1, %1 %else psrld %2, %1, 16 pslld %1, 16 psrld %1, 16 paddd %1, %2 - HADDD %1, %2 +%endif +%endmacro + +%macro HADDUW 2 +%if cpuflag(xop) && sizeof%1 == 16 + vphadduwq %1, %1 + movhlps %2, %1 + paddd %1, %2 +%else + HADDUWD %1, %2 + HADDD %1, %2 %endif %endmacro %macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp -%if cpuflag(ssse3) +; AVX2 version uses a precalculated extra input that +; can be re-used across calls +%if sizeof%1==32 + ; %3 = abcdefgh ijklmnop (lower address) + ; %2 = ABCDEFGH IJKLMNOP (higher address) +; vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH +%if %4 < 16 + palignr %1, %5, %3, %4 ; %1 = bcdefghi jklmnopA +%else + palignr %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO +%endif +%elif cpuflag(ssse3) %if %0==5 palignr %1, %2, %3, %4 %else @@ -475,7 +518,7 @@ %endif %elifidn %1, q shufps m%5, m%3, m%4, q3131 - shufps m%3, m%4, q2020 + shufps m%3, m%3, m%4, q2020 SWAP %4, %5 %endif %endmacro @@ -498,22 +541,24 @@ ; %5(%6): tmpregs %if %1!=0 ; have to reorder stuff for horizontal op %ifidn %2, sumsub - %define ORDER ord - ; sumsub needs order because a-b != b-a unless a=b + %define ORDER ord + ; sumsub needs order because a-b != b-a unless a=b %else - %define ORDER unord - ; if we just max, order doesn't matter (allows pblendw+or in sse4) + %define ORDER unord + ; if we just max, order doesn't matter (allows pblendw+or in sse4) %endif %if %1==1 - TRANS d, ORDER, %3, %4, %5, %6 + TRANS d, ORDER, %3, %4, %5, %6 %elif %1==2 - %if mmsize==8 - SBUTTERFLY dq, %3, %4, %5 - %else - TRANS q, ORDER, %3, %4, %5, %6 - %endif + %if mmsize==8 + SBUTTERFLY dq, %3, %4, %5 + %else + TRANS q, ORDER, %3, %4, %5, %6 + %endif %elif %1==4 - 
SBUTTERFLY qdq, %3, %4, %5 + SBUTTERFLY qdq, %3, %4, %5 + %elif %1==8 + SBUTTERFLY dqqq, %3, %4, %5 %endif %endif %ifidn %2, sumsub @@ -675,11 +720,18 @@ %endmacro -%macro LOAD_DIFF 5 +%macro LOAD_DIFF 5-6 1 %if HIGH_BIT_DEPTH +%if %6 ; %5 aligned? mova %1, %4
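Several macros here (SBUTTERFLY with the new dqqq mode, HADDD/HADDW/HADDUW, SPLATW, PALIGNR) learn to operate on ymm registers; the recurring trick for horizontal reductions is to fold the upper 128-bit lane onto the lower one first, then reduce within the xmm exactly as before. An AVX2 intrinsics sketch of the widened HADDD (illustrative, not the macro itself):

    #include <stdint.h>
    #include <immintrin.h>

    static int32_t haddd_ref( __m256i v )
    {
        __m128i x = _mm_add_epi32( _mm256_castsi256_si128( v ),
                                   _mm256_extracti128_si256( v, 1 ) );
        x = _mm_add_epi32( x, _mm_unpackhi_epi64( x, x ) );  /* fold high qword */
        x = _mm_add_epi32( x, _mm_shuffle_epi32( x, 1 ) );   /* fold dword 1    */
        return _mm_cvtsi128_si32( x );                       /* sum of 8 dwords */
    }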
x264-snapshot-20130224-2245.tar.bz2/configure -> x264-snapshot-20130723-2245.tar.bz2/configure
Changed
@@ -25,6 +25,7 @@ --system-libx264 use system libx264 instead of internal --enable-shared build shared library --enable-static build static library + --disable-opencl disable OpenCL features --disable-gpl disable GPL-only features --disable-thread disable multithreaded encoding --enable-win32thread use win32threads (windows only) @@ -46,7 +47,7 @@ --sysroot=SYSROOT root of cross-build tree External library support: - --disable-avs disable avisynth support (windows only) + --disable-avs disable avisynth support --disable-swscale disable swscale support --disable-lavf disable libavformat support --disable-ffms disable ffmpegsource support @@ -80,6 +81,9 @@ [[ "$arg" = -falign-loops* ]] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Wshadow ] && arg= + [[ "$arg" = -mpreferred-stack-boundary* ]] && arg= + [[ "$arg" = -l* ]] && arg= + [[ "$arg" = -L* ]] && arg= if [ $compiler = ICL ]; then [ "$arg" = -Wall ] && arg=-W0 [ "$arg" = -g ] && arg=-Z7 @@ -133,7 +137,7 @@ [ -n "$1" ] && echo "#include <$1>" > conftest.c echo "int main () { $3 return 0; }" >> conftest.c if [ $compiler = ICL ]; then - cc_cmd="$CC conftest.c $CFLAGS $2 -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" + cc_cmd="$CC conftest.c $(intel_cflags $CFLAGS $2) -link $(icl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" else cc_cmd="$CC conftest.c $CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest" fi @@ -273,6 +277,7 @@ bit_depth="8" chroma_format="all" compiler="GNU" +opencl="yes" CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)" LDFLAGS="$LDFLAGS" @@ -285,7 +290,7 @@ EXE="" # list of all preprocessor HAVE values we can define -CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT" +CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL" # parse options @@ -381,6 +386,9 @@ --host=*) host="$optarg" ;; + --disable-opencl) + opencl="no" + ;; --cross-prefix=*) cross_prefix="$optarg" ;; @@ -521,6 +529,13 @@ fi HAVE_GETOPT_LONG=0 ;; + *qnx*) + SYS="QNX" + define HAVE_MALLOC_H + libm="-lm" + HAVE_GETOPT_LONG=0 + CFLAGS="$CFLAGS -I\$(SRCPATH)/extras" + ;; *) die "Unknown system $host, edit the configure" ;; @@ -564,6 +579,7 @@ elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win32 -DPREFIX" LDFLAGS="$LDFLAGS -Wl,--large-address-aware" + [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-i386 $RCFLAGS" else ASFLAGS="$ASFLAGS -f elf" @@ -583,6 +599,7 @@ ASFLAGS="$ASFLAGS -f win32 -m amd64" # only the GNU toolchain is inconsistent in prefixing function names with _ [ $compiler = GNU ] && cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX" + [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-x86-64 $RCFLAGS" else ASFLAGS="$ASFLAGS -f elf -m amd64" @@ -703,6 +720,10 @@ exit 1 fi define HAVE_MMX + if cc_check '' -mpreferred-stack-boundary=5 ; then + CFLAGS="$CFLAGS -mpreferred-stack-boundary=5" + define HAVE_32B_STACK_ALIGNMENT + fi fi if [ $asm = auto -a $ARCH = ARM ] ; then @@ -770,6 +791,9 @@ thread="win32" fi ;; + QNX) + cc_check pthread.h -lc && thread="posix" && libpthread="-lc" + ;; *) cc_check pthread.h -lpthread && thread="posix" && libpthread="-lpthread" ;; @@ -917,8 +941,16 @@ avs="no" # 
cygwin can use avisynth if it can use LoadLibrary if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibrary(0);") ; then - avs="yes" + avs="avisynth" + define HAVE_AVS + define USE_AVXSYNTH 0 + elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then + # AvxSynth currently only supports Linux and OSX + avs="avxsynth" define HAVE_AVS + define USE_AVXSYNTH 1 + AVS_LIBS="-ldl" + LDFLAGSCLI="$AVS_LIBS $LDFLAGSCLI" fi fi @@ -978,6 +1010,7 @@ if [ "$bit_depth" -gt "8" ]; then define HIGH_BIT_DEPTH ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1" + opencl="no" else ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=0" fi @@ -992,6 +1025,30 @@ [ $interlaced = yes ] && define HAVE_INTERLACED && x264_interlaced=1 || x264_interlaced=0 +libdl="" +if [ "$opencl" = "yes" ]; then + opencl="no" + log_check "for perl" + output=$(perl -v) + if [ "$output" = "" ]; then + log_fail + echo 'OpenCL support requires perl to compile.' + echo 'use --disable-opencl to compile without OpenCL.' + exit 1 + fi + log_ok + # cygwin can use opencl if it can use LoadLibrary + if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibrary(0);") ; then + opencl="yes" + define HAVE_OPENCL + elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then + opencl="yes" + define HAVE_OPENCL + libdl="-ldl" + fi + LDFLAGS="$LDFLAGS $libdl" +fi + #define undefined vars as 0 for var in $CONFIG_HAVE; do grep -q "HAVE_$var 1" config.h || define HAVE_$var 0 @@ -1083,6 +1140,7 @@ PROF_GEN_LD=$PROF_GEN_LD PROF_USE_CC=$PROF_USE_CC PROF_USE_LD=$PROF_USE_LD +HAVE_OPENCL=$opencl EOF if [ $compiler = ICL ]; then @@ -1162,7 +1220,7 @@ Description: H.264 (MPEG4 AVC) encoder library Version: $(grep POINTVER < x264_config.h | sed -e 's/.* "//; s/".*//') Libs: -L$libdir -lx264 -Libs.private: $libpthread $libm +Libs.private: $libpthread $libm $libdl Cflags: -I$includedir EOF @@ -1186,6 +1244,7 @@ gpac: $gpac gpl: $gpl thread: $thread +opencl: $opencl filters: $filters debug: $debug gprof: $gprof
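Summary of the configure changes, for packagers: OpenCL-accelerated lookahead becomes an auto-detected feature. It requires perl at build time, is forced off for high-bit-depth builds, links -ldl on Linux and OS X, shows up in the pkg-config Libs.private line, and can be skipped explicitly with ./configure --disable-opencl. Independently, --disable-avs loses its "windows only" restriction because AvxSynth now covers Linux and OS X, a QNX target is added, GNU ld on Windows gains --nxcompat and --dynamicbase (DEP and ASLR), and GCC builds on x86 try -mpreferred-stack-boundary=5 to obtain the 32-byte stack alignment AVX2 code wants.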
x264-snapshot-20130224-2245.tar.bz2/doc/regression_test.txt -> x264-snapshot-20130723-2245.tar.bz2/doc/regression_test.txt
Changed
@@ -4,7 +4,7 @@ inherently caused by compression. # Install and compile x264 : -svn co svn://svn.videolan.org/x264/trunk x264 +git clone git://git.videolan.org/x264.git x264 cd x264 ./configure make
x264-snapshot-20130224-2245.tar.bz2/encoder/analyse.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/analyse.c
Changed
@@ -467,8 +467,8 @@ if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col ) h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv ); } - h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; - h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; + h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; + h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) ) { int mb_y = h->mb.i_mb_y >> SLICE_MBAFF; @@ -516,8 +516,8 @@ h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range ); h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] ); h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 ); - h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; - h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; + h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; + h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; } } if( PARAM_INTERLACED ) @@ -527,8 +527,8 @@ h->mb.mv_max[1] = h->mb.mv_maxy_row[i]; h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i]; h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i]; - h->mb.mv_min_fpel[1] = h->mb.mv_miny_fpel_row[i]; - h->mb.mv_max_fpel[1] = h->mb.mv_maxy_fpel_row[i]; + h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i]; + h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i]; } #undef CLIP_FMV @@ -888,7 +888,7 @@ { if( !h->mb.b_lossless && predict_mode[5] >= 0 ) { - int satd[9]; + ALIGNED_ARRAY_16( int32_t, satd,[9] ); h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd ); int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; satd[i_pred_mode] -= 3 * lambda; @@ -1006,7 +1006,7 @@ { if( !h->mb.b_lossless && predict_mode[5] >= 0 ) { - int satd[9]; + ALIGNED_ARRAY_16( int32_t, satd,[9] ); h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd ); int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; satd[i_pred_mode] -= 3 * lambda; @@ -1706,7 +1706,7 @@ static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size, int chroma ) { - ALIGNED_ARRAY_16( pixel, pix1,[16*16] ); + ALIGNED_ARRAY_N( pixel, pix1,[16*16] ); pixel *pix2 = pix1+8; int i_stride = h->mb.pic.i_stride[1]; int chroma_h_shift = chroma <= CHROMA_422; @@ -1890,8 +1890,8 @@ static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel ) { - ALIGNED_ARRAY_16( pixel, pix, [4],[16*16] ); - ALIGNED_ARRAY_16( pixel, bi, [2],[16*16] ); + ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] ); + ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] ); int i_chroma_cost = 0; int chromapix = h->luma2chroma_pixel[i_pixel]; @@ -1984,8 +1984,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) { - ALIGNED_ARRAY_16( pixel, pix0,[16*16] ); - ALIGNED_ARRAY_16( pixel, pix1,[16*16] ); + ALIGNED_ARRAY_N( pixel, pix0,[16*16] ); + ALIGNED_ARRAY_N( pixel, pix1,[16*16] ); pixel *src0, *src1; intptr_t stride0 = 16, stride1 = 16; int i_ref, i_mvc; @@ -2454,7 +2454,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd ) { - ALIGNED_ARRAY_16( pixel, pix,[2],[16*8] ); + ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] ); ALIGNED_4( int16_t mvc[3][2] ); h->mb.i_partition = D_16x8; @@ -2836,12 +2836,28 @@ int plane_count = CHROMA444 && h->mb.b_chroma_me ? 
3 : 1; int i_cost8 = 0, i_cost4 = 0; - for( int p = 0; p < plane_count; p++ ) + /* Not all platforms have a merged SATD function */ + if( h->pixf.sa8d_satd[PIXEL_16x16] ) { - i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, - h->mb.pic.p_fdec[p], FDEC_STRIDE ); - i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, - h->mb.pic.p_fdec[p], FDEC_STRIDE ); + uint64_t cost = 0; + for( int p = 0; p < plane_count; p++ ) + { + cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, + h->mb.pic.p_fdec[p], FDEC_STRIDE ); + + } + i_cost8 = (uint32_t)cost; + i_cost4 = (uint32_t)(cost >> 32); + } + else + { + for( int p = 0; p < plane_count; p++ ) + { + i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, + h->mb.pic.p_fdec[p], FDEC_STRIDE ); + i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, + h->mb.pic.p_fdec[p], FDEC_STRIDE ); + } } h->mb.b_transform_8x8 = i_cost8 < i_cost4; @@ -3002,8 +3018,8 @@ h->mb.i_qp = x264_ratecontrol_mb_qp( h ); /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB, * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */ - if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ) - h->mb.i_qp = h->mb.i_last_qp; + if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 ) + h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp; if( h->param.analyse.b_mb_info ) h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
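The merged cost call in the hunk above packs two metrics into one 64-bit return value: the low 32 bits accumulate the SA8D cost and the high 32 bits the SATD cost, so a single call per plane feeds both i_cost8 and i_cost4. A minimal standalone sketch of that packing convention, with a stand-in metric function rather than x264's real one:

#include <assert.h>
#include <stdint.h>

/* Stand-in for a merged cost function: SATD in the high half, SA8D in
 * the low half. Summing packed results across planes stays correct as
 * long as neither running 32-bit half can overflow into the other. */
static uint64_t sa8d_satd_16x16( uint32_t sa8d_cost, uint32_t satd_cost )
{
    return (uint64_t)satd_cost << 32 | sa8d_cost;
}

int main(void)
{
    uint64_t cost = 0;
    cost += sa8d_satd_16x16( 1000, 1400 );      /* plane 0 */
    cost += sa8d_satd_16x16(  900, 1100 );      /* plane 1 */
    uint32_t i_cost8 = (uint32_t)cost;          /* 1900: summed SA8D */
    uint32_t i_cost4 = (uint32_t)(cost >> 32);  /* 2500: summed SATD */
    assert( i_cost8 == 1900 && i_cost4 == 2500 );
    return 0;
}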
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/analyse.h -> x264-snapshot-20130723-2245.tar.bz2/encoder/analyse.h
Changed
@@ -34,7 +34,7 @@ void x264_macroblock_analyse( x264_t *h ); void x264_slicetype_decide( x264_t *h ); -void x264_slicetype_analyse( x264_t *h, int keyframe ); +void x264_slicetype_analyse( x264_t *h, int intra_minigop ); int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/cabac.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/cabac.c
Changed
@@ -152,8 +152,10 @@ int i_dqp = h->mb.i_qp - h->mb.i_last_qp; int ctx; - /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */ - if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] ) + /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely + * flat background area. Don't do this if it would raise the quantizer, since that could + * cause unexpected deblocking artifacts. */ + if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] && h->mb.i_qp > h->mb.i_last_qp ) { #if !RDO_SKIP_BS h->mb.i_qp = h->mb.i_last_qp; @@ -161,9 +163,7 @@ i_dqp = 0; } - /* Since, per the above, empty-CBP I16x16 blocks never have delta quants, - * we don't have to check for them. */ - ctx = h->mb.i_last_dqp && h->mb.cbp[h->mb.i_mb_prev_xy]; + ctx = h->mb.i_last_dqp && (h->mb.type[h->mb.i_mb_prev_xy] == I_16x16 || (h->mb.cbp[h->mb.i_mb_prev_xy]&0x3f)); if( i_dqp != 0 ) { @@ -644,26 +644,17 @@ } } -static const uint16_t significant_coeff_flag_offset[2][14] = -{ - { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 }, - { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 } -}; -static const uint16_t last_coeff_flag_offset[2][14] = -{ - { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748 }, - { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757 } -}; -static const uint16_t coeff_abs_level_m1_offset[14] = -{ - 227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766 -}; -#if RDO_SKIP_BS -extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][63]; +#if !RDO_SKIP_BS +extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][64]; extern const uint8_t x264_last_coeff_flag_offset_8x8[63]; extern const uint8_t x264_coeff_flag_offset_chroma_422_dc[7]; +extern const uint16_t x264_significant_coeff_flag_offset[2][16]; +extern const uint16_t x264_last_coeff_flag_offset[2][16]; +extern const uint16_t x264_coeff_abs_level_m1_offset[16]; +extern const uint8_t x264_count_cat_m1[14]; #else -const uint8_t x264_significant_coeff_flag_offset_8x8[2][63] = +/* Padded to [64] for easier addressing */ +const uint8_t x264_significant_coeff_flag_offset_8x8[2][64] = {{ 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, @@ -683,6 +674,21 @@ 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 }; const uint8_t x264_coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */ +const uint16_t x264_significant_coeff_flag_offset[2][16] = +{ + { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718, 0, 0 }, + { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733, 0, 0 } +}; +const uint16_t x264_last_coeff_flag_offset[2][16] = +{ + { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748, 0, 0 }, + { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757, 0, 0 } +}; +const uint16_t x264_coeff_abs_level_m1_offset[16] = +{ + 227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766 +}; +const uint8_t x264_count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63}; #endif // node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). 
@@ -694,20 +700,20 @@ /* 4:2:2 chroma dc uses a slightly different state machine for some reason, also note that * 4:2:0 chroma dc doesn't use the last state so it has identical output with both arrays. */ static const uint8_t coeff_abs_levelgt1_ctx_chroma_dc[8] = { 5, 5, 5, 5, 6, 7, 8, 8 }; + static const uint8_t coeff_abs_level_transition[2][8] = { /* update node ctx after coding a level=1 */ { 1, 2, 3, 3, 4, 5, 6, 7 }, /* update node ctx after coding a level>1 */ { 4, 4, 4, 4, 5, 6, 7, 7 } }; -static const uint8_t count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63}; #if !RDO_SKIP_BS static ALWAYS_INLINE void x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int chroma422dc ) { - int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat]; + int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat]; int coeff_idx = -1, node_ctx = 0; int last = h->quantf.coeff_last[ctx_block_cat]( l ); const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; @@ -747,7 +753,7 @@ } else { - int count_m1 = count_cat_m1[ctx_block_cat]; + int count_m1 = x264_count_cat_m1[ctx_block_cat]; if( count_m1 == 63 ) { const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; @@ -787,10 +793,20 @@ x264_cabac_encode_bypass( cb, coeff_sign ); } while( --coeff_idx >= 0 ); } -static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) + +void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 ); } + +static void ALWAYS_INLINE x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_c( h, cb, ctx_block_cat, l ); +#endif +} static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { /* Template a version specifically for chroma 4:2:2 DC in order to avoid @@ -806,16 +822,16 @@ static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc ) { const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; - int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; - int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat]; + int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat]; int last = h->quantf.coeff_last[ctx_block_cat]( l ); int coeff_abs = abs(l[last]); int ctx = coeff_abs_level1_ctx[0] + ctx_level; int node_ctx; const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; - if( last != (b_8x8 ? 63 : chroma422dc ? 7 : count_cat_m1[ctx_block_cat]) ) + if( last != (b_8x8 ? 63 : chroma422dc ? 
7 : x264_count_cat_m1[ctx_block_cat]) ) { x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] : chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 ); @@ -888,17 +904,35 @@ } } -static void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 1, 0 ); } -static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 ); + x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0, 0 ); } -static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) + +static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0, 0 ); +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l ); +#endif +} +static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l ); +#endif +} + +static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 ); } #endif @@ -1051,25 +1085,23 @@
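This cabac.c hunk does two things: the context-offset tables are exported with x264_ prefixes and padded to [2][64] and [16] ("for easier addressing", per the in-diff comment), and x264_cabac_block_residual becomes a thin dispatcher that calls a hand-optimized x86-64 kernel through h->bsf when available, falling back to the portable C routine otherwise. A sketch of that dispatch pattern under hypothetical names (the fast kernel below is a plain C stand-in, not real assembly):

#include <stdint.h>
#include <stdio.h>

typedef void (*residual_fn)( const int16_t *l, int ctx_block_cat );

static void block_residual_c( const int16_t *l, int ctx_block_cat )
{
    (void)l;
    printf( "portable C path, cat %d\n", ctx_block_cat );
}

/* Stand-in for the asm kernel a real build would select via CPU detection. */
static void block_residual_fast( const int16_t *l, int ctx_block_cat )
{
    (void)l;
    printf( "optimized path, cat %d\n", ctx_block_cat );
}

typedef struct { residual_fn cabac_block_residual; } bsf_t;

static void init_bitstream_functions( bsf_t *bsf, int have_fast_path )
{
    bsf->cabac_block_residual = have_fast_path ? block_residual_fast
                                               : block_residual_c;
}

int main(void)
{
    bsf_t bsf;
    int16_t coeffs[16] = { 3, -1, 0 };
    init_bitstream_functions( &bsf, 1 );    /* e.g. ARCH_X86_64 && HAVE_MMX */
    bsf.cabac_block_residual( coeffs, 5 );  /* one indirect call per block */
    return 0;
}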
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/cavlc.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/cavlc.c
Changed
@@ -128,13 +128,13 @@ unsigned int i_sign; /* level and run and total */ - /* set these to 2 to allow branchless i_trailing calculation */ - runlevel.level[1] = 2; - runlevel.level[2] = 2; i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel ); x264_prefetch( &x264_run_before[runlevel.mask] ); i_total_zero = runlevel.last + 1 - i_total; + /* branchless i_trailing calculation */ + runlevel.level[i_total+0] = 2; + runlevel.level[i_total+1] = 2; i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1 | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2) | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4); @@ -213,11 +213,14 @@ bs_t *s = &h->out.bs; int i_dqp = h->mb.i_qp - h->mb.i_last_qp; - /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */ + /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely + * flat background area. Don't do this if it would raise the quantizer, since that could + * cause unexpected deblocking artifacts. */ if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) && !h->mb.cache.non_zero_count[x264_scan8[LUMA_DC]] && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] - && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] ) + && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] + && h->mb.i_qp > h->mb.i_last_qp ) { #if !RDO_SKIP_BS h->mb.i_qp = h->mb.i_last_qp; @@ -268,20 +271,33 @@ } } -static inline void x264_cavlc_macroblock_luma_residual( x264_t *h, int i8start, int i8end ) +static ALWAYS_INLINE void x264_cavlc_macroblock_luma_residual( x264_t *h, int plane_count ) { if( h->mb.b_transform_8x8 ) { /* shuffle 8x8 dct coeffs into 4x4 lists */ - for( int i8 = i8start; i8 <= i8end; i8++ ) - if( h->mb.cache.non_zero_count[x264_scan8[i8*4]] ) - h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] ); + for( int p = 0; p < plane_count; p++ ) + for( int i8 = 0; i8 < 4; i8++ ) + if( h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] ) + h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[p*16+i8*4], h->dct.luma8x8[p*4+i8], + &h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] ); } - for( int i8 = i8start; i8 <= i8end; i8++ ) - if( h->mb.i_cbp_luma & (1 << (i8&3)) ) + for( int p = 0; p < plane_count; p++ ) + FOREACH_BIT( i8, 0, h->mb.i_cbp_luma ) for( int i4 = 0; i4 < 4; i4++ ) - x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4] ); + x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] ); +} + +static ALWAYS_INLINE void x264_cavlc_partition_luma_residual( x264_t *h, int i8, int p ) +{ + if( h->mb.b_transform_8x8 && h->mb.cache.non_zero_count[x264_scan8[i8*4]] ) + h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4+p*16], h->dct.luma8x8[i8+p*4], + &h->mb.cache.non_zero_count[x264_scan8[i8*4+p*16]] ); + + if( h->mb.i_cbp_luma & (1 << i8) ) + for( int i4 = 0; i4 < 4; i4++ ) + x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] ); } static void x264_cavlc_mb_header_i( x264_t *h, int i_mb_type, int i_mb_i_offset, int chroma ) @@ -552,7 +568,7 @@ else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma ) { x264_cavlc_qp_delta( h ); - x264_cavlc_macroblock_luma_residual( h, 0, plane_count*4-1 ); + x264_cavlc_macroblock_luma_residual( h, plane_count ); } if( h->mb.i_cbp_chroma ) { @@ -612,7 +628,7 @@ for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- ) { 
for( int p = 0; p < plane_count; p++ ) - x264_cavlc_macroblock_luma_residual( h, p*4+i8, p*4+i8 ); + x264_cavlc_partition_luma_residual( h, i8, p ); if( h->mb.i_cbp_chroma ) { if( CHROMA_FORMAT == CHROMA_422 ) @@ -665,7 +681,7 @@ h->out.bs.i_bits_encoded = x264_cavlc_intra4x4_pred_size( h, 4*i8, i_mode ); bs_write_ue( &h->out.bs, cbp_to_golomb[!CHROMA444][1][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] ); for( int p = 0; p < plane_count; p++ ) - x264_cavlc_macroblock_luma_residual( h, p*4+i8, p*4+i8 ); + x264_cavlc_partition_luma_residual( h, i8, p ); return h->out.bs.i_bits_encoded; }
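The cavlc.c fix moves the sentinel stores after coeff_level_run and bases them at i_total: writing runlevel.level[i_total] = runlevel.level[i_total+1] = 2 guarantees the three probes below always read defined values, with absent coefficients looking like magnitude > 1 and therefore not counted as trailing ones. The probe itself is a branchless |x| > 1 test: ((x+1) | (1-x)) has its sign bit set exactly when x < -1 or x > 1. A standalone check of that identity (it assumes the arithmetic right shift x264 already relies on):

#include <assert.h>

/* Branchless |x| > 1: x+1 is negative iff x < -1, and 1-x is negative
 * iff x > 1, so the OR's sign bit flags either case; a shift plus a
 * mask turns it into a 0/1 flag with no branch. */
static int abs_gt1( int x )
{
    return (((x + 1) | (1 - x)) >> 31) & 1;
}

int main(void)
{
    assert( !abs_gt1(-1) && !abs_gt1(0) && !abs_gt1(1) );
    assert(  abs_gt1(-2) &&  abs_gt1(2) && abs_gt1(37) );
    return 0;
}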
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/encoder.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/encoder.c
Changed
@@ -353,34 +353,49 @@ /* If we are within a reasonable distance of the end of the memory allocated for the bitstream, */ /* reallocate, adding an arbitrary amount of space. */ -static int x264_bitstream_check_buffer( x264_t *h ) +static int x264_bitstream_check_buffer_internal( x264_t *h, int size, int b_cabac, int i_nal ) { - uint8_t *bs_bak = h->out.p_bitstream; - int max_row_size = (2500 << SLICE_MBAFF) * h->mb.i_mb_width; - if( (h->param.b_cabac && (h->cabac.p_end - h->cabac.p < max_row_size)) || - (h->out.bs.p_end - h->out.bs.p < max_row_size) ) + if( (b_cabac && (h->cabac.p_end - h->cabac.p < size)) || + (h->out.bs.p_end - h->out.bs.p < size) ) { - h->out.i_bitstream += max_row_size; - CHECKED_MALLOC( h->out.p_bitstream, h->out.i_bitstream ); - h->mc.memcpy_aligned( h->out.p_bitstream, bs_bak, (h->out.i_bitstream - max_row_size) & ~15 ); - intptr_t delta = h->out.p_bitstream - bs_bak; + int buf_size = h->out.i_bitstream + size; + uint8_t *buf = x264_malloc( buf_size ); + if( !buf ) + return -1; + int aligned_size = h->out.i_bitstream & ~15; + h->mc.memcpy_aligned( buf, h->out.p_bitstream, aligned_size ); + memcpy( buf + aligned_size, h->out.p_bitstream + aligned_size, h->out.i_bitstream - aligned_size ); + + intptr_t delta = buf - h->out.p_bitstream; h->out.bs.p_start += delta; h->out.bs.p += delta; - h->out.bs.p_end = h->out.p_bitstream + h->out.i_bitstream; + h->out.bs.p_end = buf + buf_size; h->cabac.p_start += delta; h->cabac.p += delta; - h->cabac.p_end = h->out.p_bitstream + h->out.i_bitstream; + h->cabac.p_end = buf + buf_size; - for( int i = 0; i <= h->out.i_nal; i++ ) + for( int i = 0; i <= i_nal; i++ ) h->out.nal[i].p_payload += delta; - x264_free( bs_bak ); + + x264_free( h->out.p_bitstream ); + h->out.p_bitstream = buf; + h->out.i_bitstream = buf_size; } return 0; -fail: - x264_free( bs_bak ); - return -1; +} + +static int x264_bitstream_check_buffer( x264_t *h ) +{ + int max_row_size = (2500 << SLICE_MBAFF) * h->mb.i_mb_width; + return x264_bitstream_check_buffer_internal( h, max_row_size, h->param.b_cabac, h->out.i_nal ); +} + +static int x264_bitstream_check_buffer_filler( x264_t *h, int filler ) +{ + filler += 32; // add padding for safety + return x264_bitstream_check_buffer_internal( h, filler, 0, -1 ); } #if HAVE_THREAD @@ -417,17 +432,33 @@ static int x264_validate_parameters( x264_t *h, int b_open ) { #if HAVE_MMX -#ifdef __SSE__ - if( b_open && !(x264_cpu_detect() & X264_CPU_SSE) ) + if( b_open ) { - x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n"); + int cpuflags = x264_cpu_detect(); + int fail = 0; +#ifdef __SSE__ + if( !(cpuflags & X264_CPU_SSE) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm\n"); + fail = 1; + } #else - if( b_open && !(x264_cpu_detect() & X264_CPU_MMX2) ) - { - x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n"); + if( !(cpuflags & X264_CPU_MMX2) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm\n"); + fail = 1; + } #endif - x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n"); - return -1; + if( !fail && !(cpuflags & X264_CPU_CMOV) ) + { + x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n"); + fail = 1; + } + if( fail ) + { + x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n"); + return -1; + 
} } #endif @@ -503,8 +534,6 @@ if( h->param.i_threads == X264_THREADS_AUTO ) h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2; - if( h->param.i_lookahead_threads == X264_THREADS_AUTO ) - h->param.i_lookahead_threads = h->param.i_threads / (h->param.b_sliced_threads?1:6); int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 ); if( h->param.i_threads > 1 ) { @@ -518,7 +547,6 @@ h->param.i_threads = X264_MIN( h->param.i_threads, max_sliced_threads ); } h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX ); - h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) ); if( h->param.i_threads == 1 ) { h->param.b_sliced_threads = 0; @@ -528,6 +556,28 @@ if( h->i_thread_frames > 1 ) h->param.nalu_process = NULL; + if( h->param.b_opencl ) + { +#if !HAVE_OPENCL + x264_log( h, X264_LOG_WARNING, "OpenCL: not compiled with OpenCL support, disabling\n" ); + h->param.b_opencl = 0; +#elif BIT_DEPTH > 8 + x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth, disabling opencl\n" ); + h->param.b_opencl = 0; +#else + if( h->param.i_width < 32 || h->param.i_height < 32 ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: frame size is too small, disabling opencl\n" ); + h->param.b_opencl = 0; + } +#endif + if( h->param.opencl_device_id && h->param.i_opencl_device ) + { + x264_log( h, X264_LOG_WARNING, "OpenCL: device id and device skip count configured; dropping skip\n" ); + h->param.i_opencl_device = 0; + } + } + h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE ); if( h->param.i_keyint_max == 1 ) { @@ -646,7 +696,7 @@ h->param.rc.i_rc_method == X264_RC_ABR ) { x264_log( h, X264_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR\n" ); - h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate; + h->param.rc.i_bitrate = h->param.rc.i_vbv_max_bitrate; } } else if( h->param.rc.i_vbv_max_bitrate ) @@ -657,6 +707,22 @@ h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 ); h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 ); + h->param.i_slice_min_mbs = X264_MAX( h->param.i_slice_min_mbs, 0 ); + if( h->param.i_slice_max_mbs ) + h->param.i_slice_min_mbs = X264_MIN( h->param.i_slice_min_mbs, h->param.i_slice_max_mbs/2 ); + else if( !h->param.i_slice_max_size ) + h->param.i_slice_min_mbs = 0; + if( PARAM_INTERLACED && h->param.i_slice_min_mbs ) + { + x264_log( h, X264_LOG_WARNING, "interlace + slice-min-mbs is not implemented\n" ); + h->param.i_slice_min_mbs = 0; + } + int mb_width = (h->param.i_width+15)/16; + if( h->param.i_slice_min_mbs > mb_width ) + { + x264_log( h, X264_LOG_WARNING, "slice-min-mbs > row mb size (%d) not implemented\n", mb_width ); + h->param.i_slice_min_mbs = mb_width; + } int max_slices = (h->param.i_height+((16<<PARAM_INTERLACED)-1))/(16<<PARAM_INTERLACED); if( h->param.b_sliced_threads ) @@ -667,6 +733,8 @@ if( h->param.i_slice_max_mbs || h->param.i_slice_max_size ) h->param.i_slice_count = 0; } + if( h->param.i_slice_count_max > 0 ) + h->param.i_slice_count_max = X264_MAX( h->param.i_slice_count, h->param.i_slice_count_max ); if( h->param.b_bluray_compat ) { @@ -895,6 +963,35 @@ h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART ); + if( h->param.i_lookahead_threads == X264_THREADS_AUTO )
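The rewritten buffer check grows the bitstream into a fresh allocation, copies the old contents (an aligned bulk copy plus a plain memcpy for the unaligned tail), and rebases every live cursor and NAL payload pointer by the old-to-new distance; on allocation failure it now returns -1 without freeing the old buffer. A minimal sketch of the grow-and-rebase scheme; struct and field names are illustrative, and the pointer-difference rebase relies on the same flat-memory behavior the hunk does:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef struct
{
    uint8_t *buf, *p, *end;  /* base, write cursor, one past the end */
    int      size;
} stream_t;

static int stream_grow( stream_t *s, int extra )
{
    int new_size = s->size + extra;
    uint8_t *new_buf = malloc( new_size );
    if( !new_buf )
        return -1;                      /* old buffer remains valid */
    memcpy( new_buf, s->buf, s->size );
    intptr_t delta = new_buf - s->buf;  /* rebase cursors into the new block */
    s->p  += delta;
    s->end = new_buf + new_size;
    free( s->buf );
    s->buf  = new_buf;
    s->size = new_size;
    return 0;
}

int main(void)
{
    stream_t s = { malloc( 64 ), NULL, NULL, 64 };
    s.p   = s.buf + 40;                 /* pretend 40 bytes already written */
    s.end = s.buf + 64;
    if( stream_grow( &s, 1024 ) )
        return 1;
    return !(s.p - s.buf == 40 && s.end - s.buf == 1088);
}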
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/lookahead.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/lookahead.c
Changed
@@ -70,18 +70,19 @@ x264_stack_align( x264_slicetype_decide, h ); x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] ); + int shift_frames = h->lookahead->next.list[0]->i_bframes + 1; x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex ); while( h->lookahead->ofbuf.i_size == h->lookahead->ofbuf.i_max_size ) x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_empty, &h->lookahead->ofbuf.mutex ); x264_pthread_mutex_lock( &h->lookahead->next.mutex ); - x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 ); + x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames ); x264_pthread_mutex_unlock( &h->lookahead->next.mutex ); /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) - x264_stack_align( x264_slicetype_analyse, h, 1 ); + x264_stack_align( x264_slicetype_analyse, h, shift_frames ); x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); } @@ -236,11 +237,12 @@ x264_stack_align( x264_slicetype_decide, h ); x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] ); - x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 ); + int shift_frames = h->lookahead->next.list[0]->i_bframes + 1; + x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames ); /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) - x264_stack_align( x264_slicetype_analyse, h, 1 ); + x264_stack_align( x264_slicetype_analyse, h, shift_frames ); x264_lookahead_encoder_shift( h ); }
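Both lookahead paths now capture shift_frames from the head frame before x264_lookahead_shift removes it, and pass that count to x264_slicetype_analyse instead of the old constant 1, matching the keyframe to intra_minigop rename in analyse.h above. A toy queue showing why the value must be read before the shift (types here are illustrative):

#include <assert.h>

typedef struct { int i_bframes; } frame_t;
typedef struct { frame_t *list[8]; int size; } queue_t;

static void queue_shift( queue_t *q, int n )
{
    for( int i = n; i < q->size; i++ )
        q->list[i-n] = q->list[i];
    q->size -= n;
}

int main(void)
{
    frame_t a = { 2 }, b = { 0 };
    queue_t next = { { &a, &b, &b, &b }, 4 };
    /* Read the head frame's miniGOP length first; after the shift,
     * next.list[0] is a different frame and a re-read would be stale. */
    int shift_frames = next.list[0]->i_bframes + 1;
    queue_shift( &next, shift_frames );
    assert( shift_frames == 3 && next.list[0] == &b );
    return 0;
}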
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/macroblock.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/macroblock.c
Changed
@@ -128,8 +128,8 @@ pixel *p_src = h->mb.pic.p_fenc[p]; pixel *p_dst = h->mb.pic.p_fdec[p]; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] ); - ALIGNED_ARRAY_16( dctcoef, dct_dc4x4,[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] ); int nz, block_cbp = 0; int decimate_score = h->mb.b_dct_decimate ? 0 : 9; @@ -157,28 +157,51 @@ return; } + M32( &h->mb.cache.non_zero_count[x264_scan8[ 0+p*16]] ) = 0; + M32( &h->mb.cache.non_zero_count[x264_scan8[ 2+p*16]] ) = 0; + M32( &h->mb.cache.non_zero_count[x264_scan8[ 8+p*16]] ) = 0; + M32( &h->mb.cache.non_zero_count[x264_scan8[10+p*16]] ) = 0; + h->dctf.sub16x16_dct( dct4x4, p_src, p_dst ); - for( int i = 0; i < 16; i++ ) + if( h->mb.b_noise_reduction ) + for( int idx = 0; idx < 16; idx++ ) + h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 ); + + for( int idx = 0; idx < 16; idx++ ) { - /* copy dc coeff */ - if( h->mb.b_noise_reduction ) - h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[0], h->nr_offset[0], 16 ); - dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0]; - dct4x4[i][0] = 0; + dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0]; + dct4x4[idx][0] = 0; + } - /* quant/scan/dequant */ - if( h->mb.b_trellis ) - nz = x264_quant_4x4_trellis( h, dct4x4[i], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, i ); - else - nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); - h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz; - if( nz ) + if( h->mb.b_trellis ) + { + for( int idx = 0; idx < 16; idx++ ) + if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) ) + { + block_cbp = 0xf; + h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp ); + if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] ); + h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1; + } + } + else + { + for( int i8x8 = 0; i8x8 < 4; i8x8++ ) { - h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+i], dct4x4[i] ); - h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[i_quant_cat], i_qp ); - if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+i] ); - block_cbp = 0xf; + nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); + if( nz ) + { + block_cbp = 0xf; + FOREACH_BIT( idx, i8x8*4, nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp ); + if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] ); + h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1; + } + } } } @@ -245,6 +268,18 @@ h->mb.i_cbp_chroma = 0; h->nr_count[2] += h->mb.b_noise_reduction * 4; + M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0; + if( chroma422 ) + { + M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0; + } + /* Early termination: check variance of chroma residual before encoding. * Don't bother trying early termination at low QPs. 
* Values are experimentally derived. */ @@ -259,17 +294,6 @@ score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] ); if( score < thresh*4 ) { - M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0; - if( chroma422 ) - { - M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0; - } h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0; h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0; @@ -326,10 +350,10 @@ { pixel *p_src = h->mb.pic.p_fenc[1+ch]; pixel *p_dst = h->mb.pic.p_fdec[1+ch]; - int i_decimate_score = 0; + int i_decimate_score = b_decimate ? 0 : 7; int nz_ac = 0; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] ); if( h->mb.b_lossless ) { @@ -361,20 +385,40 @@ dct2x2dc( dct_dc, dct4x4 ); /* calculate dct coeffs */ - for( int i = 0; i < (chroma422?8:4); i++ ) + for( int i8x8 = 0; i8x8 < (chroma422?2:1); i8x8++ ) { if( h->mb.b_trellis ) - nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ); + { + for( int i4x4 = 0; i4x4 < 4; i4x4++ ) + { + if( x264_quant_4x4_trellis( h, dct4x4[i8x8*4+i4x4], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ) ) + { + int idx = 16+ch*16+i8x8*8+i4x4; + h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] ); + h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp ); + if( i_decimate_score < 7 ) + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = 1; + nz_ac = 1; + } + } + } else - nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] ); - h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz; - if( nz ) { - nz_ac = 1; - h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] ); - h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp ); - if( b_decimate ) - i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] ); + nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4IC+b_inter][i_qp], + h->quant4_bias[CQM_4IC+b_inter][i_qp] ); + nz_ac |= nz; + + FOREACH_BIT( i4x4, 0, nz ) + { + int idx = 16+ch*16+i8x8*8+i4x4; + + h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] ); + h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp ); + if( i_decimate_score < 7 ) + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = 1; + } } } @@ -390,7 +434,7 @@ h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc; - if( (b_decimate && i_decimate_score < 7) || !nz_ac ) + if( i_decimate_score < 7 || !nz_ac ) { /* Decimate the block */ M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0; @@ -646,11 +690,8 @@ { h->mb.b_transform_8x8 = 0; - for( int p = 0; p < plane_count; p++ ) - { + for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) x264_mb_encode_i16x16( h, p, i_qp ); - i_qp = h->mb.i_chroma_qp; - }
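The non-trellis luma and chroma paths above now quantize four 4x4 blocks per call through h->quantf.quant_4x4x4, which reports survivors as a 4-bit nonzero mask; zigzag scan, dequant, decimate scoring and the NNZ cache update then run only for set bits, iterated with FOREACH_BIT (defined in macroblock.h below). A stand-in demonstrating the mask convention; the divide-by-qscale quantizer is illustrative only:

#include <assert.h>

/* Quantize four 4x4 blocks at once; bit i of the return value is set
 * when block i kept at least one nonzero coefficient. */
static int quant_4x4x4( short dct[4][16], int qscale )
{
    int nz_mask = 0;
    for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 16; j++ )
        {
            dct[i][j] /= qscale;
            if( dct[i][j] )
                nz_mask |= 1 << i;
        }
    return nz_mask;
}

int main(void)
{
    short dct[4][16] = {{0}};
    dct[0][3] = 40;              /* survives quantization: 40/8 = 5 */
    dct[2][7] = 1;               /* quantized to zero */
    int nz = quant_4x4x4( dct, 8 );
    assert( nz == 0x1 );         /* only block 0 still needs coding */
    return 0;
}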
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/macroblock.h -> x264-snapshot-20130723-2245.tar.bz2/encoder/macroblock.h
Changed
@@ -104,12 +104,16 @@ M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+10]] ) = 0;\ } while(0) +/* A special for loop that iterates branchlessly over each set + * bit in a 4-bit input. */ +#define FOREACH_BIT(idx,start,mask) for( int idx = start, msk = mask, skip; msk && (skip = x264_ctz_4bit(msk), idx += skip, msk >>= skip+1, 1); idx++ ) + static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode, int b_predict ) { int nz; pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]]; pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]]; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] ); if( b_predict ) { @@ -147,7 +151,7 @@ int nz; pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE]; pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE]; - ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] ); + ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] ); ALIGNED_ARRAY_32( pixel, edge_buf,[36] ); if( b_predict )
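FOREACH_BIT keeps all of its state in the for-header: each iteration counts the trailing zeros of the remaining mask, advances idx by that amount to the next set bit, then strips that bit by shifting skip+1 positions. A self-contained rendering with a table-driven stand-in for x264_ctz_4bit:

#include <stdio.h>

/* Trailing-zero count for a 4-bit value (tab[0] = 4 is never used here,
 * since the loop condition requires a nonzero mask). */
static int ctz_4bit( int x )
{
    static const char tab[16] = { 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 };
    return tab[x & 0xf];
}

#define FOREACH_BIT(idx,start,mask) \
    for( int idx = start, msk = mask, skip; \
         msk && (skip = ctz_4bit(msk), idx += skip, msk >>= skip+1, 1); idx++ )

int main(void)
{
    /* mask 0b1010 with start 4: visits idx 5, then idx 7 */
    FOREACH_BIT( i8, 4, 0xA )
        printf( "%d\n", i8 );
    return 0;
}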
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/me.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/me.c
Changed
@@ -61,21 +61,22 @@ (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2]) #define COST_MV( mx, my )\ +do\ {\ int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\ &p_fref_w[(my)*stride+(mx)], stride )\ + BITS_MVD(mx,my);\ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\ -} +} while(0) -#define COST_MV_HPEL( mx, my ) \ -{ \ - intptr_t stride2 = 16; \ - pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \ - int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \ - + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ - COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \ -} +#define COST_MV_HPEL( mx, my, cost )\ +do\ +{\ + intptr_t stride2 = 16;\ + pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] );\ + cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 )\ + + p_cost_mvx[ mx ] + p_cost_mvy[ my ];\ +} while(0) #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\ {\ @@ -174,6 +175,10 @@ }\ } +#define FPEL(mv) (((mv)+2)>>2) /* Convert subpel MV to fullpel with rounding... */ +#define SPEL(mv) ((mv)<<2) /* ... and the reverse. */ +#define SPELx2(mv) (SPEL(mv)&0xFFFCFFFC) /* for two packed MVs */ + void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh ) { const int bw = x264_pixel_size[m->i_pixel].w; @@ -181,97 +186,136 @@ const int i_pixel = m->i_pixel; const int stride = m->i_stride[0]; int i_me_range = h->param.analyse.i_me_range; - int bmx, bmy, bcost; - int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX; + int bmx, bmy, bcost = COST_MAX; + int bpred_cost = COST_MAX; int omx, omy, pmx, pmy; pixel *p_fenc = m->p_fenc[0]; pixel *p_fref_w = m->p_fref_w; - ALIGNED_ARRAY_16( pixel, pix,[16*16] ); - - int costs[16]; - - int mv_x_min = h->mb.mv_min_fpel[0]; - int mv_y_min = h->mb.mv_min_fpel[1]; - int mv_x_max = h->mb.mv_max_fpel[0]; - int mv_y_max = h->mb.mv_max_fpel[1]; - int mv_x_min_qpel = mv_x_min << 2; - int mv_y_min_qpel = mv_y_min << 2; - int mv_x_max_qpel = mv_x_max << 2; - int mv_y_max_qpel = mv_y_max << 2; + ALIGNED_ARRAY_N( pixel, pix,[16*16] ); + ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] ); + + ALIGNED_ARRAY_16( int, costs,[16] ); + + int mv_x_min = h->mb.mv_limit_fpel[0][0]; + int mv_y_min = h->mb.mv_limit_fpel[0][1]; + int mv_x_max = h->mb.mv_limit_fpel[1][0]; + int mv_y_max = h->mb.mv_limit_fpel[1][1]; /* Special version of pack to allow shortcuts in CHECK_MVRANGE */ #define pack16to32_mask2(mx,my) ((mx<<16)|(my&0x7FFF)) uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min ); uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000; + uint32_t pmv, bpred_mv = 0; #define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000)) const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; - uint32_t pmv; - bmx = x264_clip3( m->mvp[0], mv_x_min_qpel, mv_x_max_qpel ); - bmy = x264_clip3( m->mvp[1], mv_y_min_qpel, mv_y_max_qpel ); - pmx = ( bmx + 2 ) >> 2; - pmy = ( bmy + 2 ) >> 2; - bcost = COST_MAX; - - /* try extra predictors if provided */ + /* Try extra predictors if provided. If subme >= 3, check subpel predictors, + * otherwise round them to fullpel. 
*/ if( h->mb.i_subpel_refine >= 3 ) { - pmv = pack16to32_mask(bmx,bmy); - if( i_mvc ) - COST_MV_HPEL( bmx, bmy ); - for( int i = 0; i < i_mvc; i++ ) + /* Calculate and check the MVP first */ + int bpred_mx = x264_clip3( m->mvp[0], SPEL(mv_x_min), SPEL(mv_x_max) ); + int bpred_my = x264_clip3( m->mvp[1], SPEL(mv_y_min), SPEL(mv_y_max) ); + pmv = pack16to32_mask( bpred_mx, bpred_my ); + pmx = FPEL( bpred_mx ); + pmy = FPEL( bpred_my ); + + COST_MV_HPEL( bpred_mx, bpred_my, bpred_cost ); + int pmv_cost = bpred_cost; + + if( i_mvc > 0 ) { - if( M32( mvc[i] ) && (pmv != M32( mvc[i] )) ) + /* Clip MV candidates and eliminate those equal to zero and pmv. */ + int valid_mvcs = x264_predictor_clip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv ); + if( valid_mvcs > 0 ) { - int mx = x264_clip3( mvc[i][0], mv_x_min_qpel, mv_x_max_qpel ); - int my = x264_clip3( mvc[i][1], mv_y_min_qpel, mv_y_max_qpel ); - COST_MV_HPEL( mx, my ); + int i = 1, cost; + /* We stuff pmv here to branchlessly pick between pmv and the various + * MV candidates. [0] gets skipped in order to maintain alignment for + * x264_predictor_clip. */ + M32( mvc_temp[1] ) = pmv; + bpred_cost <<= 4; + do + { + int mx = mvc_temp[i+1][0]; + int my = mvc_temp[i+1][1]; + COST_MV_HPEL( mx, my, cost ); + COPY1_IF_LT( bpred_cost, (cost << 4) + i ); + } while( ++i <= valid_mvcs ); + bpred_mx = mvc_temp[(bpred_cost&15)+1][0]; + bpred_my = mvc_temp[(bpred_cost&15)+1][1]; + bpred_cost >>= 4; } } - bmx = ( bpred_mx + 2 ) >> 2; - bmy = ( bpred_my + 2 ) >> 2; - COST_MV( bmx, bmy ); + + /* Round the best predictor back to fullpel and get the cost, since this is where + * we'll be starting the fullpel motion search. */ + bmx = FPEL( bpred_mx ); + bmy = FPEL( bpred_my ); + bpred_mv = pack16to32_mask(bpred_mx, bpred_my); + if( bpred_mv&0x00030003 ) /* Only test if the tested predictor is actually subpel... */ + COST_MV( bmx, bmy ); + else /* Otherwise just copy the cost (we already know it) */ + bcost = bpred_cost; + + /* Test the zero vector if it hasn't been tested yet. */ + if( pmv ) + { + if( bmx|bmy ) COST_MV( 0, 0 ); + } + /* If a subpel mv candidate was better than the zero vector, the previous + * fullpel check won't have gotten it even if the pmv was zero. So handle + * that possibility here. */ + else + { + COPY3_IF_LT( bcost, pmv_cost, bmx, 0, bmy, 0 ); + } } else { - /* check the MVP */ - bmx = pmx; - bmy = pmy; + /* Calculate and check the fullpel MVP first */ + bmx = pmx = x264_clip3( FPEL(m->mvp[0]), mv_x_min, mv_x_max ); + bmy = pmy = x264_clip3( FPEL(m->mvp[1]), mv_y_min, mv_y_max ); + pmv = pack16to32_mask( bmx, bmy ); + /* Because we are rounding the predicted motion vector to fullpel, there will be * an extra MV cost in 15 out of 16 cases. However, when the predicted MV is * chosen as the best predictor, it is often the case that the subpel search will - * result in a vector at or next to the predicted motion vector. Therefore, it is - * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly - * biasing against use of the predicted motion vector. */ + * result in a vector at or next to the predicted motion vector. Therefore, we omit + * the cost of the MV from the rounded MVP to avoid unfairly biasing against use of + * the predicted motion vector. + * + * Disclaimer: this is a post-hoc rationalization for why this hack works. 
*/ bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride ); - pmv = pack16to32_mask( bmx, bmy ); + if( i_mvc > 0 ) { - ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16],[2] ); - x264_predictor_roundclip( mvc_fpel+2, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max ); - M32( mvc_fpel[1] ) = pmv; - bcost <<= 4; - for( int i = 1; i <= i_mvc; i++ ) + /* Like in subme>=3, except we also round the candidates to fullpel. */ + int valid_mvcs = x264_predictor_roundclip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv );
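The subpel-predictor loop above packs each candidate's index into the low 4 bits of its shifted cost (bpred_cost <<= 4, then COPY1_IF_LT against (cost << 4) + i), so a single scalar minimum tracks both the best cost and which candidate produced it, and the winning MV is recovered from bpred_cost & 15. A worked example of the trick; it holds while costs stay below 1<<27, and ties resolve to the earlier candidate:

#include <assert.h>

#define COPY1_IF_LT(x,y) if( (y) < (x) ) (x) = (y);

int main(void)
{
    int costs[4] = { 900, 850, 870, 910 };
    int best = (costs[0] << 4) + 0;      /* cost in high bits, index in low */
    for( int i = 1; i < 4; i++ )
        COPY1_IF_LT( best, (costs[i] << 4) + i );
    assert( (best & 15) == 1 );          /* candidate 1 won */
    assert( (best >> 4) == 850 );        /* at its original cost */
    return 0;
}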
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/ratecontrol.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/ratecontrol.c
Changed
@@ -1118,7 +1118,8 @@ total_qp_aq += qp_aq; p = next; } - h->pps->i_pic_init_qp = SPEC_QP( (int)(total_qp_aq / rc->num_entries + 0.5) ); + if( !h->param.b_stitchable ) + h->pps->i_pic_init_qp = SPEC_QP( (int)(total_qp_aq / rc->num_entries + 0.5) ); x264_free( stats_buf ); @@ -1667,7 +1668,8 @@ rc->qpm = x264_clip3f( (prev_row_qp + rc->qpm)*0.5f, prev_row_qp + 1.0f, qp_max ); rc->qpa_rc = rc->qpa_rc_prev; rc->qpa_aq = rc->qpa_aq_prev; - h->fdec->i_row_bits[y] = h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; + h->fdec->i_row_bits[y] = 0; + h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; return -1; } } @@ -1683,7 +1685,8 @@ rc->qpm = qp_max; rc->qpa_rc = rc->qpa_rc_prev; rc->qpa_aq = rc->qpa_aq_prev; - h->fdec->i_row_bits[y] = h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; + h->fdec->i_row_bits[y] = 0; + h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; return -1; } } @@ -2591,14 +2594,16 @@ if( h->i_frame == 0 ) for( int i = 0; i < h->param.i_threads; i++ ) { - x264_ratecontrol_t *t = h->thread[i]->rc; - memcpy( t->row_preds, rc->row_preds, sizeof(rc->row_preds) ); + x264_t *t = h->thread[i]; + if( t != h ) + memcpy( t->rc->row_preds, rc->row_preds, sizeof(rc->row_preds) ); } for( int i = 0; i < h->param.i_threads; i++ ) { x264_t *t = h->thread[i]; - memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) ); + if( t != h ) + memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) ); t->rc->row_pred = &t->rc->row_preds[h->sh.i_type]; /* Calculate the planned slice size. */ if( rc->b_vbv && rc->frame_size_planned )
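The second thread loop above copies only a prefix of the rate-control state: memcpy with offsetof(x264_ratecontrol_t, row_pred) replicates every field declared before row_pred while leaving each thread's own row predictors in place, and the added t != h guard skips the self-copy (memcpy with identical source and destination is undefined behavior). A sketch of the prefix-copy pattern with an illustrative struct:

#include <assert.h>
#include <stddef.h>
#include <string.h>

typedef struct
{
    double qpm, qpa;      /* shared state: copied */
    int    frame_size;    /* shared state: copied */
    int   *row_pred;      /* per-thread from here on: preserved */
    int    row_preds[3];
} rc_t;

int main(void)
{
    rc_t main_rc   = { 24.0, 23.5, 40000, NULL, { 1, 2, 3 } };
    rc_t thread_rc = { 0 };
    thread_rc.row_pred = &thread_rc.row_preds[1];
    int *saved = thread_rc.row_pred;

    /* Copy everything up to, but not including, row_pred. */
    memcpy( &thread_rc, &main_rc, offsetof(rc_t, row_pred) );

    assert( thread_rc.frame_size == 40000 && thread_rc.row_pred == saved );
    return 0;
}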
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/rdo.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/rdo.c
Changed
@@ -634,13 +634,13 @@ const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac, int b_chroma, int dc, int num_coefs, int idx ) { - ALIGNED_ARRAY_16( dctcoef, orig_coefs, [64] ); - ALIGNED_ARRAY_16( dctcoef, quant_coefs, [64] ); + ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] ); + ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] ); const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab; const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; const int b_interlaced = MB_INTERLACED; - uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; - uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; + uint8_t *cabac_state_sig = &h->cabac.state[ x264_significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; + uint8_t *cabac_state_last = &h->cabac.state[ x264_last_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; int levelgt1_ctx = b_chroma && dc ? 8 : 9; if( dc ) @@ -683,7 +683,7 @@ } int last_nnz = h->quantf.coeff_last[ctx_block_cat]( quant_coefs+b_ac )+b_ac; - uint8_t *cabac_state = &h->cabac.state[ coeff_abs_level_m1_offset[ctx_block_cat] ]; + uint8_t *cabac_state = &h->cabac.state[ x264_coeff_abs_level_m1_offset[ctx_block_cat] ]; /* shortcut for dc-only blocks. * this doesn't affect the output, but saves some unnecessary computation. */ @@ -1161,5 +1161,6 @@ h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz; nzaccum |= nz; } + STORE_8x8_NNZ( 0, idx, 0 ); return nzaccum; }
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/set.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/set.c
Changed
@@ -208,9 +208,9 @@ ( csp >= X264_CSP_BGR ? 1 : 0 ) ); sps->vui.b_color_description_present = 0; - sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 8 ? param->vui.i_colorprim : 2 ); - sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 10 ? param->vui.i_transfer : 2 ); - sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 8 ? param->vui.i_colmatrix : + sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 9 ? param->vui.i_colorprim : 2 ); + sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 15 ? param->vui.i_transfer : 2 ); + sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 10 ? param->vui.i_colmatrix : ( csp >= X264_CSP_BGR ? 0 : 2 ) ); if( sps->vui.i_colorprim != 2 || sps->vui.i_transfer != 2 || @@ -430,7 +430,7 @@ pps->b_weighted_pred = param->analyse.i_weighted_pred > 0; pps->b_weighted_bipred = param->analyse.b_weighted_bipred ? 2 : 0; - pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR ? 26 + QP_BD_OFFSET : SPEC_QP( param->rc.i_qp_constant ); + pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR || param->b_stitchable ? 26 + QP_BD_OFFSET : SPEC_QP( param->rc.i_qp_constant ); pps->i_pic_init_qs = 26 + QP_BD_OFFSET; pps->i_chroma_qp_index_offset = param->analyse.i_chroma_qp_offset;
View file
x264-snapshot-20130723-2245.tar.bz2/encoder/slicetype-cl.c
Added
@@ -0,0 +1,780 @@ +/***************************************************************************** + * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead) + ***************************************************************************** + * Copyright (C) 2012-2013 x264 project + * + * Authors: Steve Borho <sborho@multicorewareinc.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macroblock.h" +#include "me.h" + +#if HAVE_OPENCL +#ifdef _WIN32 +#include <windows.h> +#endif + +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ); + +/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined + * in the OpenCL headers shipped with NVIDIA drivers. We need to be + * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */ +#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E + +#define OCLCHECK( method, ... 
)\ +do\ +{\ + if( h->opencl.b_fatal_error )\ + return -1;\ + status = ocl->method( __VA_ARGS__ );\ + if( status != CL_SUCCESS ) {\ + h->param.b_opencl = 0;\ + h->opencl.b_fatal_error = 1;\ + x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\ + return -1;\ + }\ +} while( 0 ) + +void x264_opencl_flush( x264_t *h ) +{ + x264_opencl_function_t *ocl = h->opencl.ocl; + + ocl->clFinish( h->opencl.queue ); + + /* Finish copies from the GPU by copying from the page-locked buffer to + * their final destination */ + for( int i = 0; i < h->opencl.num_copies; i++ ) + memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes ); + h->opencl.num_copies = 0; + h->opencl.pl_occupancy = 0; +} + +static void *x264_opencl_alloc_locked( x264_t *h, int bytes ) +{ + if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE ) + x264_opencl_flush( h ); + assert( bytes < PAGE_LOCKED_BUF_SIZE ); + char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy; + h->opencl.pl_occupancy += bytes; + return ptr; +} + +int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda ) +{ + if( fenc->b_intra_calculated ) + return 0; + fenc->b_intra_calculated = 1; + + x264_opencl_function_t *ocl = h->opencl.ocl; + int luma_length = fenc->i_stride[0] * fenc->i_lines[0]; + +#define CREATEBUF( out, flags, size )\ + out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\ + if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; } +#define CREATEIMAGE( out, flags, pf, width, height )\ + out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\ + if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; } + + int mb_count = h->mb.i_mb_count; + cl_int status; + + if( !h->opencl.lowres_mv_costs ) + { + /* Allocate shared memory buffers */ + int width = h->mb.i_mb_width * 8 * sizeof(pixel); + int height = h->mb.i_mb_height * 8 * sizeof(pixel); + + cl_image_format pixel_format; + pixel_format.image_channel_order = CL_R; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT32; + CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height ); + + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + { + pixel_format.image_channel_order = CL_RGBA; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT8; + CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height ); + width >>= 1; + height >>= 1; + } + + CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); + CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); + CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); + CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); + CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); + CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); + CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) ); + CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) ); + CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) ); + CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) ); + CREATEBUF( h->opencl.luma_16x16_image[0], 
CL_MEM_READ_ONLY, luma_length ); + CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length ); + } + + if( !fenc->opencl.intra_cost ) + { + /* Allocate per-frame buffers */ + int width = h->mb.i_mb_width * 8 * sizeof(pixel); + int height = h->mb.i_mb_height * 8 * sizeof(pixel); + + cl_image_format pixel_format; + pixel_format.image_channel_order = CL_R; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT32; + CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height ); + + for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) + { + pixel_format.image_channel_order = CL_RGBA; + pixel_format.image_channel_data_type = CL_UNSIGNED_INT8; + CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height ); + width >>= 1; + height >>= 1; + } + CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) ); + CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) ); + CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) ); + CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) ); + CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) ); + CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) ); + } +#undef CREATEBUF +#undef CREATEIMAGE + + /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */ + + char *locked = x264_opencl_alloc_locked( h, luma_length ); + memcpy( locked, fenc->plane[0], luma_length ); + OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL ); + + size_t gdim[2]; + if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor ) + { + int size = h->mb.i_mb_count * sizeof(int16_t); + locked = x264_opencl_alloc_locked( h, size ); + memcpy( locked, fenc->i_inv_qscale_factor, size ); + OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL ); + } + else + { + /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */ + cl_uint arg = 0; + int16_t value = 256; + OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor ); + OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value ); + gdim[0] = h->mb.i_mb_count; + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL ); + } + + int stride = fenc->i_stride[0]; + cl_uint arg = 0; + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] ); + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel ); + OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride ); + gdim[0] = 8 * h->mb.i_mb_width; + gdim[1] = 8 * h->mb.i_mb_height; + OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL ); + + for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ ) + { + /* Workaround for AMD Southern Island:
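x264_opencl_alloc_locked above is a bump allocator over a single page-locked staging buffer: requests carve consecutive slices from the front, and a request that would overflow calls x264_opencl_flush, which finishes the queued GPU-to-host copies and resets the pool. A host-only model of that behavior, with PAGE_LOCKED_BUF_SIZE and the flush hook reduced to stand-ins:

#include <assert.h>
#include <string.h>

#define POOL_SIZE 4096

typedef struct
{
    char pool[POOL_SIZE];
    int  occupancy;
    int  flushes;
} locked_pool_t;

static void pool_flush( locked_pool_t *p )
{
    /* x264 completes the queued device->host copies here before reuse */
    p->occupancy = 0;
    p->flushes++;
}

static void *pool_alloc( locked_pool_t *p, int bytes )
{
    if( p->occupancy + bytes >= POOL_SIZE )
        pool_flush( p );
    assert( bytes < POOL_SIZE );
    char *ptr = p->pool + p->occupancy;
    p->occupancy += bytes;
    return ptr;
}

int main(void)
{
    locked_pool_t pool = { .occupancy = 0 };
    for( int i = 0; i < 5; i++ )
        memset( pool_alloc( &pool, 1024 ), 0, 1024 );
    assert( pool.flushes == 1 );  /* the 4th request forced a flush */
    return 0;
}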
View file
x264-snapshot-20130224-2245.tar.bz2/encoder/slicetype.c -> x264-snapshot-20130723-2245.tar.bz2/encoder/slicetype.c
Changed
@@ -36,6 +36,18 @@ x264_frame_t **frames, int p0, int p1, int b, int b_intra_penalty ); +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ); + +#if HAVE_OPENCL +int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda ); +int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w ); +int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor ); +int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b ); +void x264_opencl_flush( x264_t *h ); +void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda ); +void x264_opencl_slicetype_end( x264_t *h ); +#endif + static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a ) { a->i_qp = X264_LOOKAHEAD_QP; @@ -60,7 +72,7 @@ w->i_offset = offset; w->i_denom = 7; w->i_scale = weight_nonh264; - while( w->i_denom > 0 && (w->i_scale > 127 || !(w->i_scale & 1)) ) + while( w->i_denom > 0 && (w->i_scale > 127) ) { w->i_denom--; w->i_scale >>= 1; @@ -276,7 +288,7 @@ return cost; } -static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ) +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ) { int i_delta_index = fenc->i_frame - ref->i_frame - 1; /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ @@ -286,21 +298,40 @@ SET_WEIGHT( weights[1], 0, 1, 0, 0 ); SET_WEIGHT( weights[2], 0, 1, 0, 0 ); int chroma_initted = 0; + float guess_scale[3]; + float fenc_mean[3]; + float ref_mean[3]; + for( int plane = 0; plane <= 2*!b_lookahead; plane++ ) + { + float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; + float ref_var = ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; + guess_scale[plane] = sqrtf( fenc_var / ref_var ); + fenc_mean[plane] = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); + ref_mean[plane] = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); + } + + int chroma_denom = 7; + if( !b_lookahead ) + { + /* make sure both our scale factors fit */ + while( chroma_denom > 0 ) + { + float thresh = 127.f / (1<<chroma_denom); + if( guess_scale[1] < thresh && guess_scale[2] < thresh ) + break; + chroma_denom--; + } + } + /* Don't check chroma in lookahead, or if there wasn't a luma weight. 
*/ for( int plane = 0; plane <= 2 && !( plane && ( !weights[0].weightfn || b_lookahead ) ); plane++ ) { - int cur_offset, start_offset, end_offset; int minoff, minscale, mindenom; unsigned int minscore, origscore; int found; - float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; - float ref_var = ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane]; - float guess_scale = sqrtf( fenc_var / ref_var ); - float fenc_mean = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); - float ref_mean = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); //early termination - if( fabsf( ref_mean - fenc_mean ) < 0.5f && fabsf( 1.f - guess_scale ) < epsilon ) + if( fabsf( ref_mean[plane] - fenc_mean[plane] ) < 0.5f && fabsf( 1.f - guess_scale[plane] ) < epsilon ) { SET_WEIGHT( weights[plane], 0, 1, 0, 0 ); continue; @@ -308,8 +339,8 @@ if( plane ) { - weights[plane].i_denom = 6; - weights[plane].i_scale = x264_clip3( round( guess_scale * 64 ), 0, 255 ); + weights[plane].i_denom = chroma_denom; + weights[plane].i_scale = x264_clip3( round( guess_scale[plane] * (1<<chroma_denom) ), 0, 255 ); if( weights[plane].i_scale > 127 ) { weights[1].weightfn = weights[2].weightfn = NULL; @@ -317,7 +348,7 @@ } } else - x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[plane] ); + x264_weight_get_h264( round( guess_scale[plane] * 128 ), 0, &weights[plane] ); found = 0; mindenom = weights[plane].i_denom; @@ -357,33 +388,65 @@ if( !minscore ) continue; - // This gives a slight improvement due to rounding errors but only tests one offset in lookahead. - // Currently only searches within +/- 1 of the best offset found so far. - // TODO: Try other offsets/multipliers/combinations thereof? - cur_offset = fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f * b_lookahead; - start_offset = x264_clip3( cur_offset - !b_lookahead, -128, 127 ); - end_offset = x264_clip3( cur_offset + !b_lookahead, -128, 127 ); - for( int i_off = start_offset; i_off <= end_offset; i_off++ ) + /* Picked somewhat arbitrarily */ + static const uint8_t weight_check_distance[][2] = + { + {0,0},{0,0},{0,1},{0,1}, + {0,1},{0,1},{0,1},{1,1}, + {1,1},{2,1},{2,1},{4,2} + }; + int scale_dist = b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][0]; + int offset_dist = b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][1]; + + int start_scale = x264_clip3( minscale - scale_dist, 0, 127 ); + int end_scale = x264_clip3( minscale + scale_dist, 0, 127 ); + for( int i_scale = start_scale; i_scale <= end_scale; i_scale++ ) { - SET_WEIGHT( weights[plane], 1, minscale, mindenom, i_off ); - unsigned int s; - if( plane ) + int cur_scale = i_scale; + int cur_offset = fenc_mean[plane] - ref_mean[plane] * cur_scale / (1 << mindenom) + 0.5f * b_lookahead; + if( cur_offset < - 128 || cur_offset > 127 ) { - if( CHROMA444 ) - s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane ); - else - s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] ); + /* Rescale considering the constraints on cur_offset. We do it in this order + * because scale has a much wider range than offset (because of denom), so + * it should almost never need to be clamped. 
*/ + cur_offset = x264_clip3( cur_offset, -128, 127 ); + cur_scale = (1 << mindenom) * (fenc_mean[plane] - cur_offset) / ref_mean[plane] + 0.5f; + cur_scale = x264_clip3( cur_scale, 0, 127 ); } - else - s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] ); - COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 ); + int start_offset = x264_clip3( cur_offset - offset_dist, -128, 127 ); + int end_offset = x264_clip3( cur_offset + offset_dist, -128, 127 ); + for( int i_off = start_offset; i_off <= end_offset; i_off++ ) + { + SET_WEIGHT( weights[plane], 1, cur_scale, mindenom, i_off ); + unsigned int s; + if( plane ) + { + if( CHROMA444 ) + s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane ); + else + s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] ); + } + else + s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] ); + COPY4_IF_LT( minscore, s, minscale, cur_scale, minoff, i_off, found, 1 ); - // Don't check any more offsets if the previous one had a lower cost than the current one - if( minoff == start_offset && i_off != start_offset ) - break; + // Don't check any more offsets if the previous one had a lower cost than the current one + if( minoff == start_offset && i_off != start_offset ) + break; + } } x264_emms(); + /* Use a smaller denominator if possible */ + if( !plane ) + { + while( mindenom > 0 && !(minscale&1) ) + { + mindenom--; + minscale >>= 1; + } + } + /* FIXME: More analysis can be done here on SAD vs. SATD termination. */ /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */ if( !found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f ) @@ -398,18 +461,29 @@ fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore; } - //FIXME, what is the correct way to deal with this? - if( weights[1].weightfn && weights[2].weightfn && weights[1].i_denom != weights[2].i_denom ) + /* Optimize and unify denominator */ + if( weights[1].weightfn || weights[2].weightfn ) {
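The slicetype.c hunk above generalizes the chroma weight denominator: instead of the fixed i_denom of 6, it searches downward from 7 for the largest denominator at which both chroma scale guesses still fit the 7-bit scale field, and later shrinks the luma denominator while the scale stays even. A minimal standalone sketch of that fitting loop, with illustrative names and inputs (compile with -lm):

#include <math.h>
#include <stdio.h>

static int fit_chroma_denom( float guess_u, float guess_v )
{
    int denom = 7;
    while( denom > 0 )
    {
        float thresh = 127.f / (1 << denom); /* largest scale that still fits */
        if( guess_u < thresh && guess_v < thresh )
            break;
        denom--;                             /* halve precision, double range */
    }
    return denom;
}

int main( void )
{
    float u = 1.02f, v = 2.9f;
    int denom = fit_chroma_denom( u, v );
    /* quantize the guess as the patch does; must come out <= 127 */
    int scale_u = (int)round( u * (1 << denom) );
    printf( "denom=%d scale_u=%d\n", denom, scale_u );
    return 0;
}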
View file
x264-snapshot-20130723-2245.tar.bz2/extras/avxsynth_c.h
Added
@@ -0,0 +1,727 @@ +// Avisynth C Interface Version 0.20 +// Copyright 2003 Kevin Atkinson + +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit +// http://www.gnu.org/copyleft/gpl.html . +// +// As a special exception, I give you permission to link to the +// Avisynth C interface with independent modules that communicate with +// the Avisynth C interface solely through the interfaces defined in +// avisynth_c.h, regardless of the license terms of these independent +// modules, and to copy and distribute the resulting combined work +// under terms of your choice, provided that every copy of the +// combined work is accompanied by a complete copy of the source code +// of the Avisynth C interface and Avisynth itself (with the version +// used to produce the combined work), being distributed under the +// terms of the GNU General Public License plus this exception. An +// independent module is a module which is not derived from or based +// on Avisynth C Interface, such as 3rd-party filters, import and +// export plugins, or graphical user interfaces. + +#ifndef __AVXSYNTH_C__ +#define __AVXSYNTH_C__ + +#include "windowsPorts/windows2linux.h" +#include <stdarg.h> + +#ifdef __cplusplus +# define EXTERN_C extern "C" +#else +# define EXTERN_C +#endif + +#define AVSC_USE_STDCALL 1 + +#ifndef AVSC_USE_STDCALL +# define AVSC_CC __cdecl +#else +# define AVSC_CC __stdcall +#endif + +#define AVSC_INLINE static __inline + +#ifdef AVISYNTH_C_EXPORTS +# define AVSC_EXPORT EXTERN_C +# define AVSC_API(ret, name) EXTERN_C __declspec(dllexport) ret AVSC_CC name +#else +# define AVSC_EXPORT EXTERN_C __declspec(dllexport) +# ifndef AVSC_NO_DECLSPEC +# define AVSC_API(ret, name) EXTERN_C __declspec(dllimport) ret AVSC_CC name +# else +# define AVSC_API(ret, name) typedef ret (AVSC_CC *name##_func) +# endif +#endif + +#ifdef __GNUC__ +typedef long long int INT64; +#else +typedef __int64 INT64; +#endif + + +///////////////////////////////////////////////////////////////////// +// +// Constants +// + +#ifndef __AVXSYNTH_H__ +enum { AVISYNTH_INTERFACE_VERSION = 3 }; +#endif + +enum {AVS_SAMPLE_INT8 = 1<<0, + AVS_SAMPLE_INT16 = 1<<1, + AVS_SAMPLE_INT24 = 1<<2, + AVS_SAMPLE_INT32 = 1<<3, + AVS_SAMPLE_FLOAT = 1<<4}; + +enum {AVS_PLANAR_Y=1<<0, + AVS_PLANAR_U=1<<1, + AVS_PLANAR_V=1<<2, + AVS_PLANAR_ALIGNED=1<<3, + AVS_PLANAR_Y_ALIGNED=AVS_PLANAR_Y|AVS_PLANAR_ALIGNED, + AVS_PLANAR_U_ALIGNED=AVS_PLANAR_U|AVS_PLANAR_ALIGNED, + AVS_PLANAR_V_ALIGNED=AVS_PLANAR_V|AVS_PLANAR_ALIGNED}; + + // Colorspace properties. 
+enum {AVS_CS_BGR = 1<<28, + AVS_CS_YUV = 1<<29, + AVS_CS_INTERLEAVED = 1<<30, + AVS_CS_PLANAR = 1<<31}; + + // Specific colorformats +enum { + AVS_CS_UNKNOWN = 0, + AVS_CS_BGR24 = 1<<0 | AVS_CS_BGR | AVS_CS_INTERLEAVED, + AVS_CS_BGR32 = 1<<1 | AVS_CS_BGR | AVS_CS_INTERLEAVED, + AVS_CS_YUY2 = 1<<2 | AVS_CS_YUV | AVS_CS_INTERLEAVED, + AVS_CS_YV12 = 1<<3 | AVS_CS_YUV | AVS_CS_PLANAR, // y-v-u, planar + AVS_CS_I420 = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR, // y-u-v, planar + AVS_CS_IYUV = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR // same as above +}; + +enum { + AVS_IT_BFF = 1<<0, + AVS_IT_TFF = 1<<1, + AVS_IT_FIELDBASED = 1<<2}; + +enum { + AVS_FILTER_TYPE=1, + AVS_FILTER_INPUT_COLORSPACE=2, + AVS_FILTER_OUTPUT_TYPE=9, + AVS_FILTER_NAME=4, + AVS_FILTER_AUTHOR=5, + AVS_FILTER_VERSION=6, + AVS_FILTER_ARGS=7, + AVS_FILTER_ARGS_INFO=8, + AVS_FILTER_ARGS_DESCRIPTION=10, + AVS_FILTER_DESCRIPTION=11}; + +enum { //SUBTYPES + AVS_FILTER_TYPE_AUDIO=1, + AVS_FILTER_TYPE_VIDEO=2, + AVS_FILTER_OUTPUT_TYPE_SAME=3, + AVS_FILTER_OUTPUT_TYPE_DIFFERENT=4}; + +enum { + AVS_CACHE_NOTHING=0, + AVS_CACHE_RANGE=1, + AVS_CACHE_ALL=2, + AVS_CACHE_AUDIO=3, + AVS_CACHE_AUDIO_NONE=4, + AVS_CACHE_AUDIO_AUTO=5 +}; + +#define AVS_FRAME_ALIGN 16 + +typedef struct AVS_Clip AVS_Clip; +typedef struct AVS_ScriptEnvironment AVS_ScriptEnvironment; + +///////////////////////////////////////////////////////////////////// +// +// AVS_VideoInfo +// + +// AVS_VideoInfo is layed out identicly to VideoInfo +typedef struct AVS_VideoInfo { + int width, height; // width=0 means no video + unsigned fps_numerator, fps_denominator; + int num_frames; + + int pixel_type; + + int audio_samples_per_second; // 0 means no audio + int sample_type; + INT64 num_audio_samples; + int nchannels; + + // Imagetype properties + + int image_type; +} AVS_VideoInfo; + +// useful functions of the above +AVSC_INLINE int avs_has_video(const AVS_VideoInfo * p) + { return (p->width!=0); } + +AVSC_INLINE int avs_has_audio(const AVS_VideoInfo * p) + { return (p->audio_samples_per_second!=0); } + +AVSC_INLINE int avs_is_rgb(const AVS_VideoInfo * p) + { return !!(p->pixel_type&AVS_CS_BGR); } + +AVSC_INLINE int avs_is_rgb24(const AVS_VideoInfo * p) + { return (p->pixel_type&AVS_CS_BGR24)==AVS_CS_BGR24; } // Clear out additional properties + +AVSC_INLINE int avs_is_rgb32(const AVS_VideoInfo * p) + { return (p->pixel_type & AVS_CS_BGR32) == AVS_CS_BGR32 ; } + +AVSC_INLINE int avs_is_yuv(const AVS_VideoInfo * p) + { return !!(p->pixel_type&AVS_CS_YUV ); } + +AVSC_INLINE int avs_is_yuy2(const AVS_VideoInfo * p) + { return (p->pixel_type & AVS_CS_YUY2) == AVS_CS_YUY2; } + +AVSC_INLINE int avs_is_yv12(const AVS_VideoInfo * p) + { return ((p->pixel_type & AVS_CS_YV12) == AVS_CS_YV12)||((p->pixel_type & AVS_CS_I420) == AVS_CS_I420); } + +AVSC_INLINE int avs_is_color_space(const AVS_VideoInfo * p, int c_space)
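For orientation, the bit-packed colorspace tests in the excerpt above combine one format bit with shared property bits, and avs_is_yv12() deliberately accepts I420 as well, since the two layouts differ only in U/V plane order. A standalone sketch of that test (constants copied from the header, written as unsigned so 1<<31 stays well-defined):

#include <stdio.h>

#define AVS_CS_YUV    (1u << 29)
#define AVS_CS_PLANAR (1u << 31)
#define AVS_CS_YV12   (1u << 3 | AVS_CS_YUV | AVS_CS_PLANAR) /* y-v-u */
#define AVS_CS_I420   (1u << 4 | AVS_CS_YUV | AVS_CS_PLANAR) /* y-u-v */

/* mirrors the header's avs_is_yv12(): match either plane order */
static int is_yv12( unsigned pixel_type )
{
    return (pixel_type & AVS_CS_YV12) == AVS_CS_YV12 ||
           (pixel_type & AVS_CS_I420) == AVS_CS_I420;
}

int main( void )
{
    printf( "yv12:%d i420:%d plain-yuv:%d\n",
            is_yv12( AVS_CS_YV12 ), is_yv12( AVS_CS_I420 ), is_yv12( AVS_CS_YUV ) );
    return 0;
}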
View file
x264-snapshot-20130723-2245.tar.bz2/extras/cl.h
Added
@@ -0,0 +1,1209 @@ +/******************************************************************************* + * Copyright (c) 2008 - 2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#include "cl_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +typedef cl_bitfield cl_command_queue_properties; +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +typedef cl_bitfield cl_mem_migration_flags; +typedef cl_uint cl_image_info; +typedef cl_uint cl_buffer_create_type; +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +typedef cl_uint cl_program_binary_type; +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +typedef cl_uint cl_kernel_work_group_info; +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; + + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; + cl_mem buffer; +} cl_image_desc; + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define 
CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#define CL_INVALID_PROPERTY -64 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 + +/* OpenCL Version */ +#define CL_VERSION_1_0 1 +#define CL_VERSION_1_1 1 +#define CL_VERSION_1_2 1 + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
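A consumer-side sketch, not part of the header: mapping a few of the cl_int error codes defined above to printable names, the way a caller might report OpenCL failures (codes copied from the excerpt; the helper name is invented):

#include <stdio.h>

#define CL_SUCCESS                  0
#define CL_DEVICE_NOT_FOUND        -1
#define CL_BUILD_PROGRAM_FAILURE  -11
#define CL_INVALID_VALUE          -30

static const char *cl_strerror( int err )
{
    switch( err )
    {
        case CL_SUCCESS:               return "CL_SUCCESS";
        case CL_DEVICE_NOT_FOUND:      return "CL_DEVICE_NOT_FOUND";
        case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE";
        case CL_INVALID_VALUE:         return "CL_INVALID_VALUE";
        default:                       return "unrecognized OpenCL error";
    }
}

int main( void )
{
    printf( "%s\n", cl_strerror( -11 ) );
    return 0;
}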
View file
x264-snapshot-20130723-2245.tar.bz2/extras/cl_platform.h
Added
@@ -0,0 +1,1268 @@ +/********************************************************************************** + * Copyright (c) 2008-2012 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include <AvailabilityMacros.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #ifndef UNAVAILABLE_ATTRIBUTE + #define UNAVAILABLE_ATTRIBUTE + #endif + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #else + #define CL_API_SUFFIX__VERSION_1_0 UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE + #endif + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + #else + #define CL_API_SUFFIX__VERSION_1_1 UNAVAILABLE_ATTRIBUTE + #define GCL_API_SUFFIX__VERSION_1_1 UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATE CL_EXT_SUFFIX__VERSION_1_0 + #endif + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define 
CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + #else + #define CL_API_SUFFIX__VERSION_1_2 UNAVAILABLE_ATTRIBUTE + #define GCL_API_SUFFIX__VERSION_1_2 UNAVAILABLE_ATTRIBUTE + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX__VERSION_1_1 + #endif +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_API_SUFFIX__VERSION_1_2 + #define CL_EXT_SUFFIX__VERSION_1_2 + + #ifdef __GNUC__ + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif + #elif _WIN32 + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) + #endif + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif +#endif + +#if (defined (_WIN32) && defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define 
CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561
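The typedefs above pin the OpenCL scalar types to exact widths on MSVC; the non-MSVC branch of the header (truncated here) is assumed to do the same via C99 types. A C11 sanity sketch of the sizes the spec requires, under that assumption:

#include <stdint.h>

typedef int8_t   cl_char;
typedef int32_t  cl_int;
typedef uint64_t cl_ulong;
typedef float    cl_float;
typedef double   cl_double;

/* widths OpenCL mandates regardless of the host ABI */
_Static_assert( sizeof(cl_char)   == 1, "cl_char must be 8-bit" );
_Static_assert( sizeof(cl_int)    == 4, "cl_int must be 32-bit" );
_Static_assert( sizeof(cl_ulong)  == 8, "cl_ulong must be 64-bit" );
_Static_assert( sizeof(cl_float)  == 4, "cl_float must be 32-bit" );
_Static_assert( sizeof(cl_double) == 8, "cl_double must be 64-bit" );

int main( void ) { return 0; }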
View file
x264-snapshot-20130723-2245.tar.bz2/extras/windowsPorts
Added
+(directory)
View file
x264-snapshot-20130723-2245.tar.bz2/extras/windowsPorts/basicDataTypeConversions.h
Added
@@ -0,0 +1,85 @@ +#ifndef __DATA_TYPE_CONVERSIONS_H__ +#define __DATA_TYPE_CONVERSIONS_H__ + +#include <stdint.h> +#include <wchar.h> + +#ifdef __cplusplus +namespace avxsynth { +#endif // __cplusplus + +typedef int64_t __int64; +typedef int32_t __int32; +#ifdef __cplusplus +typedef bool BOOL; +#else +typedef uint32_t BOOL; +#endif // __cplusplus +typedef void* HMODULE; +typedef void* LPVOID; +typedef void* PVOID; +typedef PVOID HANDLE; +typedef HANDLE HWND; +typedef HANDLE HINSTANCE; +typedef void* HDC; +typedef void* HBITMAP; +typedef void* HICON; +typedef void* HFONT; +typedef void* HGDIOBJ; +typedef void* HBRUSH; +typedef void* HMMIO; +typedef void* HACMSTREAM; +typedef void* HACMDRIVER; +typedef void* HIC; +typedef void* HACMOBJ; +typedef HACMSTREAM* LPHACMSTREAM; +typedef void* HACMDRIVERID; +typedef void* LPHACMDRIVER; +typedef unsigned char BYTE; +typedef BYTE* LPBYTE; +typedef char TCHAR; +typedef TCHAR* LPTSTR; +typedef const TCHAR* LPCTSTR; +typedef char* LPSTR; +typedef LPSTR LPOLESTR; +typedef const char* LPCSTR; +typedef LPCSTR LPCOLESTR; +typedef wchar_t WCHAR; +typedef unsigned short WORD; +typedef unsigned int UINT; +typedef UINT MMRESULT; +typedef uint32_t DWORD; +typedef DWORD COLORREF; +typedef DWORD FOURCC; +typedef DWORD HRESULT; +typedef DWORD* LPDWORD; +typedef DWORD* DWORD_PTR; +typedef int32_t LONG; +typedef int32_t* LONG_PTR; +typedef LONG_PTR LRESULT; +typedef uint32_t ULONG; +typedef uint32_t* ULONG_PTR; +//typedef __int64_t intptr_t; +typedef uint64_t _fsize_t; + + +// +// Structures +// + +typedef struct _GUID { + DWORD Data1; + WORD Data2; + WORD Data3; + BYTE Data4[8]; +} GUID; + +typedef GUID REFIID; +typedef GUID CLSID; +typedef CLSID* LPCLSID; +typedef GUID IID; + +#ifdef __cplusplus +}; // namespace avxsynth +#endif // __cplusplus +#endif // __DATA_TYPE_CONVERSIONS_H__
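The shim above works because every Windows alias is bound to an exact-width C99 type, so shared structures keep their byte-for-byte layout on Linux. A C11 sketch checking the GUID case from the header (4 + 2 + 2 + 8 bytes, no padding):

#include <stdint.h>

typedef uint32_t       DWORD;
typedef unsigned short WORD;
typedef unsigned char  BYTE;

typedef struct _GUID {
    DWORD Data1;
    WORD  Data2;
    WORD  Data3;
    BYTE  Data4[8];
} GUID;

_Static_assert( sizeof(GUID) == 16, "GUID must stay 16 bytes" );

int main( void ) { return 0; }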
View file
x264-snapshot-20130723-2245.tar.bz2/extras/windowsPorts/windows2linux.h
Added
@@ -0,0 +1,77 @@ +#ifndef __WINDOWS2LINUX_H__ +#define __WINDOWS2LINUX_H__ + +/* + * LINUX SPECIFIC DEFINITIONS +*/ +// +// Data types conversions +// +#include <stdlib.h> +#include <string.h> +#include "basicDataTypeConversions.h" + +#ifdef __cplusplus +namespace avxsynth { +#endif // __cplusplus +// +// purposefully define the following MSFT definitions +// to mean nothing (as they do not mean anything on Linux) +// +#define __stdcall +#define __cdecl +#define noreturn +#define __declspec(x) +#define STDAPI extern "C" HRESULT +#define STDMETHODIMP HRESULT __stdcall +#define STDMETHODIMP_(x) x __stdcall + +#define STDMETHOD(x) virtual HRESULT x +#define STDMETHOD_(a, x) virtual a x + +#ifndef TRUE +#define TRUE true +#endif + +#ifndef FALSE +#define FALSE false +#endif + +#define S_OK (0x00000000) +#define S_FALSE (0x00000001) +#define E_NOINTERFACE (0X80004002) +#define E_POINTER (0x80004003) +#define E_FAIL (0x80004005) +#define E_OUTOFMEMORY (0x8007000E) + +#define INVALID_HANDLE_VALUE ((HANDLE)((LONG_PTR)-1)) +#define FAILED(hr) ((hr) & 0x80000000) +#define SUCCEEDED(hr) (!FAILED(hr)) + + +// +// Functions +// +#define MAKEDWORD(a,b,c,d) ((a << 24) | (b << 16) | (c << 8) | (d)) +#define MAKEWORD(a,b) ((a << 8) | (b)) + +#define lstrlen strlen +#define lstrcpy strcpy +#define lstrcmpi strcasecmp +#define _stricmp strcasecmp +#define InterlockedIncrement(x) __sync_fetch_and_add((x), 1) +#define InterlockedDecrement(x) __sync_fetch_and_sub((x), 1) +// Windows uses (new, old) ordering but GCC has (old, new) +#define InterlockedCompareExchange(x,y,z) __sync_val_compare_and_swap(x,z,y) + +#define UInt32x32To64(a, b) ( (uint64_t) ( ((uint64_t)((uint32_t)(a))) * ((uint32_t)(b)) ) ) +#define Int64ShrlMod32(a, b) ( (uint64_t) ( (uint64_t)(a) >> (b) ) ) +#define Int32x32To64(a, b) ((__int64)(((__int64)((long)(a))) * ((long)(b)))) + +#define MulDiv(nNumber, nNumerator, nDenominator) (int32_t) (((int64_t) (nNumber) * (int64_t) (nNumerator) + (int64_t) ((nDenominator)/2)) / (int64_t) (nDenominator)) + +#ifdef __cplusplus +}; // namespace avxsynth +#endif // __cplusplus + +#endif // __WINDOWS2LINUX_H__
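Among the macros above, MulDiv() deserves a note: it widens to 64 bits before multiplying and adds den/2 before dividing, so the result is rounded to nearest rather than truncated. A quick standalone check of that behaviour:

#include <stdint.h>
#include <stdio.h>

#define MulDiv(n, num, den) \
    (int32_t)(((int64_t)(n) * (int64_t)(num) + (int64_t)((den)/2)) / (int64_t)(den))

int main( void )
{
    /* 7*10/4 = 17.5, rounds up to 18; plain integer division would give 17 */
    printf( "%d vs %d\n", MulDiv( 7, 10, 4 ), (7 * 10) / 4 );
    return 0;
}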
View file
x264-snapshot-20130224-2245.tar.bz2/input/avs.c -> x264-snapshot-20130723-2245.tar.bz2/input/avs.c
Changed
@@ -24,12 +24,30 @@ *****************************************************************************/ #include "input.h" +#if USE_AVXSYNTH +#include <dlfcn.h> +#if SYS_MACOSX +#define avs_open dlopen( "libavxsynth.dylib", RTLD_NOW ) +#else +#define avs_open dlopen( "libavxsynth.so", RTLD_NOW ) +#endif +#define avs_close dlclose +#define avs_address dlsym +#else #include <windows.h> +#define avs_open LoadLibrary( "avisynth" ) +#define avs_close FreeLibrary +#define avs_address GetProcAddress +#endif #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "avs", __VA_ARGS__ ) #define AVSC_NO_DECLSPEC #undef EXTERN_C +#if USE_AVXSYNTH +#include "extras/avxsynth_c.h" +#else #include "extras/avisynth_c.h" +#endif #define AVSC_DECLARE_FUNC(name) name##_func name /* AVS uses a versioned interface to control backwards compatibility */ @@ -40,12 +58,20 @@ #include <libavutil/pixfmt.h> #endif +/* AvxSynth doesn't have yv24, yv16, yv411, or y8, so disable them. */ +#if USE_AVXSYNTH +#define avs_is_yv24( vi ) 0 +#define avs_is_yv16( vi ) 0 +#define avs_is_yv411( vi ) 0 +#define avs_is_y8( vi ) 0 +#endif + /* maximum size of the sequence of filters to try on non script files */ #define AVS_MAX_SEQUENCE 5 #define LOAD_AVS_FUNC(name, continue_on_fail)\ {\ - h->func.name = (void*)GetProcAddress( h->library, #name );\ + h->func.name = (void*)avs_address( h->library, #name );\ if( !continue_on_fail && !h->func.name )\ goto fail;\ } @@ -76,7 +102,7 @@ /* load the library and functions we require from it */ static int x264_avs_load_library( avs_hnd_t *h ) { - h->library = LoadLibrary( "avisynth" ); + h->library = avs_open; if( !h->library ) return -1; LOAD_AVS_FUNC( avs_clip_get_error, 0 ); @@ -93,7 +119,7 @@ LOAD_AVS_FUNC( avs_take_clip, 0 ); return 0; fail: - FreeLibrary( h->library ); + avs_close( h->library ); return -1; } @@ -101,6 +127,9 @@ static void avs_build_filter_sequence( char *filename_ext, const char *filter[AVS_MAX_SEQUENCE+1] ) { int i = 0; +#if USE_AVXSYNTH + const char *all_purpose[] = { "FFVideoSource", 0 }; +#else const char *all_purpose[] = { "FFmpegSource2", "DSS2", "DirectShowSource", 0 }; if( !strcasecmp( filename_ext, "avi" ) ) filter[i++] = "AVISource"; @@ -108,6 +137,7 @@ filter[i++] = "MPEG2Source"; if( !strcasecmp( filename_ext, "dga" ) ) filter[i++] = "AVCSource"; +#endif for( int j = 0; all_purpose[j] && i < AVS_MAX_SEQUENCE; j++ ) filter[i++] = all_purpose[j]; } @@ -123,6 +153,13 @@ static float get_avs_version( avs_hnd_t *h ) { +/* AvxSynth has its version defined starting at 4.0, even though it's based on + AviSynth 2.5.8. This is troublesome for get_avs_version and working around + the new colorspaces in 2.6. So if AvxSynth is detected, explicitly define + the version as 2.58. 
*/ +#if USE_AVXSYNTH + return 2.58f; +#else FAIL_IF_ERROR( !h->func.avs_function_exists( h->env, "VersionNumber" ), "VersionNumber does not exist\n" ) AVS_Value ver = h->func.avs_invoke( h->env, "VersionNumber", avs_new_value_array( NULL, 0 ), NULL ); FAIL_IF_ERROR( avs_is_error( ver ), "unable to determine avisynth version: %s\n", avs_as_error( ver ) ) @@ -130,6 +167,7 @@ float ret = avs_as_float( ver ); h->func.avs_release_value( ver ); return ret; +#endif } static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt ) @@ -219,11 +257,11 @@ } #if !HAVE_SWSCALE /* if swscale is not available, convert the CSP if necessary */ + FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444), + "avisynth >= 2.6 is required for i422/i444 output\n" ) if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I422 && !avs_is_yv16( vi )) || (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) || (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) ) { - FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444), - "avisynth >= 2.6 is required for i422/i444 output\n" ) const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" : opt->output_csp == X264_CSP_I422 ? "YV16" : @@ -270,6 +308,7 @@ opt->input_range = opt->output_range; } #endif + h->func.avs_release_value( res ); info->width = vi->width; @@ -357,7 +396,7 @@ h->func.avs_release_clip( h->clip ); if( h->func.avs_delete_script_environment ) h->func.avs_delete_script_environment( h->env ); - FreeLibrary( h->library ); + avs_close( h->library ); free( h ); return 0; }
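The change above swaps LoadLibrary/GetProcAddress for dlopen/dlsym when USE_AVXSYNTH is set. A minimal sketch of that loading pattern on Linux, checking only that a symbol resolves (the symbol name is taken from the diff; link with -ldl):

#include <dlfcn.h>
#include <stdio.h>

int main( void )
{
    void *lib = dlopen( "libavxsynth.so", RTLD_NOW ); /* avs_open on Linux */
    if( !lib )
    {
        fprintf( stderr, "dlopen failed: %s\n", dlerror() );
        return 1;
    }
    void *fn = dlsym( lib, "avs_invoke" );            /* avs_address(...) */
    printf( "avs_invoke %s\n", fn ? "resolved" : "missing" );
    dlclose( lib );                                   /* avs_close */
    return 0;
}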
View file
x264-snapshot-20130224-2245.tar.bz2/input/lavf.c -> x264-snapshot-20130723-2245.tar.bz2/input/lavf.c
Changed
@@ -183,8 +183,8 @@ h->stream_id = i; h->next_frame = 0; AVCodecContext *c = h->lavf->streams[i]->codec; - info->fps_num = h->lavf->streams[i]->r_frame_rate.num; - info->fps_den = h->lavf->streams[i]->r_frame_rate.den; + info->fps_num = h->lavf->streams[i]->avg_frame_rate.num; + info->fps_den = h->lavf->streams[i]->avg_frame_rate.den; info->timebase_num = h->lavf->streams[i]->time_base.num; info->timebase_den = h->lavf->streams[i]->time_base.den; /* lavf is thread unsafe as calling av_read_frame invalidates previously read AVPackets */
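This hunk moves the frame-rate source from r_frame_rate to avg_frame_rate. A minimal libavformat probe showing where that rational lives, assuming a reasonably modern FFmpeg (stream 0 used for brevity):

#include <stdio.h>
#include <libavformat/avformat.h>

int main( int argc, char **argv )
{
    if( argc < 2 )
        return 1;
    AVFormatContext *fmt = NULL;
    if( avformat_open_input( &fmt, argv[1], NULL, NULL ) < 0 )
        return 1;
    if( avformat_find_stream_info( fmt, NULL ) >= 0 && fmt->nb_streams > 0 )
    {
        AVRational fps = fmt->streams[0]->avg_frame_rate; /* what the patch reads */
        printf( "fps = %d/%d\n", fps.num, fps.den );
    }
    avformat_close_input( &fmt );
    return 0;
}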
View file
x264-snapshot-20130224-2245.tar.bz2/input/y4m.c -> x264-snapshot-20130723-2245.tar.bz2/input/y4m.c
Changed
@@ -46,7 +46,6 @@ static int parse_csp_and_depth( char *csp_name, int *bit_depth ) { int csp = X264_CSP_MAX; - *bit_depth = 8; /* Set colorspace from known variants */ if( !strncmp( "420", csp_name, 3 ) ) @@ -57,8 +56,8 @@ csp = X264_CSP_I444; /* Set high bit depth from known extensions */ - if( !strncmp( "p", csp_name + 3, 1 ) ) - *bit_depth = strtol( csp_name + 4, NULL, 10 ); + if( sscanf( csp_name, "%*d%*[pP]%d", bit_depth ) != 1 ) + *bit_depth = 8; return csp; }
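The rewritten parser above lets sscanf() skip the leading colorspace digits and a 'p'/'P' separator before reading the bit depth, falling back to 8 when no depth suffix is present. A standalone check of that format string:

#include <stdio.h>

static int parse_depth( const char *csp_name )
{
    int bit_depth;
    /* %*d skips the colorspace digits, %*[pP] the separator */
    if( sscanf( csp_name, "%*d%*[pP]%d", &bit_depth ) != 1 )
        bit_depth = 8;
    return bit_depth;
}

int main( void )
{
    printf( "420p10 -> %d, 422P16 -> %d, 444 -> %d\n",
            parse_depth( "420p10" ), parse_depth( "422P16" ), parse_depth( "444" ) );
    return 0;
}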
View file
x264-snapshot-20130224-2245.tar.bz2/tools/checkasm-a.asm -> x264-snapshot-20130723-2245.tar.bz2/tools/checkasm-a.asm
Changed
@@ -4,7 +4,7 @@ ;* Copyright (C) 2008-2013 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Henrik Gramner <hengar-6@student.ltu.se> +;* Henrik Gramner <henrik@gramner.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -88,8 +88,7 @@ ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ) ;----------------------------------------------------------------------------- INIT_XMM -cglobal checkasm_call, 2,15,16 - SUB rsp, max_args*8+16 +cglobal checkasm_call, 2,15,16,max_args*8+8 mov r6, r0 mov [rsp+max_args*8], r1 @@ -158,7 +157,6 @@ mov dword [r1], 0 mov rax, r9 .ok: - ADD rsp, max_args*8+16 RET %else @@ -207,8 +205,12 @@ ; int x264_stack_pagealign( int (*func)(), int align ) ;----------------------------------------------------------------------------- cglobal stack_pagealign, 2,2 + movsxdifnidn r1, r1d push rbp mov rbp, rsp +%if WIN64 + sub rsp, 32 ; shadow space +%endif and rsp, ~0xfff sub rsp, r1 call r0
View file
x264-snapshot-20130224-2245.tar.bz2/tools/checkasm.c -> x264-snapshot-20130723-2245.tar.bz2/tools/checkasm.c
Changed
@@ -61,7 +61,7 @@ { void *pointer; // just for detecting duplicates uint32_t cpu; - uint32_t cycles; + uint64_t cycles; uint32_t den; } bench_t; @@ -137,12 +137,12 @@ static void print_bench(void) { - uint16_t nops[10000] = {0}; + uint16_t nops[10000]; int nfuncs, nop_time=0; for( int i = 0; i < 10000; i++ ) { - int t = read_time(); + uint32_t t = read_time(); nops[i] = read_time() - t; } qsort( nops, 10000, sizeof(uint16_t), cmp_nop ); @@ -164,6 +164,7 @@ if( k < j ) continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, +#if HAVE_MMX b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" : b->cpu&X264_CPU_AVX2 ? "avx2" : b->cpu&X264_CPU_FMA3 ? "fma3" : @@ -176,21 +177,30 @@ /* print sse2slow only if there's also a sse2fast version of the same func */ b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" : b->cpu&X264_CPU_SSE2 ? "sse2" : + b->cpu&X264_CPU_SSE ? "sse" : b->cpu&X264_CPU_MMX ? "mmx" : +#elif ARCH_PPC b->cpu&X264_CPU_ALTIVEC ? "altivec" : +#elif ARCH_ARM b->cpu&X264_CPU_NEON ? "neon" : - b->cpu&X264_CPU_ARMV6 ? "armv6" : "c", + b->cpu&X264_CPU_ARMV6 ? "armv6" : +#endif + "c", +#if HAVE_MMX b->cpu&X264_CPU_CACHELINE_32 ? "_c32" : + b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" : b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : - b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" : + b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" : b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : b->cpu&X264_CPU_LZCNT ? "_lzcnt" : b->cpu&X264_CPU_BMI2 ? "_bmi2" : - b->cpu&X264_CPU_TBM ? "_tbm" : b->cpu&X264_CPU_BMI1 ? "_bmi1" : - b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" : - b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "", + b->cpu&X264_CPU_SLOW_ATOM ? "_atom" : +#elif ARCH_ARM + b->cpu&X264_CPU_FAST_NEON_MRC ? 
"_fast_mrc" : +#endif + "", ((int64_t)10*b->cycles/b->den - nop_time)/4 ); } } @@ -231,7 +241,7 @@ #define call_bench(func,cpu,...)\ if( do_bench && !strncmp(func_name, bench_pattern, bench_pattern_len) )\ {\ - uint32_t tsum = 0;\ + uint64_t tsum = 0;\ int tcount = 0;\ call_a1(func, __VA_ARGS__);\ for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\ @@ -242,7 +252,7 @@ func(__VA_ARGS__);\ func(__VA_ARGS__);\ t = read_time() - t;\ - if( t*tcount <= tsum*4 && ti > 0 )\ + if( (uint64_t)t*tcount <= tsum*4 && ti > 0 )\ {\ tsum += t;\ tcount++;\ @@ -299,7 +309,7 @@ #define TEST_PIXEL( name, align ) \ ok = 1, used_asm = 0; \ - for( int i = 0; i < 8; i++ ) \ + for( int i = 0; i < ARRAY_ELEMS(pixel_c.name); i++ ) \ { \ int res_c, res_asm; \ if( pixel_asm.name[i] != pixel_ref.name[i] ) \ @@ -337,11 +347,49 @@ TEST_PIXEL( satd, 0 ); TEST_PIXEL( sa8d, 1 ); + ok = 1, used_asm = 0; + if( pixel_asm.sa8d_satd[PIXEL_16x16] != pixel_ref.sa8d_satd[PIXEL_16x16] ) + { + set_func_name( "sa8d_satd_%s", pixel_names[PIXEL_16x16] ); + used_asm = 1; + for( int j = 0; j < 64; j++ ) + { + uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 ); + uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 ); + uint64_t res_a = call_a( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 ); + uint32_t cost8_a = res_a; + uint32_t cost4_a = res_a >> 32; + if( cost8_a != cost8_c || cost4_a != cost4_c ) + { + ok = 0; + fprintf( stderr, "sa8d_satd [%d]: (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16, + cost8_c, cost4_c, cost8_a, cost4_a ); + break; + } + } + for( int j = 0; j < 0x1000 && ok; j += 256 ) \ + { + uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); + uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); + uint64_t res_a = pixel_asm.sa8d_satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); + uint32_t cost8_a = res_a; + uint32_t cost4_a = res_a >> 32; + if( cost8_a != cost8_c || cost4_a != cost4_c ) + { + ok = 0; + fprintf( stderr, "sa8d_satd [%d]: overflow (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16, + cost8_c, cost4_c, cost8_a, cost4_a ); + } + } + } + report( "pixel sa8d_satd :" ); + #define TEST_PIXEL_X( N ) \ ok = 1; used_asm = 0; \ for( int i = 0; i < 7; i++ ) \ { \ - int res_c[4]={0}, res_asm[4]={0}; \ + ALIGNED_16( int res_c[4] ) = {0}; \ + ALIGNED_16( int res_asm[4] ) = {0}; \ if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \ { \ set_func_name( "sad_x%d_%s", N, pixel_names[i] ); \ @@ -494,7 +542,8 @@ #define TEST_INTRA_X3( name, i8x8, ... ) \ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ { \ - int res_c[3], res_asm[3]; \ + ALIGNED_16( int res_c[3] ); \ + ALIGNED_16( int res_asm[3] ); \ set_func_name( #name ); \ used_asm = 1; \ call_c( pixel_c.name, pbuf1+48, i8x8 ? 
edge : pbuf3+48, res_c ); \ @@ -696,8 +745,8 @@ { ALIGNED_16( uint16_t sums[72] ); ALIGNED_16( int dc[4] ); - ALIGNED_16( int16_t mvs_a[32] ); - ALIGNED_16( int16_t mvs_c[32] ); + ALIGNED_16( int16_t mvs_a[48] ); + ALIGNED_16( int16_t mvs_c[48] ); int mvn_a, mvn_c; int thresh = rand() & 0x3fff; set_func_name( "esa_ads" ); @@ -732,10 +781,10 @@ x264_dct_function_t dct_asm; x264_quant_function_t qf; int ret = 0, ok, used_asm, interlace = 0; - ALIGNED_16( dctcoef dct1[16][16] ); - ALIGNED_16( dctcoef dct2[16][16] ); - ALIGNED_16( dctcoef dct4[16][16] ); - ALIGNED_16( dctcoef dct8[4][64] ); + ALIGNED_ARRAY_N( dctcoef, dct1, [16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct2, [16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct4, [16],[16] ); + ALIGNED_ARRAY_N( dctcoef, dct8, [4],[64] ); ALIGNED_16( dctcoef dctdc[2][8] ); x264_t h_buf; x264_t *h = &h_buf; @@ -1030,7 +1079,7 @@ call_a( zigzag_asm[interlace].name, t2, dct, buf4 ); \ if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( buf3, buf4, 10 ) ) \ { \ - ok = 0; \ + ok = 0; printf("%d: %d %d %d %d\n%d %d %d %d\n\n",memcmp( t1, t2, size*sizeof(dctcoef) ),buf3[0], buf3[1], buf3[8], buf3[9], buf4[0], buf4[1], buf4[8], buf4[9]);break;\ } \ } \ } @@ -1040,13 +1089,13 @@ x264_zigzag_init( cpu_new, &zigzag_asm[0], &zigzag_asm[1] ); ok = 1; used_asm = 0; - TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0], 64 ); + TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct8[0], 64 ); report( "zigzag_interleave :" ); for( interlace = 0; interlace <= 1; interlace++ )
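The checkasm changes above widen the cycle accumulators from uint32_t to uint64_t so long benchmark runs cannot wrap, while keeping the 4x outlier rejection. A stripped-down sketch of that accumulate-and-reject loop, substituting the compiler's __rdtsc() for x264's read_time() (x86 only, gcc/clang):

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

static void workload( void )
{
    static volatile int sink;
    for( int i = 0; i < 1000; i++ )
        sink += i;
}

int main( void )
{
    uint64_t tsum = 0; /* was 32-bit before the patch and could overflow */
    int tcount = 0;
    for( int ti = 0; ti < 2000; ti++ )
    {
        uint32_t t = (uint32_t)__rdtsc();
        workload();
        t = (uint32_t)__rdtsc() - t;
        /* accept the sample unless it is a >4x outlier vs. the mean so far */
        if( ti > 0 && (uint64_t)t * tcount <= tsum * 4 )
        {
            tsum += t;
            tcount++;
        }
    }
    if( tcount )
        printf( "avg cycles: %llu\n", (unsigned long long)( tsum / tcount ) );
    return 0;
}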
View file
x264-snapshot-20130723-2245.tar.bz2/tools/cltostr.pl
Added
@@ -0,0 +1,65 @@ +# Perl script used for compiling OpenCL src into x264 binary +# +# Copyright (C) 2013 x264 project +# Authors: Steve Borho <sborho@multicorewareinc.com> + +use Digest::MD5 qw(md5_hex); + +# xxd takes a VAR, which will be the variable name +# and BYTES, a string of bytes to beencoded. +sub xxd +{ + my %args = @_; + my $var = $args{VAR}; + my $bytes = $args{BYTES}; + my @hexbytes; + my @bytes = split //, $$bytes; + foreach $b (@bytes) + { + push @hexbytes, sprintf("0x%02X", ord($b)); + } + + # Format 'em nice and pretty-like. + print 'static const char ' . $var . '[] = {' . "\n"; + my $count = 0; + foreach my $h (@hexbytes) + { + print "$h, "; + $count++; + if ($count == 16) + { + print "\n"; + $count = 0; + } + } + print "\n0x00 };\n\n"; + + return; +} + +if (@ARGV < 1) +{ + printf "%s: VARNAME ", $0 . "\n"; + exit(-1); +} + + +my @lines; +while(<STDIN>) +{ + s/^\s+//; # trim leading whitespace + if (/^\/\//) + { + next; # skip the line if it starts with '//' + } + push @lines, $_; +} + +my $lines = join '', @lines; +xxd(VAR => @ARGV[0], BYTES => \$lines); + +my $hash = md5_hex($lines); +@hash = ( $hash =~ m/../g ); + + +xxd(VAR => @ARGV[0] . "_hash", BYTES => \$hash);
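cltostr.pl above turns the OpenCL source read from stdin into two C arrays: the kernel text as a NUL-terminated byte string (with '//' comment lines stripped) and its MD5 digest as a 32-character hex string. A hypothetical illustration of the generated shape and one plausible use, with contents abbreviated (the variable name comes from the script's VARNAME argument):

#include <stdio.h>

static const char x264_opencl_source[] = {
    0x2F, 0x2A, 0x20, 0x6B, 0x65, 0x72, 0x6E, 0x65, /* "/* kerne"... */
    0x00 };

static const char x264_opencl_source_hash[] = {
    0x30, 0x39, 0x38, 0x66, /* "098f"... 32 hex chars in the real output */
    0x00 };

int main( void )
{
    /* e.g. stamp the hash into a cached .clbin so a stale kernel cache can
       be detected when the embedded source changes */
    printf( "%zu bytes of embedded CL source, hash starts \"%s\"\n",
            sizeof(x264_opencl_source) - 1, x264_opencl_source_hash );
    return 0;
}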
View file
x264-snapshot-20130224-2245.tar.bz2/x264.c -> x264-snapshot-20130723-2245.tar.bz2/x264.c
Changed
@@ -225,7 +225,7 @@ va_end( arg ); } -static void print_version_info() +static void print_version_info( void ) { #ifdef X264_POINTVER printf( "x264 "X264_POINTVER"\n" ); @@ -596,8 +596,11 @@ H2( " --slices <integer> Number of slices per frame; forces rectangular\n" " slices and is overridden by other slicing options\n" ); else H1( " --slices <integer> Number of slices per frame\n" ); + H2( " --slices-max <integer> Absolute maximum slices per frame; overrides\n" + " slice-max-size/slice-max-mbs when necessary\n" ); H2( " --slice-max-size <integer> Limit the size of each slice in bytes\n"); - H2( " --slice-max-mbs <integer> Limit the size of each slice in macroblocks\n"); + H2( " --slice-max-mbs <integer> Limit the size of each slice in macroblocks (max)\n"); + H2( " --slice-min-mbs <integer> Limit the size of each slice in macroblocks (min)\n"); H0( " --tff Enable interlaced mode (top field first)\n" ); H0( " --bff Enable interlaced mode (bottom field first)\n" ); H2( " --constrained-intra Enable constrained intra prediction.\n" ); @@ -743,16 +746,18 @@ H2( " --range <string> Specify color range [\"%s\"]\n" " - %s\n", range_names[0], stringify_names( buf, range_names ) ); H2( " --colorprim <string> Specify color primaries [\"%s\"]\n" - " - undef, bt709, bt470m, bt470bg\n" - " smpte170m, smpte240m, film\n", + " - undef, bt709, bt470m, bt470bg, smpte170m,\n" + " smpte240m, film, bt2020\n", strtable_lookup( x264_colorprim_names, defaults->vui.i_colorprim ) ); H2( " --transfer <string> Specify transfer characteristics [\"%s\"]\n" - " - undef, bt709, bt470m, bt470bg, linear,\n" - " log100, log316, smpte170m, smpte240m\n", + " - undef, bt709, bt470m, bt470bg, smpte170m,\n" + " smpte240m, linear, log100, log316,\n" + " iec61966-2-4, bt1361e, iec61966-2-1,\n" + " bt2020-10, bt2020-12\n", strtable_lookup( x264_transfer_names, defaults->vui.i_transfer ) ); H2( " --colormatrix <string> Specify color matrix setting [\"%s\"]\n" - " - undef, bt709, fcc, bt470bg\n" - " smpte170m, smpte240m, GBR, YCgCo\n", + " - undef, bt709, fcc, bt470bg, smpte170m,\n" + " smpte240m, GBR, YCgCo, bt2020nc, bt2020c\n", strtable_lookup( x264_colmatrix_names, defaults->vui.i_colmatrix ) ); H2( " --chromaloc <integer> Specify chroma sample location (0 to 5) [%d]\n", defaults->vui.i_chroma_loc ); @@ -787,6 +792,8 @@ H0( " --frames <integer> Maximum number of frames to encode\n" ); H0( " --level <string> Specify level (as defined by Annex A)\n" ); H1( " --bluray-compat Enable compatibility hacks for Blu-ray support\n" ); + H1( " --stitchable Don't optimize headers based on video content\n" + " Ensures ability to recombine a segmented encode\n" ); H1( "\n" ); H1( " -v, --verbose Print stats for each frame\n" ); H1( " --no-progress Don't show the progress indicator while encoding\n" ); @@ -806,6 +813,9 @@ " as opposed to letting them select different algorithms\n" ); H2( " --asm <integer> Override CPU detection\n" ); H2( " --no-asm Disable all CPU optimizations\n" ); + H2( " --opencl Enable use of OpenCL\n" ); + H2( " --opencl-clbin <string> Specify path of compiled OpenCL kernel cache\n" ); + H2( " --opencl-device <integer> Specify OpenCL device ordinal\n" ); H2( " --visualize Show MB types overlayed on the encoded video\n" ); H2( " --dump-yuv <string> Save reconstructed frames\n" ); H2( " --sps-id <integer> Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id ); @@ -910,6 +920,9 @@ { "ref", required_argument, NULL, 'r' }, { "asm", required_argument, NULL, 0 }, { "no-asm", no_argument, NULL, 0 }, + { "opencl", no_argument, 
NULL, 1 }, + { "opencl-clbin",required_argument, NULL, 0 }, + { "opencl-device",required_argument, NULL, 0 }, { "sar", required_argument, NULL, 0 }, { "fps", required_argument, NULL, OPT_FPS }, { "frames", required_argument, NULL, OPT_FRAMES }, @@ -971,7 +984,9 @@ { "no-sliced-threads", no_argument, NULL, 0 }, { "slice-max-size", required_argument, NULL, 0 }, { "slice-max-mbs", required_argument, NULL, 0 }, + { "slice-min-mbs", required_argument, NULL, 0 }, { "slices", required_argument, NULL, 0 }, + { "slices-max", required_argument, NULL, 0 }, { "thread-input", no_argument, NULL, OPT_THREAD_INPUT }, { "sync-lookahead", required_argument, NULL, 0 }, { "non-deterministic", no_argument, NULL, 0 }, @@ -1025,6 +1040,7 @@ { "dts-compress", no_argument, NULL, OPT_DTS_COMPRESSION }, { "output-csp", required_argument, NULL, OPT_OUTPUT_CSP }, { "input-range", required_argument, NULL, OPT_INPUT_RANGE }, + { "stitchable", no_argument, NULL, 0 }, {0, 0, 0, 0} };
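The new switches above (--opencl, --opencl-clbin, --opencl-device, --slices-max, --slice-min-mbs, --stitchable) map onto fields added to x264_param_t in the x264.h hunk below. A hedged sketch of setting a few of them through the API instead of the CLI (values are illustrative only and require a libx264 built from this snapshot or later):

#include <stdint.h>
#include <x264.h>

int main( void )
{
    x264_param_t param;
    if( x264_param_default_preset( &param, "medium", NULL ) < 0 )
        return 1;
    param.i_width         = 640; /* minimal valid geometry for the sketch */
    param.i_height        = 360;
    param.b_opencl        = 1;   /* --opencl */
    param.i_opencl_device = 0;   /* --opencl-device */
    param.i_slice_max_mbs = 100; /* --slice-max-mbs */
    param.i_slice_min_mbs = 4;   /* --slice-min-mbs */
    param.b_stitchable    = 1;   /* --stitchable */
    x264_t *h = x264_encoder_open( &param );
    if( h )
        x264_encoder_close( h );
    return 0;
}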
View file
x264-snapshot-20130224-2245.tar.bz2/x264.h -> x264-snapshot-20130723-2245.tar.bz2/x264.h
Changed
@@ -28,7 +28,7 @@ #ifndef X264_X264_H #define X264_X264_H -#if !defined(_STDINT_H) && !defined(_STDINT_H_) && \ +#if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) &&\ !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) # ifdef _MSC_VER # pragma message("You must include stdint.h or inttypes.h before x264.h") @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 129 +#define X264_BUILD 135 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -109,43 +109,53 @@ /**************************************************************************** * Encoder parameters ****************************************************************************/ -/* CPU flags - */ -#define X264_CPU_CACHELINE_32 0x0000001 /* avoid memory loads that span the border between two cachelines */ -#define X264_CPU_CACHELINE_64 0x0000002 /* 32/64 is the size of a cacheline in bytes */ -#define X264_CPU_ALTIVEC 0x0000004 -#define X264_CPU_MMX 0x0000008 -#define X264_CPU_MMX2 0x0000010 /* MMX2 aka MMXEXT aka ISSE */ +/* CPU flags */ + +/* x86 */ +#define X264_CPU_CMOV 0x0000001 +#define X264_CPU_MMX 0x0000002 +#define X264_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */ #define X264_CPU_MMXEXT X264_CPU_MMX2 -#define X264_CPU_SSE 0x0000020 -#define X264_CPU_SSE2 0x0000040 -#define X264_CPU_SSE2_IS_SLOW 0x0000080 /* avoid most SSE2 functions on Athlon64 */ -#define X264_CPU_SSE2_IS_FAST 0x0000100 /* a few functions are only faster on Core2 and Phenom */ -#define X264_CPU_SSE3 0x0000200 -#define X264_CPU_SSSE3 0x0000400 -#define X264_CPU_SHUFFLE_IS_FAST 0x0000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */ -#define X264_CPU_STACK_MOD4 0x0001000 /* if stack is only mod4 and not mod16 */ -#define X264_CPU_SSE4 0x0002000 /* SSE4.1 */ -#define X264_CPU_SSE42 0x0004000 /* SSE4.2 */ -#define X264_CPU_SSE_MISALIGN 0x0008000 /* Phenom support for misaligned SSE instruction arguments */ -#define X264_CPU_LZCNT 0x0010000 /* Phenom support for "leading zero count" instruction. */ -#define X264_CPU_ARMV6 0x0020000 -#define X264_CPU_NEON 0x0040000 /* ARM NEON */ -#define X264_CPU_FAST_NEON_MRC 0x0080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ -#define X264_CPU_SLOW_CTZ 0x0100000 /* BSR/BSF x86 instructions are really slow on some CPUs */ -#define X264_CPU_SLOW_ATOM 0x0200000 /* The Atom just sucks */ -#define X264_CPU_AVX 0x0400000 /* AVX support: requires OS support even if YMM registers - * aren't used. */ -#define X264_CPU_XOP 0x0800000 /* AMD XOP */ -#define X264_CPU_FMA4 0x1000000 /* AMD FMA4 */ -#define X264_CPU_AVX2 0x2000000 /* AVX2 */ -#define X264_CPU_FMA3 0x4000000 /* Intel FMA3 */ -#define X264_CPU_BMI1 0x8000000 /* BMI1 */ -#define X264_CPU_BMI2 0x10000000 /* BMI2 */ -#define X264_CPU_TBM 0x20000000 /* AMD TBM */ - -/* Analyse flags - */ +#define X264_CPU_SSE 0x0000008 +#define X264_CPU_SSE2 0x0000010 +#define X264_CPU_SSE3 0x0000020 +#define X264_CPU_SSSE3 0x0000040 +#define X264_CPU_SSE4 0x0000080 /* SSE4.1 */ +#define X264_CPU_SSE42 0x0000100 /* SSE4.2 */ +#define X264_CPU_SSE_MISALIGN 0x0000200 /* Phenom support for misaligned SSE instruction arguments */ +#define X264_CPU_LZCNT 0x0000400 /* Phenom support for "leading zero count" instruction. */ +#define X264_CPU_AVX 0x0000800 /* AVX support: requires OS support even if YMM registers aren't used. 
*/ +#define X264_CPU_XOP 0x0001000 /* AMD XOP */ +#define X264_CPU_FMA4 0x0002000 /* AMD FMA4 */ +#define X264_CPU_AVX2 0x0004000 /* AVX2 */ +#define X264_CPU_FMA3 0x0008000 /* Intel FMA3 */ +#define X264_CPU_BMI1 0x0010000 /* BMI1 */ +#define X264_CPU_BMI2 0x0020000 /* BMI2 */ +/* x86 modifiers */ +#define X264_CPU_CACHELINE_32 0x0040000 /* avoid memory loads that span the border between two cachelines */ +#define X264_CPU_CACHELINE_64 0x0080000 /* 32/64 is the size of a cacheline in bytes */ +#define X264_CPU_SSE2_IS_SLOW 0x0100000 /* avoid most SSE2 functions on Athlon64 */ +#define X264_CPU_SSE2_IS_FAST 0x0200000 /* a few functions are only faster on Core2 and Phenom */ +#define X264_CPU_SLOW_SHUFFLE 0x0400000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */ +#define X264_CPU_STACK_MOD4 0x0800000 /* if stack is only mod4 and not mod16 */ +#define X264_CPU_SLOW_CTZ 0x1000000 /* BSR/BSF x86 instructions are really slow on some CPUs */ +#define X264_CPU_SLOW_ATOM 0x2000000 /* The Atom is terrible: slow SSE unaligned loads, slow + * SIMD multiplies, slow SIMD variable shifts, slow pshufb, + * cacheline split penalties -- gather everything here that + * isn't shared by other CPUs to avoid making half a dozen + * new SLOW flags. */ +#define X264_CPU_SLOW_PSHUFB 0x4000000 /* such as on the Intel Atom */ +#define X264_CPU_SLOW_PALIGNR 0x8000000 /* such as on the AMD Bobcat */ + +/* PowerPC */ +#define X264_CPU_ALTIVEC 0x0000001 + +/* ARM */ +#define X264_CPU_ARMV6 0x0000001 +#define X264_CPU_NEON 0x0000002 /* ARM NEON */ +#define X264_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ + +/* Analyse flags */ #define X264_ANALYSE_I4x4 0x0001 /* Analyse i4x4 */ #define X264_ANALYSE_I8x8 0x0002 /* Analyse i8x8 (requires 8x8 transform) */ #define X264_ANALYSE_PSUB16x16 0x0010 /* Analyse p16x8, p8x16 and p8x8 */ @@ -188,9 +198,10 @@ static const char * const x264_overscan_names[] = { "undef", "show", "crop", 0 }; static const char * const x264_vidformat_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 }; static const char * const x264_fullrange_names[] = { "off", "on", 0 }; -static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", 0 }; -static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316", 0 }; -static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", 0 }; +static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", 0 }; +static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316", + "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", 0 }; +static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", "bt2020nc", "bt2020c", 0 }; static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 }; /* Colorspace type */ @@ -464,10 +475,23 @@ int b_fake_interlaced; + /* Don't optimize header parameters based on video content, e.g. ensure that splitting an input video, compressing + * each part, and stitching them back together will result in identical SPS/PPS. 
This is necessary for stitching + * with container formats that don't allow multiple SPS/PPS. */ + int b_stitchable; + + int b_opencl; /* use OpenCL when available */ + int i_opencl_device; /* specify count of GPU devices to skip, for CLI users */ + void *opencl_device_id; /* pass explicit cl_device_id as void*, for API users */ + char *psz_clbin_file; /* compiled OpenCL kernel cache file */ + /* Slicing parameters */ int i_slice_max_size; /* Max size per slice in bytes; includes estimated NAL overhead. */ int i_slice_max_mbs; /* Max number of MBs per slice; overrides i_slice_count. */ + int i_slice_min_mbs; /* Min number of MBs per slice */ int i_slice_count; /* Number of slices per frame: forces rectangular slices. */ + int i_slice_count_max; /* Absolute cap on slices per frame; stops applying slice-max-size + * and slice-max-mbs if this is reached. */ /* Optional callback for freeing this x264_param_t when it is done being used. * Only used when the x264_param_t sits in memory for an indefinite period of time, @@ -481,7 +505,7 @@ * is done encoding. * * This callback MUST do the following in order to work correctly: - * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 16. + * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 64. * 2) Call x264_nal_encode( h, dst, nal ), where dst is the output buffer. * After these steps, the content of nal is valid and can be used in the same way as if * the NAL unit were output by x264_encoder_encode. @@ -834,7 +858,13 @@ * due to delay, this may not be the next frame passed to encoder_encode. * if the change should apply to some particular frame, use x264_picture_t->param instead. * returns 0 on success, negative on parameter validation error. - * not all parameters can be changed; see the actual function for a detailed breakdown. */ + * not all parameters can be changed; see the actual function for a detailed breakdown. + * + * since not all parameters can be changed, moving from preset to preset may not always + * fully copy all relevant parameters, but should still work usably in practice. however, + * more so than for other presets, many of the speed shortcuts used in ultrafast cannot be + * switched out of; using reconfig to switch between ultrafast and other presets is not + * recommended without a more fine-grained breakdown of parameters to take this into account. */ int x264_encoder_reconfig( x264_t *, x264_param_t * ); /* x264_encoder_parameters: * copies the current internal set of parameters to the pointer provided
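One more header note: the documented worst-case output buffer for the low-latency NAL callback grows from i_payload*3/2 + 5 + 16 to i_payload*3/2 + 5 + 64 bytes. A fragment-style sketch of the sizing a nalu_process callback must now use before x264_nal_encode() (error signalling and the surrounding plumbing are omitted, and the callback name is made up; it would be wired up via x264_param_t's nalu_process pointer):

#include <stdint.h>
#include <stdlib.h>
#include <x264.h>

static void my_nalu_process( x264_t *h, x264_nal_t *nal, void *opaque )
{
    (void)opaque;
    /* start code + escaping can grow the payload by 3/2, plus 5 header
       bytes and now 64 bytes of slack per the updated contract */
    uint8_t *dst = malloc( nal->i_payload * 3 / 2 + 5 + 64 );
    if( !dst )
        return; /* a real callback must report failure out-of-band */
    x264_nal_encode( h, dst, nal );
    /* ...hand nal->p_payload / nal->i_payload to the transport here... */
    free( dst );
}

int main( void )
{
    (void)my_nalu_process; /* fragment only; not a complete encoder setup */
    return 0;
}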